MethodPyGTB.cxx
// @(#)root/tmva/pymva $Id$
// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : MethodPyGTB                                                          *
 * Web    : http://oproject.org                                                  *
 *                                                                               *
 * Description:                                                                  *
 *      GradientBoostingClassifier classifier from scikit-learn                  *
 *                                                                               *
 * Redistribution and use in source and binary forms, with or without           *
 * modification, are permitted according to the terms listed in LICENSE         *
 * (http://tmva.sourceforge.net/LICENSE)                                         *
 **********************************************************************************/
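// Usage sketch (illustrative, not part of the original file): with PyMVA
// enabled, this method is booked through the TMVA Factory like any other
// classifier. The option string is a hypothetical example built from the
// options declared in DeclareOptions() below; `outputFile` is an open TFile.
//
//    TMVA::PyMethodBase::PyInitialize();
//    TMVA::Factory factory("TMVAClassification", outputFile,
//                          "AnalysisType=Classification");
//    factory.BookMethod(TMVA::Types::kPyGTB, "PyGTB",
//                       "NEstimators=200:LearningRate=0.05:MaxDepth=3");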

#pragma GCC diagnostic ignored "-Wunused-parameter"
#include <iomanip>
#include <fstream>

#include <Python.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>

#include "TMath.h"
#include "Riostream.h"
#include "TMatrix.h"
#include "TMatrixD.h"
#include "TVectorD.h"

#include "TMVA/MethodPyGTB.h"
#include "TMVA/Tools.h"
#include "TMVA/Ranking.h"
#include "TMVA/Types.h"
#include "TMVA/Config.h"
#include "TMVA/PDF.h"
#include "TMVA/ClassifierFactory.h"

#include "TMVA/Results.h"

using namespace TMVA;

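// REGISTER_METHOD registers PyGTB with TMVA's ClassifierFactory, so the
// Factory can create it from the Types::kPyGTB enum / "PyGTB" method name.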
REGISTER_METHOD(PyGTB)

ClassImp(MethodPyGTB)

//_______________________________________________________________________
MethodPyGTB::MethodPyGTB(const TString &jobName,
                         const TString &methodTitle,
                         DataSetInfo &dsi,
                         const TString &theOption,
                         TDirectory *theTargetDir) :
   PyMethodBase(jobName, Types::kPyGTB, methodTitle, dsi, theOption, theTargetDir),
   loss("deviance"),
   learning_rate(0.1),
   n_estimators(100),
   subsample(1.0),
   min_samples_split(2),
   min_samples_leaf(1),
   min_weight_fraction_leaf(0.0),
   max_depth(3),
   init("None"),
   random_state("None"),
   max_features("None"),
   verbose(0),
   max_leaf_nodes("None"),
   warm_start(kFALSE)
{
   // standard constructor for the PyGTB method
   SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
}

//_______________________________________________________________________
MethodPyGTB::MethodPyGTB(DataSetInfo &theData, const TString &theWeightFile, TDirectory *theTargetDir)
   : PyMethodBase(Types::kPyGTB, theData, theWeightFile, theTargetDir),
   loss("deviance"),
   learning_rate(0.1),
   n_estimators(100),
   subsample(1.0),
   min_samples_split(2),
   min_samples_leaf(1),
   min_weight_fraction_leaf(0.0),
   max_depth(3),
   init("None"),
   random_state("None"),
   max_features("None"),
   verbose(0),
   max_leaf_nodes("None"),
   warm_start(kFALSE)
{
   SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
}

//_______________________________________________________________________
MethodPyGTB::~MethodPyGTB(void)
{
}

//_______________________________________________________________________
Bool_t MethodPyGTB::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
{
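   // PyGTB supports binary classification only (signal versus background).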
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   return kFALSE;
}

//_______________________________________________________________________
void MethodPyGTB::DeclareOptions()
{
   MethodBase::DeclareCompatibilityOptions();

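   // The options declared below mirror the constructor arguments of
   // sklearn.ensemble.GradientBoostingClassifier; the string-valued ones are
   // evaluated into Python objects in ProcessOptions() and passed to the
   // constructor in Train(). Rough Python equivalent (illustrative only):
   //
   //    from sklearn.ensemble import GradientBoostingClassifier
   //    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
   //                                     n_estimators=100, subsample=1.0)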
   DeclareOptionRef(loss, "Loss", "{'deviance', 'exponential'}, optional (default='deviance')\
      loss function to be optimized. 'deviance' refers to\
      deviance (= logistic regression) for classification\
      with probabilistic outputs. For loss 'exponential' gradient\
      boosting recovers the AdaBoost algorithm.");

   DeclareOptionRef(learning_rate, "LearningRate", "float, optional (default=0.1)\
      learning rate shrinks the contribution of each tree by `learning_rate`.\
      There is a trade-off between learning_rate and n_estimators.");

   DeclareOptionRef(n_estimators, "NEstimators", "int (default=100)\
      The number of boosting stages to perform. Gradient boosting\
      is fairly robust to over-fitting so a large number usually\
      results in better performance.");

   DeclareOptionRef(subsample, "Subsample", "float, optional (default=1.0)\
      The fraction of samples to be used for fitting the individual base\
      learners. If smaller than 1.0 this results in Stochastic Gradient\
      Boosting. `subsample` interacts with the parameter `n_estimators`.\
      Choosing `subsample < 1.0` leads to a reduction of variance\
      and an increase in bias.");

   DeclareOptionRef(min_samples_split, "MinSamplesSplit", "integer, optional (default=2)\
      The minimum number of samples required to split an internal node.");

   DeclareOptionRef(min_samples_leaf, "MinSamplesLeaf", "integer, optional (default=1)\
      The minimum number of samples in newly created leaves. A split is\
      discarded if after the split, one of the leaves would contain less than\
      ``min_samples_leaf`` samples.");

   DeclareOptionRef(min_weight_fraction_leaf, "MinWeightFractionLeaf", "float, optional (default=0.)\
      The minimum weighted fraction of the input samples required to be at a\
      leaf node.");

   DeclareOptionRef(max_depth, "MaxDepth", "integer or None, optional (default=None)\
      The maximum depth of the tree. If None, then nodes are expanded until\
      all leaves are pure or until all leaves contain less than\
      min_samples_split samples.\
      Ignored if ``max_leaf_nodes`` is not None.");

   DeclareOptionRef(init, "Init", "BaseEstimator, None, optional (default=None)\
      An estimator object that is used to compute the initial\
      predictions. ``init`` has to provide ``fit`` and ``predict``.\
      If None it uses ``loss.init_estimator``.");

   DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\
      If int, random_state is the seed used by the random number generator;\
      If RandomState instance, random_state is the random number generator;\
      If None, the random number generator is the RandomState instance used\
      by `np.random`.");

   DeclareOptionRef(max_features, "MaxFeatures", "The number of features to consider when looking for the best split");

   DeclareOptionRef(verbose, "Verbose", "int, optional (default=0)\
      Controls the verbosity of the tree building process.");

   DeclareOptionRef(max_leaf_nodes, "MaxLeafNodes", "int or None, optional (default=None)\
      Grow trees with ``max_leaf_nodes`` in best-first fashion.\
      Best nodes are defined as relative reduction in impurity.\
      If None then unlimited number of leaf nodes.\
      If not None then ``max_depth`` will be ignored.");

   DeclareOptionRef(warm_start, "WarmStart", "bool, optional (default=False)\
      When set to ``True``, reuse the solution of the previous call to fit\
      and add more estimators to the ensemble, otherwise, just fit a whole\
      new forest.");
}

//_______________________________________________________________________
void MethodPyGTB::ProcessOptions()
{
   if (loss != "deviance" && loss != "exponential") {
      Log() << kFATAL << Form(" Loss = %s... that does not work!", loss.Data())
            << " The options are deviance or exponential."
            << Endl;
   }

   if (learning_rate <= 0) {
      Log() << kERROR << " LearningRate <= 0... that does not work!"
            << " It is set to 0.1, just so that the program does not crash."
            << Endl;
      learning_rate = 0.1;
   }
   if (n_estimators <= 0) {
      Log() << kERROR << " NEstimators <= 0... that does not work!"
            << " It is set to 100, just so that the program does not crash."
            << Endl;
      n_estimators = 100;
   }
   if (min_samples_split < 0) {
      Log() << kERROR << " MinSamplesSplit < 0... that does not work!"
            << " It is set to 2, just so that the program does not crash."
            << Endl;
      min_samples_split = 2;
   }
   if (subsample < 0) {
      Log() << kERROR << " Subsample < 0... that does not work!"
            << " It is set to 1.0, just so that the program does not crash."
            << Endl;
      subsample = 1.0;
   }

   if (min_samples_leaf < 0) {
      Log() << kERROR << " MinSamplesLeaf < 0... that does not work!"
            << " It is set to 1, just so that the program does not crash."
            << Endl;
      min_samples_leaf = 1;
   }

   if (min_weight_fraction_leaf < 0) {
      Log() << kERROR << " MinWeightFractionLeaf < 0... that does not work!"
            << " It is set to 0.0, just so that the program does not crash."
            << Endl;
      min_weight_fraction_leaf = 0.0;
   }

   if (max_depth < 0) {
      Log() << kERROR << " MaxDepth < 0... that does not work!"
            << " It is set to 3, just so that the program does not crash."
            << Endl;
      max_depth = 3;
   }

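   // String-valued options ("None", an estimator expression, ...) are turned
   // into Python objects with PyMethodBase::Eval; a null return means the
   // string is not valid Python and is reported as fatal.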
   PyObject *poinit = Eval(init);
   if (!poinit) {
      Log() << kFATAL << Form(" Init = %s... that does not work!", init.Data())
            << " The options are None or BaseEstimator, an estimator object that is used to compute the initial"
            << " predictions. ``init`` has to provide ``fit`` and ``predict``."
            << " If None it uses ``loss.init_estimator``."
            << Endl;
   }
   Py_DECREF(poinit);

   PyObject *porandom_state = Eval(random_state);
   if (!porandom_state) {
      Log() << kFATAL << Form(" RandomState = %s... that does not work!", random_state.Data())
            << " If int, random_state is the seed used by the random number generator;"
            << " if RandomState instance, random_state is the random number generator;"
            << " if None, the random number generator is the RandomState instance used by `np.random`."
            << Endl;
   }
   Py_DECREF(porandom_state);

   if (max_features == "auto" || max_features == "sqrt" || max_features == "log2")
      max_features = Form("'%s'", max_features.Data());
   PyObject *pomax_features = Eval(max_features);
   if (!pomax_features) {
      Log() << kFATAL << Form(" MaxFeatures = %s... that does not work!", max_features.Data())
            << " int, float, string or None, optional (default='auto'):"
            << " the number of features to consider when looking for the best split."
            << " If int, then consider `max_features` features at each split."
            << " If float, then `max_features` is a percentage and"
            << " `int(max_features * n_features)` features are considered at each split."
            << " If 'auto', then `max_features=sqrt(n_features)`."
            << " If 'sqrt', then `max_features=sqrt(n_features)`."
            << " If 'log2', then `max_features=log2(n_features)`."
            << " If None, then `max_features=n_features`."
            << Endl;
   }
   Py_DECREF(pomax_features);

   PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
   if (!pomax_leaf_nodes) {
      Log() << kFATAL << Form(" MaxLeafNodes = %s... that does not work!", max_leaf_nodes.Data())
            << " The options are None or integer."
            << Endl;
   }
   Py_DECREF(pomax_leaf_nodes);
}

//_______________________________________________________________________
void MethodPyGTB::Init()
{
   ProcessOptions();
   _import_array(); // required to use numpy arrays

   // Import sklearn
   // Convert the module name to a Python string.
   PyObject *pName = PyUnicode_FromString("sklearn.ensemble");
   // Import the module.
   fModule = PyImport_Import(pName);
   Py_DECREF(pName);

   if (!fModule) {
      Log() << kFATAL << "Can't import sklearn.ensemble" << Endl;
      Log() << Endl;
   }

   // Training data
   UInt_t fNvars = Data()->GetNVariables();
   int fNrowsTraining = Data()->GetNTrainingEvents(); // every row is an event, a class type and a weight
   int *dims = new int[2];
   dims[0] = fNrowsTraining;
   dims[1] = fNvars;
   fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
   delete[] dims; // PyArray_FromDims copies the dimensions, so the buffer can be freed
   float *TrainData = (float *)(PyArray_DATA(fTrainData));

   fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

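   // Copy the training set into the flat numpy buffers: variable j of event i
   // goes to TrainData[i*fNvars + j] (row-major), while the class label
   // (kSignal = 0, kBackground = 1) and the event weight fill the 1-d arrays.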
   for (int i = 0; i < fNrowsTraining; i++) {
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }
      if (e->GetClass() == TMVA::Types::kSignal) TrainDataClasses[i] = TMVA::Types::kSignal;
      else TrainDataClasses[i] = TMVA::Types::kBackground;

      TrainDataWeights[i] = e->GetWeight();
   }
}

//_______________________________________________________________________
void MethodPyGTB::Train()
{
   // NOTE: max_features can hold three different types: int, float and string;
   // look for a solution with PyObject
   PyObject *poinit = Eval(init);
   PyObject *porandom_state = Eval(random_state);
   PyObject *pomax_features = Eval(max_features);
   PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);

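   // Build the constructor argument tuple; the Py_BuildValue format string
   // maps one character per argument: s=loss, f=learning_rate, i=n_estimators,
   // f=subsample, i=min_samples_split, i=min_samples_leaf,
   // f=min_weight_fraction_leaf, i=max_depth, O=init, O=random_state,
   // O=max_features, i=verbose, O=max_leaf_nodes, i=warm_start.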
   PyObject *args = Py_BuildValue("(sfifiifiOOOiOi)", loss.Data(), \
      learning_rate, n_estimators, subsample, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, \
      max_depth, poinit, porandom_state, pomax_features, verbose, pomax_leaf_nodes, warm_start);

   PyObject_Print(args, stdout, 0);
   std::cout << std::endl;

   PyObject *pDict = PyModule_GetDict(fModule);
   PyObject *fClassifierClass = PyDict_GetItemString(pDict, "GradientBoostingClassifier");

   // Create an instance of the class
   if (PyCallable_Check(fClassifierClass)) {
      // instance
      fClassifier = PyObject_CallObject(fClassifierClass, args);
      PyObject_Print(fClassifier, stdout, 0);
      std::cout << std::endl;

      Py_DECREF(poinit);
      Py_DECREF(porandom_state);
      Py_DECREF(pomax_features);
      Py_DECREF(pomax_leaf_nodes);
      Py_DECREF(args);
   } else {
      PyErr_Print();
      Py_DECREF(poinit);
      Py_DECREF(porandom_state);
      Py_DECREF(pomax_features);
      Py_DECREF(pomax_leaf_nodes);
      Py_DECREF(args);
      Py_DECREF(pDict);
      Py_DECREF(fClassifierClass);
      Log() << kFATAL << "Can't call function GradientBoostingClassifier" << Endl;
      Log() << Endl;
   }

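   // scikit-learn's fit(X, y, sample_weight) returns the fitted estimator
   // itself, so fClassifier now points to the trained model.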
   fClassifier = PyObject_CallMethod(fClassifier, (char *)"fit", (char *)"(OOO)", fTrainData, fTrainDataClasses, fTrainDataWeights);

   TString path = GetWeightFileDir() + "/PyGTBModel.PyData";
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
   Log() << Endl;

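   // PyMethodBase::Serialize persists the fitted estimator (presumably via
   // Python pickling) to the weight-file directory; ReadStateFromFile()
   // restores it with UnSerialize for the application phase.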
   Serialize(path, fClassifier);
}

//_______________________________________________________________________
void MethodPyGTB::TestClassification()
{
   MethodBase::TestClassification();
}

//_______________________________________________________________________
Double_t MethodPyGTB::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // cannot determine error
   NoErrorCalc(errLower, errUpper);

   Double_t mvaValue;
   const TMVA::Event *e = Data()->GetEvent();
   UInt_t nvars = e->GetNVariables();
   int *dims = new int[2];
   dims[0] = 1;
   dims[1] = nvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));

   for (UInt_t i = 0; i < nvars; i++) pValue[i] = e->GetValue(i);

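   // predict_proba returns an (n_events, n_classes) array of class
   // probabilities; proba[0] is the probability of class 0, i.e. the signal
   // class, since Types::kSignal == 0.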
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   mvaValue = proba[0]; // signal probability
   Py_DECREF(result);
   Py_DECREF(pEvent);
   delete[] dims; // allocated with new[], so use delete[]
   return mvaValue;
}

//_______________________________________________________________________
void MethodPyGTB::ReadStateFromFile()
{
   if (!PyIsInitialized()) {
      PyInitialize();
   }

   TString path = GetWeightFileDir() + "/PyGTBModel.PyData";
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
   Log() << Endl;
   UnSerialize(path, &fClassifier);
}

//_______________________________________________________________________
void MethodPyGTB::GetHelpMessage() const
{
   // get help message text
   //
   // typical length of text line:
   // "|--------------------------------------------------------------|"
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "Decision Trees and Rule-Based Models " << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "<None>" << Endl;
}