MethodPyGTB.cxx
// @(#)root/tmva/pymva $Id$
// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015

/**********************************************************************************
 * Project: TMVA - a ROOT-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : MethodPyGTB                                                          *
 * Web    : http://oproject.org                                                  *
 *                                                                                *
 * Description:                                                                  *
 *      GradientBoostingClassifier from scikit-learn                             *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without            *
 * modification, are permitted according to the terms listed in LICENSE          *
 * (http://tmva.sourceforge.net/LICENSE)                                         *
 **********************************************************************************/
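
// Typical usage from a TMVA application (a minimal sketch, assuming the
// standard Factory interface of this ROOT version; the file name, output
// file and option values are illustrative, not taken from this file):
//
//    TFile *outputFile = TFile::Open("TMVA.root", "RECREATE");
//    TMVA::Factory factory("TMVAClassification", outputFile,
//                          "AnalysisType=Classification");
//    factory.BookMethod(TMVA::Types::kPyGTB, "PyGTB",
//                       "NEstimators=100:LearningRate=0.1:MaxDepth=3");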

#pragma GCC diagnostic ignored "-Wunused-parameter"
#include <iomanip>
#include <fstream>

#include <Python.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>

#include "TMath.h"
#include "Riostream.h"
#include "TMatrix.h"
#include "TMatrixD.h"
#include "TVectorD.h"

#include "TMVA/MethodPyGTB.h"
#include "TMVA/Tools.h"
#include "TMVA/Ranking.h"
#include "TMVA/Types.h"
#include "TMVA/Config.h"
#include "TMVA/PDF.h"
#include "TMVA/ClassifierFactory.h"

#include "TMVA/Results.h"

using namespace TMVA;

REGISTER_METHOD(PyGTB)

ClassImp(MethodPyGTB)

//_______________________________________________________________________
MethodPyGTB::MethodPyGTB(const TString &jobName,
                         const TString &methodTitle,
                         DataSetInfo &dsi,
                         const TString &theOption,
                         TDirectory *theTargetDir) :
   PyMethodBase(jobName, Types::kPyGTB, methodTitle, dsi, theOption, theTargetDir),
   loss("deviance"),
   learning_rate(0.1),
   n_estimators(100),
   subsample(1.0),
   min_samples_split(2),
   min_samples_leaf(1),
   min_weight_fraction_leaf(0.0),
   max_depth(3),
   init("None"),
   random_state("None"),
   max_features("None"),
   verbose(0),
   max_leaf_nodes("None"),
   warm_start(kFALSE)
{
   // standard constructor for the PyGTB method
   SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
}

//_______________________________________________________________________
MethodPyGTB::MethodPyGTB(DataSetInfo &theData, const TString &theWeightFile, TDirectory *theTargetDir)
   : PyMethodBase(Types::kPyGTB, theData, theWeightFile, theTargetDir),
     loss("deviance"),
     learning_rate(0.1),
     n_estimators(100),
     subsample(1.0),
     min_samples_split(2),
     min_samples_leaf(1),
     min_weight_fraction_leaf(0.0),
     max_depth(3),
     init("None"),
     random_state("None"),
     max_features("None"),
     verbose(0),
     max_leaf_nodes("None"),
     warm_start(kFALSE)
{
   // constructor from a weight file
   SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
}

//_______________________________________________________________________
MethodPyGTB::~MethodPyGTB()
{
}

//_______________________________________________________________________
Bool_t MethodPyGTB::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
{
   // PyGTB supports binary classification only
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   return kFALSE;
}

//_______________________________________________________________________
void MethodPyGTB::DeclareOptions()
{
   MethodBase::DeclareCompatibilityOptions();

   DeclareOptionRef(loss, "Loss", "{'deviance', 'exponential'}, optional (default='deviance')\
   loss function to be optimized. 'deviance' refers to\
   deviance (= logistic regression) for classification\
   with probabilistic outputs. For loss 'exponential' gradient\
   boosting recovers the AdaBoost algorithm.");

   DeclareOptionRef(learning_rate, "LearningRate", "float, optional (default=0.1)\
   learning rate shrinks the contribution of each tree by `learning_rate`.\
   There is a trade-off between learning_rate and n_estimators.");

   DeclareOptionRef(n_estimators, "NEstimators", "int (default=100)\
   The number of boosting stages to perform. Gradient boosting\
   is fairly robust to over-fitting so a large number usually\
   results in better performance.");

   DeclareOptionRef(subsample, "Subsample", "float, optional (default=1.0)\
   The fraction of samples to be used for fitting the individual base\
   learners. If smaller than 1.0 this results in Stochastic Gradient\
   Boosting. `subsample` interacts with the parameter `n_estimators`.\
   Choosing `subsample < 1.0` leads to a reduction of variance\
   and an increase in bias.");

   DeclareOptionRef(min_samples_split, "MinSamplesSplit", "integer, optional (default=2)\
   The minimum number of samples required to split an internal node.");

   DeclareOptionRef(min_samples_leaf, "MinSamplesLeaf", "integer, optional (default=1) \
   The minimum number of samples in newly created leaves. A split is \
   discarded if, after the split, one of the leaves would contain less than \
   ``min_samples_leaf`` samples.");

   DeclareOptionRef(min_weight_fraction_leaf, "MinWeightFractionLeaf", "float, optional (default=0.) \
   The minimum weighted fraction of the input samples required to be at a \
   leaf node.");

   DeclareOptionRef(max_depth, "MaxDepth", "integer or None, optional (default=None) \
   The maximum depth of the tree. If None, then nodes are expanded until \
   all leaves are pure or until all leaves contain less than \
   min_samples_split samples. \
   Ignored if ``max_leaf_nodes`` is not None.");

   DeclareOptionRef(init, "Init", "BaseEstimator, None, optional (default=None)\
   An estimator object that is used to compute the initial\
   predictions. ``init`` has to provide ``fit`` and ``predict``.\
   If None it uses ``loss.init_estimator``.");

   DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\
   If int, random_state is the seed used by the random number generator;\
   If RandomState instance, random_state is the random number generator;\
   If None, the random number generator is the RandomState instance used\
   by `np.random`.");

   DeclareOptionRef(max_features, "MaxFeatures", "The number of features to consider when looking for the best split");

   DeclareOptionRef(verbose, "Verbose", "int, optional (default=0)\
   Controls the verbosity of the tree building process.");

   DeclareOptionRef(max_leaf_nodes, "MaxLeafNodes", "int or None, optional (default=None)\
   Grow trees with ``max_leaf_nodes`` in best-first fashion.\
   Best nodes are defined as relative reduction in impurity.\
   If None then unlimited number of leaf nodes.\
   If not None then ``max_depth`` will be ignored.");

   DeclareOptionRef(warm_start, "WarmStart", "bool, optional (default=False)\
   When set to ``True``, reuse the solution of the previous call to fit\
   and add more estimators to the ensemble, otherwise, just fit a whole\
   new forest.");
}
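
// Example of a resulting booking option string (illustrative values only;
// any option not given keeps the scikit-learn default documented above):
//
//    "Loss=deviance:NEstimators=200:LearningRate=0.05:MaxDepth=3:Subsample=0.8"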

//_______________________________________________________________________
void MethodPyGTB::ProcessOptions()
{
   if (loss != "deviance" && loss != "exponential") {
      Log() << kFATAL << Form(" Loss = %s... that does not work !! ", loss.Data())
            << " The options are deviance or exponential."
            << Endl;
   }

   if (learning_rate <= 0) {
      Log() << kERROR << " LearningRate <=0... that does not work !! "
            << " I set it to 0.1 .. just so that the program does not crash"
            << Endl;
      learning_rate = 0.1;
   }
   if (n_estimators <= 0) {
      Log() << kERROR << " NEstimators <=0... that does not work !! "
            << " I set it to 100 .. just so that the program does not crash"
            << Endl;
      n_estimators = 100;
   }
   if (min_samples_split < 0) {
      Log() << kERROR << " MinSamplesSplit <0... that does not work !! "
            << " I set it to 2 .. just so that the program does not crash"
            << Endl;
      min_samples_split = 2;
   }
   if (subsample < 0) {
      Log() << kERROR << " Subsample <0... that does not work !! "
            << " I set it to 1.0 .. just so that the program does not crash"
            << Endl;
      subsample = 1.0;
   }

   if (min_samples_leaf < 0) {
      Log() << kERROR << " MinSamplesLeaf <0... that does not work !! "
            << " I set it to 1 .. just so that the program does not crash"
            << Endl;
      min_samples_leaf = 1;
   }

   if (min_weight_fraction_leaf < 0) {
      Log() << kERROR << " MinWeightFractionLeaf <0... that does not work !! "
            << " I set it to 0.0 .. just so that the program does not crash"
            << Endl;
      min_weight_fraction_leaf = 0.0;
   }

   if (max_depth < 0) {
      Log() << kERROR << " MaxDepth <0... that does not work !! "
            << " I set it to 3 .. just so that the program does not crash"
            << Endl;
      max_depth = 3;
   }

   PyObject *poinit = Eval(init);
   if (!poinit) {
      Log() << kFATAL << Form(" Init = %s... that does not work !! ", init.Data())
            << " The options are None or BaseEstimator. An estimator object that is used to compute the initial"
            << " predictions. ``init`` has to provide ``fit`` and ``predict``."
            << " If None it uses ``loss.init_estimator``."
            << Endl;
   }
   Py_DECREF(poinit);

   PyObject *porandom_state = Eval(random_state);
   if (!porandom_state) {
      Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", random_state.Data())
            << "If int, random_state is the seed used by the random number generator;"
            << "If RandomState instance, random_state is the random number generator;"
            << "If None, the random number generator is the RandomState instance used by `np.random`."
            << Endl;
   }
   Py_DECREF(porandom_state);

   if (max_features == "auto" || max_features == "sqrt" || max_features == "log2") max_features = Form("'%s'", max_features.Data());
   PyObject *pomax_features = Eval(max_features);
   if (!pomax_features) {
      Log() << kFATAL << Form(" MaxFeatures = %s... that does not work !! ", max_features.Data())
            << "int, float, string or None, optional (default='auto')"
            << "The number of features to consider when looking for the best split:"
            << "If int, then consider `max_features` features at each split."
            << "If float, then `max_features` is a percentage and"
            << "`int(max_features * n_features)` features are considered at each split."
            << "If 'auto', then `max_features=sqrt(n_features)`."
            << "If 'sqrt', then `max_features=sqrt(n_features)`."
            << "If 'log2', then `max_features=log2(n_features)`."
            << "If None, then `max_features=n_features`."
            << Endl;
   }
   Py_DECREF(pomax_features);

   // verbose(0),
   PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
   if (!pomax_leaf_nodes) {
      Log() << kFATAL << Form(" MaxLeafNodes = %s... that does not work !! ", max_leaf_nodes.Data())
            << " The options are None or integer."
            << Endl;
   }
   Py_DECREF(pomax_leaf_nodes);

   // warm_start(kFALSE)
}
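
// The Init, RandomState, MaxFeatures and MaxLeafNodes options above are
// evaluated as Python expressions via PyMethodBase::Eval(). Illustrative
// values that would pass these checks (a sketch, not taken from this file):
//
//    "RandomState=42"       // an int seed
//    "MaxFeatures=sqrt"     // wrapped in quotes above, so Python sees 'sqrt'
//    "MaxLeafNodes=None"    // Python None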

//_______________________________________________________________________
void MethodPyGTB::Init()
{
   ProcessOptions();
   _import_array(); // required in order to use numpy arrays

   // Import sklearn
   // Convert the module name to a Python string.
   PyObject *pName = PyString_FromString("sklearn.ensemble");
   // Import it as a Python module.
   fModule = PyImport_Import(pName);
   Py_DECREF(pName);

   if (!fModule) {
      Log() << kFATAL << "Can't import sklearn.ensemble" << Endl;
      Log() << Endl;
   }

   // Training data: every row is an event; classes and weights go into
   // parallel 1-D arrays
   UInt_t fNvars = Data()->GetNVariables();
   int fNrowsTraining = Data()->GetNTrainingEvents();
   int dims[2];
   dims[0] = fNrowsTraining;
   dims[1] = fNvars;
   fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));

   fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

   for (int i = 0; i < fNrowsTraining; i++) {
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      // fill the i-th row of the (fNrowsTraining x fNvars) feature matrix
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }
      if (e->GetClass() == TMVA::Types::kSignal) TrainDataClasses[i] = TMVA::Types::kSignal;
      else TrainDataClasses[i] = TMVA::Types::kBackground;

      TrainDataWeights[i] = e->GetWeight();
   }
}
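
// Memory layout of TrainData (descriptive note): the flat float buffer is the
// row-major (C-order) equivalent of a 2-D numpy array of shape
// (fNrowsTraining, fNvars), so event i / variable j lives at
//
//    TrainData[j + i * fNvars]   // == X[i][j] on the Python side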

//_______________________________________________________________________
void MethodPyGTB::Train()
{
   // loss("deviance"),
   // learning_rate(0.1),
   // n_estimators(100),
   // subsample(1.0),
   // min_samples_split(2),
   // min_samples_leaf(1),
   // min_weight_fraction_leaf(0.0),
   // max_depth(3),
   // init("None"),
   // random_state("None"),
   // max_features("None"),
   // verbose(0),
   // max_leaf_nodes("None"),
   // warm_start(kFALSE)

   // NOTE: max_features must accept three different value types (int, float
   // and string); this is handled by passing the Eval()'ed PyObject directly
   PyObject *poinit = Eval(init);
   PyObject *porandom_state = Eval(random_state);
   PyObject *pomax_features = Eval(max_features);
   PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);

   PyObject *args = Py_BuildValue("(sfifiifiOOOiOi)", loss.Data(), \
                                  learning_rate, n_estimators, subsample, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, \
                                  max_depth, poinit, porandom_state, pomax_features, verbose, pomax_leaf_nodes, warm_start);

   PyObject_Print(args, stdout, 0);
   std::cout << std::endl;

   PyObject *pDict = PyModule_GetDict(fModule);
   PyObject *fClassifierClass = PyDict_GetItemString(pDict, "GradientBoostingClassifier");

   // Create an instance of the class
   if (PyCallable_Check(fClassifierClass)) {
      // instance
      fClassifier = PyObject_CallObject(fClassifierClass, args);
      PyObject_Print(fClassifier, stdout, 0);
      std::cout << std::endl;

      Py_DECREF(poinit);
      Py_DECREF(porandom_state);
      Py_DECREF(pomax_features);
      Py_DECREF(pomax_leaf_nodes);
      Py_DECREF(args);
   } else {
      PyErr_Print();
      Py_DECREF(poinit);
      Py_DECREF(porandom_state);
      Py_DECREF(pomax_features);
      Py_DECREF(pomax_leaf_nodes);
      Py_DECREF(args);
      Py_DECREF(pDict);
      Py_DECREF(fClassifierClass);
      Log() << kFATAL << "Can't call function GradientBoostingClassifier" << Endl;
      Log() << Endl;
   }

   // fit() returns the trained classifier itself
   fClassifier = PyObject_CallMethod(fClassifier, (char *)"fit", (char *)"(OOO)", fTrainData, fTrainDataClasses, fTrainDataWeights);
   // PyObject_Print(fClassifier, stdout, 0);
   // std::cout << std::endl;
   // pValue = PyObject_CallObject(fClassifier, PyString_FromString("classes_"));
   // PyObject_Print(pValue, stdout, 0);

   TString path = GetWeightFileDir() + "/PyGTBModel.PyData";
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
   Log() << Endl;

   // serialize the trained classifier with pickle and write it to the state file
   PyObject *model_arg = Py_BuildValue("(O)", fClassifier);
   PyObject *model_data = PyObject_CallObject(fPickleDumps, model_arg);
   std::ofstream PyData;
   PyData.open(path.Data());
   PyData << PyString_AsString(model_data);
   PyData.close();
   Py_DECREF(model_arg);
   Py_DECREF(model_data);
}
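
// For reference, the Py_BuildValue format string "(sfifiifiOOOiOi)" above maps
// the options, in order, onto the positional arguments of
// sklearn.ensemble.GradientBoostingClassifier (a sketch of the intent, read
// off this file's argument list): loss (s), learning_rate (f),
// n_estimators (i), subsample (f), min_samples_split (i),
// min_samples_leaf (i), min_weight_fraction_leaf (f), max_depth (i),
// init (O), random_state (O), max_features (O), verbose (i),
// max_leaf_nodes (O), warm_start (i).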

//_______________________________________________________________________
void MethodPyGTB::TestClassification()
{
   MethodBase::TestClassification();
}


//_______________________________________________________________________
Double_t MethodPyGTB::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // cannot determine error
   NoErrorCalc(errLower, errUpper);

   Double_t mvaValue;
   const TMVA::Event *e = Data()->GetEvent();
   UInt_t nvars = e->GetNVariables();
   // pack the current event into a Python tuple of floats
   PyObject *pEvent = PyTuple_New(nvars);
   for (UInt_t i = 0; i < nvars; i++) {
      PyObject *pValue = PyFloat_FromDouble(e->GetValue(i));
      if (!pValue) {
         Py_DECREF(pEvent);
         Py_DECREF(fTrainData);
         Log() << kFATAL << "Error Evaluating MVA " << Endl;
      }
      PyTuple_SetItem(pEvent, i, pValue);
   }
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, (char *)"predict_proba", (char *)"(O)", pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   mvaValue = proba[1]; // getting signal prob
   Py_DECREF(result);
   Py_DECREF(pEvent);
   return mvaValue;
}
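
// Note on predict_proba (general scikit-learn behaviour, not specific to this
// file): it returns one probability per class and per event, with columns
// ordered as in the classifier's classes_ attribute; proba[1] above is thus
// the second of the two class probabilities for the single event passed in.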

//_______________________________________________________________________
void MethodPyGTB::ReadStateFromFile()
{
   if (!PyIsInitialized()) {
      PyInitialize();
   }

   TString path = GetWeightFileDir() + "/PyGTBModel.PyData";
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
   Log() << Endl;
   std::ifstream PyData;
   std::stringstream PyDataStream;
   std::string PyDataString;

   PyData.open(path.Data());
   PyDataStream << PyData.rdbuf();
   PyDataString = PyDataStream.str();
   PyData.close();

   // unpickle the classifier from the stored string
   PyObject *model_arg = Py_BuildValue("(s)", PyDataString.c_str());
   fClassifier = PyObject_CallObject(fPickleLoads, model_arg);

   Py_DECREF(model_arg);
}

//_______________________________________________________________________
void MethodPyGTB::GetHelpMessage() const
{
   // get help message text
   //
   // typical length of text line:
   // "|--------------------------------------------------------------|"
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "Decision Trees and Rule-Based Models " << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "<None>" << Endl;
}