ROOT 6.07/09 Reference Guide
MethodPyGTB.cxx
// @(#)root/tmva/pymva $Id$
// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : MethodPyGTB                                                          *
 * Web    : http://oproject.org                                                  *
 *                                                                                *
 * Description:                                                                   *
 *      GradientBoostingClassifier from scikit-learn                             *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without            *
 * modification, are permitted according to the terms listed in LICENSE          *
 * (http://tmva.sourceforge.net/LICENSE)                                         *
 **********************************************************************************/

#include <Python.h> // Needs to be included first to avoid redefinition of _POSIX_C_SOURCE
#include "TMVA/MethodPyGTB.h"

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>

#pragma GCC diagnostic ignored "-Wunused-parameter"
#include "TMVA/Configurable.h"
#include "TMVA/ClassifierFactory.h"
#include "TMVA/Config.h"
#include "TMVA/DataSet.h"
#include "TMVA/Event.h"
#include "TMVA/IMethod.h"
#include "TMVA/MsgLogger.h"
#include "TMVA/PDF.h"
#include "TMVA/Ranking.h"
#include "TMVA/Results.h"
#include "TMVA/Tools.h"
#include "TMVA/Types.h"
#include "TMVA/Timer.h"

#include "Riostream.h"
#include "TMath.h"
#include "TMatrix.h"
#include "TMatrixD.h"
#include "TVectorD.h"

#include <iomanip>
#include <fstream>

using namespace TMVA;

REGISTER_METHOD(PyGTB)

ClassImp(MethodPyGTB)

//_______________________________________________________________________
MethodPyGTB::MethodPyGTB(const TString &jobName,
                         const TString &methodTitle,
                         DataSetInfo &dsi,
                         const TString &theOption) :
   PyMethodBase(jobName, Types::kPyGTB, methodTitle, dsi, theOption),
   loss("deviance"),
   learning_rate(0.1),
   n_estimators(100),
   subsample(1.0),
   min_samples_split(2),
   min_samples_leaf(1),
   min_weight_fraction_leaf(0.0),
   max_depth(3),
   init("None"),
   random_state("None"),
   max_features("None"),
   verbose(0),
   max_leaf_nodes("None"),
   warm_start(kFALSE)
{
}

//_______________________________________________________________________
MethodPyGTB::MethodPyGTB(DataSetInfo &theData, const TString &theWeightFile)
   : PyMethodBase(Types::kPyGTB, theData, theWeightFile),
   loss("deviance"),
   learning_rate(0.1),
   n_estimators(100),
   subsample(1.0),
   min_samples_split(2),
   min_samples_leaf(1),
   min_weight_fraction_leaf(0.0),
   max_depth(3),
   init("None"),
   random_state("None"),
   max_features("None"),
   verbose(0),
   max_leaf_nodes("None"),
   warm_start(kFALSE)
{
}

//_______________________________________________________________________
MethodPyGTB::~MethodPyGTB()
{
}

//_______________________________________________________________________
Bool_t MethodPyGTB::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   return kFALSE;
}

//_______________________________________________________________________
void MethodPyGTB::DeclareOptions()
{
   MethodBase::DeclareCompatibilityOptions();

   DeclareOptionRef(loss, "Loss", "{'deviance', 'exponential'}, optional (default='deviance')\
   loss function to be optimized. 'deviance' refers to\
   deviance (= logistic regression) for classification\
   with probabilistic outputs. For loss 'exponential' gradient\
   boosting recovers the AdaBoost algorithm.");

   DeclareOptionRef(learning_rate, "LearningRate", "float, optional (default=0.1)\
   learning rate shrinks the contribution of each tree by `learning_rate`.\
   There is a trade-off between learning_rate and n_estimators.");

   DeclareOptionRef(n_estimators, "NEstimators", "int (default=100)\
   The number of boosting stages to perform. Gradient boosting\
   is fairly robust to over-fitting so a large number usually\
   results in better performance.");

   DeclareOptionRef(subsample, "Subsample", "float, optional (default=1.0)\
   The fraction of samples to be used for fitting the individual base\
   learners. If smaller than 1.0 this results in Stochastic Gradient\
   Boosting. `subsample` interacts with the parameter `n_estimators`.\
   Choosing `subsample < 1.0` leads to a reduction of variance\
   and an increase in bias.");

   DeclareOptionRef(min_samples_split, "MinSamplesSplit", "integer, optional (default=2)\
   The minimum number of samples required to split an internal node.");

   DeclareOptionRef(min_samples_leaf, "MinSamplesLeaf", "integer, optional (default=1) \
   The minimum number of samples in newly created leaves. A split is \
   discarded if, after the split, one of the leaves would contain fewer than \
   ``min_samples_leaf`` samples.");

   DeclareOptionRef(min_weight_fraction_leaf, "MinWeightFractionLeaf", "float, optional (default=0.) \
   The minimum weighted fraction of the input samples required to be at a \
   leaf node.");

   DeclareOptionRef(max_depth, "MaxDepth", "integer or None, optional (default=None) \
   The maximum depth of the tree. If None, then nodes are expanded until \
   all leaves are pure or until all leaves contain less than \
   min_samples_split samples. \
   Ignored if ``max_leaf_nodes`` is not None.");

   DeclareOptionRef(init, "Init", "BaseEstimator, None, optional (default=None)\
   An estimator object that is used to compute the initial\
   predictions. ``init`` has to provide ``fit`` and ``predict``.\
   If None it uses ``loss.init_estimator``.");

   DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\
   If int, random_state is the seed used by the random number generator;\
   If RandomState instance, random_state is the random number generator;\
   If None, the random number generator is the RandomState instance used\
   by `np.random`.");

   DeclareOptionRef(max_features, "MaxFeatures", "The number of features to consider when looking for the best split");

   DeclareOptionRef(verbose, "Verbose", "int, optional (default=0)\
   Controls the verbosity of the tree building process.");

   DeclareOptionRef(max_leaf_nodes, "MaxLeafNodes", "int or None, optional (default=None)\
   Grow trees with ``max_leaf_nodes`` in best-first fashion.\
   Best nodes are defined as relative reduction in impurity.\
   If None then unlimited number of leaf nodes.\
   If not None then ``max_depth`` will be ignored.");

   DeclareOptionRef(warm_start, "WarmStart", "bool, optional (default=False)\
   When set to ``True``, reuse the solution of the previous call to fit\
   and add more estimators to the ensemble, otherwise, just fit a whole\
   new forest.");
}
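
// Usage sketch (illustrative): the option names declared above are what a
// user passes when booking this method through the TMVA factory. The values
// shown are simply the defaults, and the `factory` variable is an assumption.
//
//    factory->BookMethod(TMVA::Types::kPyGTB, "PyGTB",
//                        "NEstimators=100:LearningRate=0.1:MaxDepth=3:Loss=deviance");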

//_______________________________________________________________________
void MethodPyGTB::ProcessOptions()
{
   if (loss != "deviance" && loss != "exponential") {
      Log() << kFATAL << Form(" Loss = %s... that does not work !! ", loss.Data())
            << " The options are deviance or exponential."
            << Endl;
   }

   if (learning_rate <= 0) {
      Log() << kERROR << " LearningRate <=0... that does not work !! "
            << " I set it to 0.1 .. just so that the program does not crash"
            << Endl;
      learning_rate = 0.1;
   }
   if (n_estimators <= 0) {
      Log() << kERROR << " NEstimators <=0... that does not work !! "
            << " I set it to 100 .. just so that the program does not crash"
            << Endl;
      n_estimators = 100;
   }
   if (min_samples_split < 0) {
      Log() << kERROR << " MinSamplesSplit <0... that does not work !! "
            << " I set it to 2 .. just so that the program does not crash"
            << Endl;
      min_samples_split = 2;
   }
   if (subsample < 0) {
      Log() << kERROR << " Subsample <0... that does not work !! "
            << " I set it to 1.0 .. just so that the program does not crash"
            << Endl;
      subsample = 1.0;
   }

   if (min_samples_leaf < 0) {
      Log() << kERROR << " MinSamplesLeaf <0... that does not work !! "
            << " I set it to 1 .. just so that the program does not crash"
            << Endl;
      min_samples_leaf = 1;
   }

   if (min_weight_fraction_leaf < 0) {
      Log() << kERROR << " MinWeightFractionLeaf <0... that does not work !! "
            << " I set it to 0.0 .. just so that the program does not crash"
            << Endl;
      min_weight_fraction_leaf = 0.0;
   }

   if (max_depth < 0) {
      Log() << kERROR << " MaxDepth <0... that does not work !! "
            << " I set it to 3 .. just so that the program does not crash"
            << Endl;
      max_depth = 3;
   }

   PyObject *poinit = Eval(init);
   if (!poinit) {
      Log() << kFATAL << Form(" Init = %s... that does not work !! ", init.Data())
            << " The options are None or BaseEstimator. An estimator object that is used to compute the initial"
            << " predictions. ``init`` has to provide ``fit`` and ``predict``."
            << " If None it uses ``loss.init_estimator``."
            << Endl;
   }
   Py_DECREF(poinit);

   PyObject *porandom_state = Eval(random_state);
   if (!porandom_state) {
      Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", random_state.Data())
            << "If int, random_state is the seed used by the random number generator;"
            << "If RandomState instance, random_state is the random number generator;"
            << "If None, the random number generator is the RandomState instance used by `np.random`."
            << Endl;
   }
   Py_DECREF(porandom_state);

   if (max_features == "auto" || max_features == "sqrt" || max_features == "log2") max_features = Form("'%s'", max_features.Data());
   PyObject *pomax_features = Eval(max_features);
   if (!pomax_features) {
      Log() << kFATAL << Form(" MaxFeatures = %s... that does not work !! ", max_features.Data())
            << "int, float, string or None, optional (default='auto')"
            << "The number of features to consider when looking for the best split:"
            << "If int, then consider `max_features` features at each split."
            << "If float, then `max_features` is a percentage and"
            << "`int(max_features * n_features)` features are considered at each split."
            << "If 'auto', then `max_features=sqrt(n_features)`."
            << "If 'sqrt', then `max_features=sqrt(n_features)`."
            << "If 'log2', then `max_features=log2(n_features)`."
            << "If None, then `max_features=n_features`."
            << Endl;
   }
   Py_DECREF(pomax_features);

   PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
   if (!pomax_leaf_nodes) {
      Log() << kFATAL << Form(" MaxLeafNodes = %s... that does not work !! ", max_leaf_nodes.Data())
            << " The options are None or integer."
            << Endl;
   }
   Py_DECREF(pomax_leaf_nodes);
}
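
// Note on the Eval() checks above: the option strings are evaluated as Python
// literals, so "None" should resolve to Python's None, a bare number to a
// Python int, and a quoted token such as "'sqrt'" to a Python string; this is
// why MaxFeatures is wrapped in quotes for the named presets. A minimal
// sketch, assuming PyMethodBase::Eval acts as an expression evaluator:
//
//    PyObject *o = Eval("None");      // Python None
//    PyObject *n = Eval("3");         // Python int 3
//    PyObject *s = Eval("'sqrt'");    // Python str 'sqrt'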

//_______________________________________________________________________
void MethodPyGTB::Init()
{
   ProcessOptions();
   _import_array(); // required to use numpy arrays

   // Import sklearn
   // Convert the module name to a Python string.
   PyObject *pName = PyUnicode_FromString("sklearn.ensemble");
   // Import the module.
   fModule = PyImport_Import(pName);
   Py_DECREF(pName);

   if (!fModule) {
      Log() << kFATAL << "Can't import sklearn.ensemble" << Endl;
      Log() << Endl;
   }

   // Training data
   UInt_t fNvars = Data()->GetNVariables();
   int fNrowsTraining = Data()->GetNTrainingEvents(); // every row is an event, a class type and a weight
   int dims[2];
   dims[0] = fNrowsTraining;
   dims[1] = fNvars;
   fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));

   fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

   for (int i = 0; i < fNrowsTraining; i++) {
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }
      if (e->GetClass() == TMVA::Types::kSignal) TrainDataClasses[i] = TMVA::Types::kSignal;
      else TrainDataClasses[i] = TMVA::Types::kBackground;

      TrainDataWeights[i] = e->GetWeight();
   }
}
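
// Data-layout note for the loop above: fTrainData is a flat, row-major
// buffer with one row per event, so variable j of event i sits at index
// j + i * fNvars. For example, with fNvars = 2 the buffer reads
//    { e0.v0, e0.v1, e1.v0, e1.v1, ... }
// matching the (fNrowsTraining x fNvars) shape passed to PyArray_FromDims.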

void MethodPyGTB::Train()
{
   //NOTE: max_features must accept three different types: int, float and string
   //search a solution with PyObject
   PyObject *poinit = Eval(init);
   PyObject *porandom_state = Eval(random_state);
   PyObject *pomax_features = Eval(max_features);
   PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);

   PyObject *args = Py_BuildValue("(sfifiifiOOOiOi)", loss.Data(), \
                                  learning_rate, n_estimators, subsample, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, \
                                  max_depth, poinit, porandom_state, pomax_features, verbose, pomax_leaf_nodes, warm_start);
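   // Map of the Py_BuildValue format string "(sfifiifiOOOiOi)" above, using
   // the standard Python C API codes (s = C string, f = float, i = int,
   // O = PyObject*):
   //   s:loss  f:learning_rate  i:n_estimators  f:subsample
   //   i:min_samples_split  i:min_samples_leaf  f:min_weight_fraction_leaf
   //   i:max_depth  O:init  O:random_state  O:max_features
   //   i:verbose  O:max_leaf_nodes  i:warm_start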
358 
359  PyObject_Print(args, stdout, 0);
360  std::cout << std::endl;
361 
362  PyObject *pDict = PyModule_GetDict(fModule);
363  PyObject *fClassifierClass = PyDict_GetItemString(pDict, "GradientBoostingClassifier");
364 
365  // Create an instance of the class
366  if (PyCallable_Check(fClassifierClass)) {
367  //instance
368  fClassifier = PyObject_CallObject(fClassifierClass , args);
369  PyObject_Print(fClassifier, stdout, 0);
370  std::cout << std::endl;
371 
372  Py_DECREF(poinit);
373  Py_DECREF(porandom_state);
374  Py_DECREF(pomax_features);
375  Py_DECREF(pomax_leaf_nodes);
376  Py_DECREF(args);
377  } else {
378  PyErr_Print();
379  Py_DECREF(poinit);
380  Py_DECREF(porandom_state);
381  Py_DECREF(pomax_features);
382  Py_DECREF(pomax_leaf_nodes);
383  Py_DECREF(args);
384  Py_DECREF(pDict);
385  Py_DECREF(fClassifierClass);
386  Log() << kFATAL << "Can't call function GradientBoostingClassifier" << Endl;
387  Log() << Endl;
388 
389  }
390 
391  fClassifier = PyObject_CallMethod(fClassifier, (char *)"fit", (char *)"(OOO)", fTrainData, fTrainDataClasses, fTrainDataWeights);
392 // PyObject_Print(fClassifier, stdout, 0);
393 // std::cout<<std::endl;
394  // pValue =PyObject_CallObject(fClassifier, PyUnicode_FromString("classes_"));
395  // PyObject_Print(pValue, stdout, 0);
396  if (IsModelPersistence())
397  {
398  TString path = GetWeightFileDir() + "/PyGTBModel.PyData";
399  Log() << Endl;
400  Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
401  Log() << Endl;
402  Serialize(path,fClassifier);
403  }
404 }
405 
//_______________________________________________________________________
void MethodPyGTB::TestClassification()
{
   MethodBase::TestClassification();
}


//_______________________________________________________________________
Double_t MethodPyGTB::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // cannot determine error
   NoErrorCalc(errLower, errUpper);

   Double_t mvaValue;
   const TMVA::Event *e = Data()->GetEvent();
   UInt_t nvars = e->GetNVariables();
   int dims[2];
   dims[0] = 1;
   dims[1] = nvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));

   for (UInt_t i = 0; i < nvars; i++) pValue[i] = e->GetValue(i);

   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   mvaValue = proba[0]; // getting signal probability
   Py_DECREF(result);
   Py_DECREF(pEvent);

   return mvaValue;
}
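
// Application sketch (illustrative): a trained PyGTB model is normally
// evaluated through the standard TMVA::Reader interface; the variable names
// and the weight-file path below are assumptions.
//
//    TMVA::Reader reader;
//    Float_t var1, var2;
//    reader.AddVariable("var1", &var1);
//    reader.AddVariable("var2", &var2);
//    reader.BookMVA("PyGTB", "weights/TMVAClassification_PyGTB.weights.xml");
//    Double_t mva = reader.EvaluateMVA("PyGTB");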


//_______________________________________________________________________
void MethodPyGTB::ReadModelFromFile()
{
   if (!PyIsInitialized()) {
      PyInitialize();
   }

   TString path = GetWeightFileDir() + "/PyGTBModel.PyData";
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
   Log() << Endl;
   UnSerialize(path, &fClassifier);
}

//_______________________________________________________________________
void MethodPyGTB::GetHelpMessage() const
{
   // get help message text
   //
   // typical length of text line:
   //         "|--------------------------------------------------------------|"
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "Decision Trees and Rule-Based Models " << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "<None>" << Endl;
}