ROOT  6.06/09
Reference Guide
Go to the documentation of this file.
1 // @(#)root/tmva/pymva $Id$
2 // Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : MethodPyGTB *
8  * Web : *
9  * *
10  * Description: *
11  * GradientBoostingClassifier classifier from scikit-learn *
12  * *
13  * *
14  * Redistribution and use in source and binary forms, with or without *
15  * modification, are permitted according to the terms listed in LICENSE *
16  * ( *
17  * *
18  **********************************************************************************/
20 #pragma GCC diagnostic ignored "-Wunused-parameter"
21 #include <iomanip>
22 #include <fstream>
24 #include <Python.h>
26 #include <numpy/arrayobject.h>
28 #include "TMath.h"
29 #include "Riostream.h"
30 #include "TMatrix.h"
31 #include "TMatrixD.h"
32 #include "TVectorD.h"
35 #include "TMVA/MethodPyGTB.h"
36 #include "TMVA/Tools.h"
37 #include "TMVA/Ranking.h"
38 #include "TMVA/Types.h"
39 #include "TMVA/Config.h"
40 #include "TMVA/PDF.h"
41 #include "TMVA/ClassifierFactory.h"
43 #include "TMVA/Results.h"
47 using namespace TMVA;
53 //_______________________________________________________________________
54 MethodPyGTB::MethodPyGTB(const TString &jobName,
55  const TString &methodTitle,
56  DataSetInfo &dsi,
57  const TString &theOption,
58  TDirectory *theTargetDir) :
59  PyMethodBase(jobName, Types::kPyGTB, methodTitle, dsi, theOption, theTargetDir),
60  loss("deviance"),
61  learning_rate(0.1),
62  n_estimators(100),
63  subsample(1.0),
64  min_samples_split(2),
65  min_samples_leaf(1),
66  min_weight_fraction_leaf(0.0),
67  max_depth(3),
68  init("None"),
69  random_state("None"),
70  max_features("None"),
71  verbose(0),
72  max_leaf_nodes("None"),
73  warm_start(kFALSE)
74 {
75  // standard constructor for the PyGTB
76  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
78 }
80 //_______________________________________________________________________
81 MethodPyGTB::MethodPyGTB(DataSetInfo &theData, const TString &theWeightFile, TDirectory *theTargetDir)
82  : PyMethodBase(Types::kPyGTB, theData, theWeightFile, theTargetDir),
83  loss("deviance"),
84  learning_rate(0.1),
85  n_estimators(100),
86  subsample(1.0),
87  min_samples_split(2),
88  min_samples_leaf(1),
89  min_weight_fraction_leaf(0.0),
90  max_depth(3),
91  init("None"),
92  random_state("None"),
93  max_features("None"),
94  verbose(0),
95  max_leaf_nodes("None"),
96  warm_start(kFALSE)
97 {
98  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
99 }
102 //_______________________________________________________________________
104 {
105 }
107 //_______________________________________________________________________
109 {
110  if (type == Types::kClassification && numberClasses == 2) return kTRUE;
111  return kFALSE;
112 }
115 //_______________________________________________________________________
117 {
120  DeclareOptionRef(loss, "Loss", "{'deviance', 'exponential'}, optional (default='deviance')\
121  loss function to be optimized. 'deviance' refers to\
122  deviance (= logistic regression) for classification\
123  with probabilistic outputs. For loss 'exponential' gradient\
124  boosting recovers the AdaBoost algorithm.");
126  DeclareOptionRef(learning_rate, "LearningRate", "float, optional (default=0.1)\
127  learning rate shrinks the contribution of each tree by `learning_rate`.\
128  There is a trade-off between learning_rate and n_estimators.");
130  DeclareOptionRef(n_estimators, "NEstimators", "int (default=100)\
131  The number of boosting stages to perform. Gradient boosting\
132  is fairly robust to over-fitting so a large number usually\
133  results in better performance.");
135  DeclareOptionRef(subsample, "Subsample", "float, optional (default=1.0)\
136  The fraction of samples to be used for fitting the individual base\
137  learners. If smaller than 1.0 this results in Stochastic Gradient\
138  Boosting. `subsample` interacts with the parameter `n_estimators`.\
139  Choosing `subsample < 1.0` leads to a reduction of variance\
140  and an increase in bias.");
142  DeclareOptionRef(min_samples_split, "MinSamplesSplit", "integer, optional (default=2)\
143  The minimum number of samples required to split an internal node.");
145  DeclareOptionRef(min_samples_leaf, "MinSamplesLeaf", "integer, optional (default=1) \
146  The minimum number of samples in newly created leaves. A split is \
147  discarded if after the split, one of the leaves would contain less then \
148  ``min_samples_leaf`` samples.");
150  DeclareOptionRef(min_weight_fraction_leaf, "MinWeightFractionLeaf", "//float, optional (default=0.) \
151  The minimum weighted fraction of the input samples required to be at a \
152  leaf node.");
154  DeclareOptionRef(max_depth, "MaxDepth", "integer or None, optional (default=None) \
155  The maximum depth of the tree. If None, then nodes are expanded until \
156  all leaves are pure or until all leaves contain less than \
157  min_samples_split samples. \
158  Ignored if ``max_leaf_nodes`` is not None.");
160  DeclareOptionRef(init, "Init", "BaseEstimator, None, optional (default=None)\
161  An estimator object that is used to compute the initial\
162  predictions. ``init`` has to provide ``fit`` and ``predict``.\
163  If None it uses ``loss.init_estimator`");
165  DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\
166  If int, random_state is the seed used by the random number generator;\
167  If RandomState instance, random_state is the random number generator;\
168  If None, the random number generator is the RandomState instance used\
169  by `np.random`.");
170  DeclareOptionRef(max_features, "MaxFeatures", "The number of features to consider when looking for the best split");
171  DeclareOptionRef(verbose, "Verbose", "int, optional (default=0)\
172  Controls the verbosity of the tree building process.");
173  DeclareOptionRef(max_leaf_nodes, "MaxLeafNodes", "int or None, optional (default=None)\
174  Grow trees with ``max_leaf_nodes`` in best-first fashion.\
175  Best nodes are defined as relative reduction in impurity.\
176  If None then unlimited number of leaf nodes.\
177  If not None then ``max_depth`` will be ignored.");
178  DeclareOptionRef(warm_start, "WarmStart", "bool, optional (default=False)\
179  When set to ``True``, reuse the solution of the previous call to fit\
180  and add more estimators to the ensemble, otherwise, just fit a whole\
181  new forest.");
186 }
188 //_______________________________________________________________________
190 {
191  if (loss != "deviance" && loss != "exponential") {
192  Log() << kFATAL << Form(" Loss = %s... that does not work !! ", loss.Data())
193  << " The options are deviance of exponential."
194  << Endl;
195  }
197  if (learning_rate <= 0) {
198  Log() << kERROR << " LearningRate <=0... that does not work !! "
199  << " I set it to 0.1 .. just so that the program does not crash"
200  << Endl;
201  learning_rate = 0.1;
202  }
203  if (n_estimators <= 0) {
204  Log() << kERROR << " NEstimators <=0... that does not work !! "
205  << " I set it to 100 .. just so that the program does not crash"
206  << Endl;
207  n_estimators = 100;
208  }
209  if (min_samples_split < 0) {
210  Log() << kERROR << " MinSamplesSplit <0... that does not work !! "
211  << " I set it to 2 .. just so that the program does not crash"
212  << Endl;
213  min_samples_split = 2;
214  }
215  if (subsample < 0) {
216  Log() << kERROR << " Subsample <0... that does not work !! "
217  << " I set it to 1.0 .. just so that the program does not crash"
218  << Endl;
219  subsample = 1.0;
220  }
222  if (min_samples_leaf < 0) {
223  Log() << kERROR << " MinSamplesLeaf <0... that does not work !! "
224  << " I set it to 1.0 .. just so that the program does not crash"
225  << Endl;
226  min_samples_leaf = 1;
227  }
229  if (min_samples_leaf < 0) {
230  Log() << kERROR << " MinSamplesLeaf <0... that does not work !! "
231  << " I set it to 1.0 .. just so that the program does not crash"
232  << Endl;
233  min_samples_leaf = 1;
234  }
236  if (min_weight_fraction_leaf < 0) {
237  Log() << kERROR << " MinWeightFractionLeaf <0... that does not work !! "
238  << " I set it to 0.0 .. just so that the program does not crash"
239  << Endl;
241  }
243  if (max_depth < 0) {
244  Log() << kERROR << " MaxDepth <0... that does not work !! "
245  << " I set it to 3 .. just so that the program does not crash"
246  << Endl;
247  max_depth = 3;
248  }
250  PyObject *poinit = Eval(init);
251  if (!poinit) {
252  Log() << kFATAL << Form(" Init = %s... that does not work !! ", init.Data())
253  << " The options are None or BaseEstimator. An estimator object that is used to compute the initial"
254  << " predictions. ``init`` has to provide ``fit`` and ``predict``."
255  << " If None it uses ``loss.init_estimator``."
256  << Endl;
257  }
258  Py_DECREF(poinit);
260  PyObject *porandom_state = Eval(random_state);
261  if (!porandom_state) {
262  Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", random_state.Data())
263  << "If int, random_state is the seed used by the random number generator;"
264  << "If RandomState instance, random_state is the random number generator;"
265  << "If None, the random number generator is the RandomState instance used by `np.random`."
266  << Endl;
267  }
268  Py_DECREF(porandom_state);
270  if (max_features == "auto" || max_features == "sqrt" || max_features == "log2")max_features = Form("'%s'", max_features.Data());
271  PyObject *pomax_features = Eval(max_features);
272  if (!pomax_features) {
273  Log() << kFATAL << Form(" MaxFeatures = %s... that does not work !! ", max_features.Data())
274  << "int, float, string or None, optional (default='auto')"
275  << "The number of features to consider when looking for the best split:"
276  << "If int, then consider `max_features` features at each split."
277  << "If float, then `max_features` is a percentage and"
278  << "`int(max_features * n_features)` features are considered at each split."
279  << "If 'auto', then `max_features=sqrt(n_features)`."
280  << "If 'sqrt', then `max_features=sqrt(n_features)`."
281  << "If 'log2', then `max_features=log2(n_features)`."
282  << "If None, then `max_features=n_features`."
283  << Endl;
284  }
285  Py_DECREF(pomax_features);
287 // verbose(0),
288  PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
289  if (!pomax_leaf_nodes) {
290  Log() << kFATAL << Form(" MaxLeafNodes = %s... that does not work !! ", max_leaf_nodes.Data())
291  << " The options are None or integer."
292  << Endl;
293  }
294  Py_DECREF(pomax_leaf_nodes);
296 }
299 //_______________________________________________________________________
301 {
302  ProcessOptions();
303  _import_array();//require to use numpy arrays
305  //Import sklearn
306  // Convert the file name to a Python string.
307  PyObject *pName = PyUnicode_FromString("sklearn.ensemble");
308  // Import the file as a Python module.
309  fModule = PyImport_Import(pName);
310  Py_DECREF(pName);
312  if (!fModule) {
313  Log() << kFATAL << "Can't import sklearn.ensemble" << Endl;
314  Log() << Endl;
315  }
318  //Training data
319  UInt_t fNvars = Data()->GetNVariables();
320  int fNrowsTraining = Data()->GetNTrainingEvents(); //every row is an event, a class type and a weight
321  int *dims = new int[2];
322  dims[0] = fNrowsTraining;
323  dims[1] = fNvars;
324  fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
325  float *TrainData = (float *)(PyArray_DATA(fTrainData));
328  fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
329  float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));
331  fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
332  float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));
334  for (int i = 0; i < fNrowsTraining; i++) {
335  const TMVA::Event *e = Data()->GetTrainingEvent(i);
336  for (UInt_t j = 0; j < fNvars; j++) {
337  TrainData[j + i * fNvars] = e->GetValue(j);
338  }
339  if (e->GetClass() == TMVA::Types::kSignal) TrainDataClasses[i] = TMVA::Types::kSignal;
340  else TrainDataClasses[i] = TMVA::Types::kBackground;
342  TrainDataWeights[i] = e->GetWeight();
343  }
344 }
347 {
348 // loss("deviance"),
349 // learning_rate(0.1),
350 // n_estimators(100),
351 // subsample(1.0),
352 // min_samples_split(2),
353 // min_samples_leaf(1),
354 // min_weight_fraction_leaf(0.0),
355 // max_depth(3),
356 // init("None"),
357 // random_state("None"),
358 // max_features("None"),
359 // verbose(0),
360 // max_leaf_nodes("None"),
361 // warm_start(kFALSE)
363  //NOTE: max_features must have 3 defferents variables int, float and string
364  //search a solution with PyObject
365  PyObject *poinit = Eval(init);
366  PyObject *porandom_state = Eval(random_state);
367  PyObject *pomax_features = Eval(max_features);
368  PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
370  PyObject *args = Py_BuildValue("(sfifiifiOOOiOi)", loss.Data(), \
372  max_depth, poinit, porandom_state, pomax_features, verbose, pomax_leaf_nodes, warm_start);
374  PyObject_Print(args, stdout, 0);
375  std::cout << std::endl;
377  PyObject *pDict = PyModule_GetDict(fModule);
378  PyObject *fClassifierClass = PyDict_GetItemString(pDict, "GradientBoostingClassifier");
380  // Create an instance of the class
381  if (PyCallable_Check(fClassifierClass)) {
382  //instance
383  fClassifier = PyObject_CallObject(fClassifierClass , args);
384  PyObject_Print(fClassifier, stdout, 0);
385  std::cout << std::endl;
387  Py_DECREF(poinit);
388  Py_DECREF(porandom_state);
389  Py_DECREF(pomax_features);
390  Py_DECREF(pomax_leaf_nodes);
391  Py_DECREF(args);
392  } else {
393  PyErr_Print();
394  Py_DECREF(poinit);
395  Py_DECREF(porandom_state);
396  Py_DECREF(pomax_features);
397  Py_DECREF(pomax_leaf_nodes);
398  Py_DECREF(args);
399  Py_DECREF(pDict);
400  Py_DECREF(fClassifierClass);
401  Log() << kFATAL << "Can't call function GradientBoostingClassifier" << Endl;
402  Log() << Endl;
404  }
406  fClassifier = PyObject_CallMethod(fClassifier, (char *)"fit", (char *)"(OOO)", fTrainData, fTrainDataClasses, fTrainDataWeights);
407 // PyObject_Print(fClassifier, stdout, 0);
408 // std::cout<<std::endl;
409  // pValue =PyObject_CallObject(fClassifier, PyUnicode_FromString("classes_"));
410  // PyObject_Print(pValue, stdout, 0);
412  TString path = GetWeightFileDir() + "/PyGTBModel.PyData";
413  Log() << Endl;
414  Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
415  Log() << Endl;
417  Serialize(path,fClassifier);
418 }
420 //_______________________________________________________________________
422 {
424 }
427 //_______________________________________________________________________
429 {
430  // cannot determine error
431  NoErrorCalc(errLower, errUpper);
435  Double_t mvaValue;
436  const TMVA::Event *e = Data()->GetEvent();
437  UInt_t nvars = e->GetNVariables();
438  int *dims = new int[2];
439  dims[0] = 1;
440  dims[1] = nvars;
441  PyArrayObject *pEvent= (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
442  float *pValue = (float *)(PyArray_DATA(pEvent));
444  for (UInt_t i = 0; i < nvars; i++) pValue[i] = e->GetValue(i);
446  PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
447  double *proba = (double *)(PyArray_DATA(result));
448  mvaValue = proba[0]; //getting signal prob
449  Py_DECREF(result);
450  Py_DECREF(pEvent);
451  delete dims;
452  return mvaValue;
453 }
455 //_______________________________________________________________________
457 {
458  if (!PyIsInitialized()) {
459  PyInitialize();
460  }
462  TString path = GetWeightFileDir() + "/PyGTBModel.PyData";
463  Log() << Endl;
464  Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
465  Log() << Endl;
466  UnSerialize(path,&fClassifier);
467 }
469 //_______________________________________________________________________
471 {
472  // get help message text
473  //
474  // typical length of text line:
475  // "|--------------------------------------------------------------|"
476  Log() << Endl;
477  Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
478  Log() << Endl;
479  Log() << "Decision Trees and Rule-Based Models " << Endl;
480  Log() << Endl;
481  Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
482  Log() << Endl;
483  Log() << Endl;
484  Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
485  Log() << Endl;
486  Log() << "<None>" << Endl;
487 }
const TString & GetWeightFileDir() const
Definition: MethodBase.h:407
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
PyObject * fClassifier
Definition: PyMethodBase.h:114
Double_t GetMvaValue(Double_t *errLower=0, Double_t *errUpper=0)
const Event * GetTrainingEvent(Long64_t ievt) const
Definition: DataSet.h:96
Config & gConfig()
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
DataSet * Data() const
Definition: MethodBase.h:363
Definition: Types.h:124
Basic string class.
Definition: TString.h:137
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
ClassImp(MethodPyGTB) MethodPyGTB
Definition: MethodPyGTB.cxx:51
Double_t min_weight_fraction_leaf
Definition: MethodPyGTB.h:105
static void Serialize(TString file, PyObject *classifier)
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:376
PyArrayObject * fTrainDataClasses
Definition: PyMethodBase.h:118
static int PyIsInitialized()
static void PyInitialize()
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
Definition: Event.cxx:231
TString max_leaf_nodes
Definition: MethodPyGTB.h:137
const char * Data() const
Definition: TString.h:349
Tools & gTools()
Definition: Tools.cxx:79
Double_t subsample
Definition: MethodPyGTB.h:95
static PyObject * Eval(TString code)
PyArrayObject * fTrainDataWeights
Definition: PyMethodBase.h:117
void GetHelpMessage() const
UInt_t GetNVariables() const
accessor to the number of variables
Definition: Event.cxx:303
#define None
Definition: TGWin32.h:68
PyObject * fModule
Definition: PyMethodBase.h:113
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
unsigned int UInt_t
Definition: RtypesCore.h:42
bool verbose
char * Form(const char *fmt,...)
PyArrayObject * fTrainData
Definition: PyMethodBase.h:116
TString random_state
Definition: MethodPyGTB.h:118
const Event * GetEvent() const
Definition: DataSet.cxx:180
MethodPyGTB(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="", TDirectory *theTargetDir=NULL)
static Int_t init()
double Double_t
Definition: RtypesCore.h:55
Describe directory structure in memory.
Definition: TDirectory.h:41
int type
Definition: TGX11.cxx:120
Double_t learning_rate
Definition: MethodPyGTB.h:89
MsgLogger & Log() const
Definition: Configurable.h:130
UInt_t GetNVariables() const
access the number of variables through the datasetinfo
Definition: DataSet.cxx:194
TString max_features
Definition: MethodPyGTB.h:123
UInt_t GetClass() const
Definition: Event.h:86
const TString & Color(const TString &)
human readable color strings
Definition: Tools.cxx:837
for example
Abstract ClassifierFactory template that handles arbitrary types.
virtual void DeclareCompatibilityOptions()
options that are used ONLY for the READER to ensure backward compatibility they are hence without any...
Definition: MethodBase.cxx:599
void SetWeightFileDir(TString fileDir)
set directory of weight file
Long64_t GetNTrainingEvents() const
Definition: DataSet.h:90
double result[121]
static void UnSerialize(TString file, PyObject **obj)
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual void TestClassification()
virtual void TestClassification()
_object PyObject
Definition: TPyArg.h:22
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
Definition: MethodBase.cxx:820