Logo ROOT   6.07/09
Reference Guide
MethodPyRandomForest.cxx
Go to the documentation of this file.
1 // @(#)root/tmva/pymva $Id$
2 // Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : MethodPyRandomForest *
8  * Web : http://oproject.org *
9  * *
10  * Description: *
 11  * Random Forest Classifier from Scikit learn *
12  * *
13  * *
14  * Redistribution and use in source and binary forms, with or without *
15  * modification, are permitted according to the terms listed in LICENSE *
16  * (http://tmva.sourceforge.net/LICENSE) *
17  * *
18  **********************************************************************************/
19 #include <Python.h> // Needs to be included first to avoid redefinition of _POSIX_C_SOURCE
21 
22 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
23 #include <numpy/arrayobject.h>
24 
25 #pragma GCC diagnostic ignored "-Wunused-parameter"
26 
27 #include "TMVA/Configurable.h"
28 #include "TMVA/ClassifierFactory.h"
29 #include "TMVA/Config.h"
30 #include "TMVA/DataSet.h"
31 #include "TMVA/Event.h"
32 #include "TMVA/IMethod.h"
33 #include "TMVA/MsgLogger.h"
34 #include "TMVA/PDF.h"
35 #include "TMVA/Ranking.h"
36 #include "TMVA/Results.h"
37 #include "TMVA/Tools.h"
38 #include "TMVA/Types.h"
40 
41 #include "Riostream.h"
42 #include "TMath.h"
43 #include "TMatrix.h"
44 #include "TMatrixD.h"
45 #include "TVectorD.h"
46 
47 #include <iomanip>
48 #include <fstream>
49 
using namespace TMVA;

// Self-register this method with the TMVA classifier factory under "PyRandomForest".
REGISTER_METHOD(PyRandomForest)

ClassImp(MethodPyRandomForest)
55 
//_______________________________________________________________________
/// Standard constructor (training): registers the method and seeds every
/// scikit-learn hyper-parameter with the RandomForestClassifier default.
/// String-valued options hold Python expressions evaluated later by Eval().
MethodPyRandomForest::MethodPyRandomForest(const TString &jobName,
      const TString &methodTitle,
      DataSetInfo &dsi,
      const TString &theOption) :
   PyMethodBase(jobName, Types::kPyRandomForest, methodTitle, dsi, theOption),
   n_estimators(10),            // number of trees in the forest
   criterion("gini"),           // split-quality measure: "gini" or "entropy"
   max_depth("None"),           // Python expression: int or None
   min_samples_split(2),
   min_samples_leaf(1),
   min_weight_fraction_leaf(0),
   max_features("'auto'"),      // quoted so Eval() yields a Python string
   max_leaf_nodes("None"),
   bootstrap(kTRUE),
   oob_score(kFALSE),
   n_jobs(1),                   // -1 means "use all cores"
   random_state("None"),
   verbose(0),
   warm_start(kFALSE),
   class_weight("None")
{
}
79 
//_______________________________________________________________________
/// Constructor used when instantiating the method from a weight file;
/// defaults mirror the training constructor and are overwritten on read-back.
MethodPyRandomForest::MethodPyRandomForest(DataSetInfo &theData, const TString &theWeightFile)
   : PyMethodBase(Types::kPyRandomForest, theData, theWeightFile),
   n_estimators(10),            // number of trees in the forest
   criterion("gini"),           // split-quality measure: "gini" or "entropy"
   max_depth("None"),           // Python expression: int or None
   min_samples_split(2),
   min_samples_leaf(1),
   min_weight_fraction_leaf(0),
   max_features("'auto'"),      // quoted so Eval() yields a Python string
   max_leaf_nodes("None"),
   bootstrap(kTRUE),
   oob_score(kFALSE),
   n_jobs(1),                   // -1 means "use all cores"
   random_state("None"),
   verbose(0),
   warm_start(kFALSE),
   class_weight("None")
{
}
100 
101 
//_______________________________________________________________________
/// Destructor. The Python objects owned by this method (fClassifier, fModule,
/// training arrays) are managed by the PyMethodBase base class.
MethodPyRandomForest::~MethodPyRandomForest(void)
{
}
106 
107 //_______________________________________________________________________
109 {
110  if (type == Types::kClassification && numberClasses == 2) return kTRUE;
111  return kFALSE;
112 }
113 
114 
115 //_______________________________________________________________________
117 {
119 
120  DeclareOptionRef(n_estimators, "NEstimators", "Integer, optional (default=10). The number of trees in the forest.");
121  DeclareOptionRef(criterion, "Criterion", "//string, optional (default='gini') \
122  The function to measure the quality of a split. Supported criteria are \
123  'gini' for the Gini impurity and 'entropy' for the information gain. \
124  Note: this parameter is tree-specific.");
125 
126  DeclareOptionRef(max_depth, "MaxDepth", "integer or None, optional (default=None) \
127  The maximum depth of the tree. If None, then nodes are expanded until \
128  all leaves are pure or until all leaves contain less than \
129  min_samples_split samples. \
130  Ignored if ``max_leaf_nodes`` is not None.");
131  DeclareOptionRef(min_samples_split, "MinSamplesSplit", "integer, optional (default=2)\
132  The minimum number of samples required to split an internal node.");
133 
134  DeclareOptionRef(min_samples_leaf, "MinSamplesLeaf", "integer, optional (default=1) \
135  The minimum number of samples in newly created leaves. A split is \
136  discarded if after the split, one of the leaves would contain less then \
137  ``min_samples_leaf`` samples.");
138  DeclareOptionRef(min_weight_fraction_leaf, "MinWeightFractionLeaf", "//float, optional (default=0.) \
139  The minimum weighted fraction of the input samples required to be at a \
140  leaf node.");
141  DeclareOptionRef(max_features, "MaxFeatures", "The number of features to consider when looking for the best split");
142  DeclareOptionRef(max_leaf_nodes, "MaxLeafNodes", "int or None, optional (default=None)\
143  Grow trees with ``max_leaf_nodes`` in best-first fashion.\
144  Best nodes are defined as relative reduction in impurity.\
145  If None then unlimited number of leaf nodes.\
146  If not None then ``max_depth`` will be ignored.");
147  DeclareOptionRef(bootstrap, "Bootstrap", "boolean, optional (default=True) \
148  Whether bootstrap samples are used when building trees.");
149  DeclareOptionRef(oob_score, "OoBScore", " bool Whether to use out-of-bag samples to estimate\
150  the generalization error.");
151  DeclareOptionRef(n_jobs, "NJobs", " integer, optional (default=1) \
152  The number of jobs to run in parallel for both `fit` and `predict`. \
153  If -1, then the number of jobs is set to the number of cores.");
154 
155  DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\
156  If int, random_state is the seed used by the random number generator;\
157  If RandomState instance, random_state is the random number generator;\
158  If None, the random number generator is the RandomState instance used\
159  by `np.random`.");
160  DeclareOptionRef(verbose, "Verbose", "int, optional (default=0)\
161  Controls the verbosity of the tree building process.");
162  DeclareOptionRef(warm_start, "WarmStart", "bool, optional (default=False)\
163  When set to ``True``, reuse the solution of the previous call to fit\
164  and add more estimators to the ensemble, otherwise, just fit a whole\
165  new forest.");
166  DeclareOptionRef(class_weight, "ClassWeight", "dict, list of dicts, \"auto\", \"subsample\" or None, optional\
167  Weights associated with classes in the form ``{class_label: weight}``.\
168  If not given, all classes are supposed to have weight one. For\
169  multi-output problems, a list of dicts can be provided in the same\
170  order as the columns of y.\
171  The \"auto\" mode uses the values of y to automatically adjust\
172  weights inversely proportional to class frequencies in the input data.\
173  The \"subsample\" mode is the same as \"auto\" except that weights are\
174  computed based on the bootstrap sample for every tree grown.\
175  For multi-output, the weights of each column of y will be multiplied.\
176  Note that these weights will be multiplied with sample_weight (passed\
177  through the fit method) if sample_weight is specified.");
178 }
179 
180 //_______________________________________________________________________
182 {
183  if (n_estimators <= 0) {
184  Log() << kERROR << " NEstimators <=0... that does not work !! "
185  << " I set it to 10 .. just so that the program does not crash"
186  << Endl;
187  n_estimators = 10;
188  }
189  if (criterion != "gini" && criterion != "entropy") {
190  Log() << kFATAL << Form(" Criterion = %s... that does not work !! ", criterion.Data())
191  << " The options are gini of entropy."
192  << Endl;
193  }
194  PyObject *pomax_depth = Eval(max_depth);
195  if (!pomax_depth) {
196  Log() << kFATAL << Form(" MaxDepth = %s... that does not work !! ", criterion.Data())
197  << " The options are None or integer."
198  << Endl;
199  }
200  Py_DECREF(pomax_depth);
201 
202  if (min_samples_split < 0) {
203  Log() << kERROR << " MinSamplesSplit < 0... that does not work !! "
204  << " I set it to 2 .. just so that the program does not crash"
205  << Endl;
206  min_samples_split = 2;
207  }
208  if (min_samples_leaf < 0) {
209  Log() << kERROR << " MinSamplesLeaf < 0... that does not work !! "
210  << " I set it to 1 .. just so that the program does not crash"
211  << Endl;
212  min_samples_leaf = 1;
213  }
214 
215  if (min_weight_fraction_leaf < 0) {
216  Log() << kERROR << " MinWeightFractionLeaf < 0... that does not work !! "
217  << " I set it to 0 .. just so that the program does not crash"
218  << Endl;
220  }
221  if (max_features == "auto" || max_features == "sqrt" || max_features == "log2")max_features = Form("'%s'", max_features.Data());
222  PyObject *pomax_features = Eval(max_features);
223  if (!pomax_features) {
224  Log() << kFATAL << Form(" MaxFeatures = %s... that does not work !! ", max_features.Data())
225  << "int, float, string or None, optional (default='auto')"
226  << "The number of features to consider when looking for the best split:"
227  << "If int, then consider `max_features` features at each split."
228  << "If float, then `max_features` is a percentage and"
229  << "`int(max_features * n_features)` features are considered at each split."
230  << "If 'auto', then `max_features=sqrt(n_features)`."
231  << "If 'sqrt', then `max_features=sqrt(n_features)`."
232  << "If 'log2', then `max_features=log2(n_features)`."
233  << "If None, then `max_features=n_features`."
234  << Endl;
235  }
236  Py_DECREF(pomax_features);
237 
238  PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
239  if (!pomax_leaf_nodes) {
240  Log() << kFATAL << Form(" MaxLeafNodes = %s... that does not work !! ", max_leaf_nodes.Data())
241  << " The options are None or integer."
242  << Endl;
243  }
244  Py_DECREF(pomax_leaf_nodes);
245 
246 // bootstrap(kTRUE),
247 // oob_score(kFALSE),
248 // n_jobs(1),
249 
250  PyObject *porandom_state = Eval(random_state);
251  if (!porandom_state) {
252  Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", random_state.Data())
253  << "If int, random_state is the seed used by the random number generator;"
254  << "If RandomState instance, random_state is the random number generator;"
255  << "If None, the random number generator is the RandomState instance used by `np.random`."
256  << Endl;
257  }
258  Py_DECREF(porandom_state);
259 
260 // verbose(0),
261 // warm_start(kFALSE),
262 // class_weight("None")
263  PyObject *poclass_weight = Eval(class_weight);
264  if (!poclass_weight) {
265  Log() << kFATAL << Form(" ClassWeight = %s... that does not work !! ", class_weight.Data())
266  << "dict, list of dicts, 'auto', 'subsample' or None, optional"
267  << Endl;
268  }
269  Py_DECREF(poclass_weight);
270 
271 }
272 
//_______________________________________________________________________
/// One-time initialization: process the options, import sklearn.ensemble into
/// fModule, and copy the TMVA training events into three flat numpy arrays
/// (variable values, class labels, event weights) kept as data members.
void MethodPyRandomForest::Init()
{
   ProcessOptions();
   _import_array();//require to use numpy arrays

   //Import sklearn
   // Convert the file name to a Python string.
   PyObject *pName = PyUnicode_FromString("sklearn.ensemble");
   // Import the file as a Python module.
   fModule = PyImport_Import(pName);
   Py_DECREF(pName);

   if (!fModule) {
      // kFATAL aborts the job: nothing below runs without sklearn
      Log() << kFATAL << "Can't import sklearn.ensemble" << Endl;
      Log() << Endl;
   }


   //Training data
   UInt_t fNvars = Data()->GetNVariables();
   int fNrowsTraining = Data()->GetNTrainingEvents(); //every row is an event, a class type and a weight
   int dims[2];
   dims[0] = fNrowsTraining;
   dims[1] = fNvars;
   // NOTE(review): PyArray_FromDims is deprecated numpy C-API (int dims);
   // newer code would use PyArray_SimpleNew with npy_intp — kept as-is here.
   fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));


   fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

   for (int i = 0; i < fNrowsTraining; i++) {
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      // row-major copy: row i holds the fNvars input variables of event i
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }
      // class label stored as the numeric value of the TMVA class enum
      if (e->GetClass() == TMVA::Types::kSignal) TrainDataClasses[i] = TMVA::Types::kSignal;
      else TrainDataClasses[i] = TMVA::Types::kBackground;

      TrainDataWeights[i] = e->GetWeight();
   }

}
320 
321 //_______________________________________________________________________
323 {
324 
325  //NOTE: max_features must have 3 defferents variables int, float and string
326  if (max_features == "auto" || max_features == "sqrt" || max_features == "log2")max_features = Form("'%s'", max_features.Data());
327  PyObject *pomax_features = Eval(max_features);
328  PyObject *pomax_depth = Eval(max_depth);
329  PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
330  PyObject *porandom_state = Eval(random_state);
331  PyObject *poclass_weight = Eval(class_weight);
332 
333  PyObject *args = Py_BuildValue("(isOiifOOiiiOiiO)", n_estimators, criterion.Data(), pomax_depth, min_samples_split, \
334  min_samples_leaf, min_weight_fraction_leaf, pomax_features, pomax_leaf_nodes, \
335  bootstrap, oob_score, n_jobs, porandom_state, verbose, warm_start, poclass_weight);
336  Py_DECREF(pomax_depth);
337  PyObject_Print(args, stdout, 0);
338  std::cout << std::endl;
339 
340  PyObject *pDict = PyModule_GetDict(fModule);
341  PyObject *fClassifierClass = PyDict_GetItemString(pDict, "RandomForestClassifier");
342  // Log() << kFATAL <<"Train =" <<n_jobs<<Endl;
343 
344  // Create an instance of the class
345  if (PyCallable_Check(fClassifierClass)) {
346  //instance
347  fClassifier = PyObject_CallObject(fClassifierClass , args);
348  PyObject_Print(fClassifier, stdout, 0);
349 
350  Py_DECREF(args);
351  } else {
352  PyErr_Print();
353  Py_DECREF(pDict);
354  Py_DECREF(fClassifierClass);
355  Log() << kFATAL << "Can't call function RandomForestClassifier" << Endl;
356  Log() << Endl;
357 
358  }
359 
360  fClassifier = PyObject_CallMethod(fClassifier, const_cast<char *>("fit"), const_cast<char *>("(OOO)"), fTrainData, fTrainDataClasses, fTrainDataWeights);
361 
362  if(!fClassifier)
363  {
364  Log() << kFATAL << "Can't create classifier object from RandomForestClassifier" << Endl;
365  Log() << Endl;
366  }
367  if (IsModelPersistence())
368  {
369  TString path = GetWeightFileDir() + "/PyRFModel.PyData";
370  Log() << Endl;
371  Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
372  Log() << Endl;
373  Serialize(path,fClassifier);
374  }
375 }
376 
//_______________________________________________________________________
/// Run the standard classification tests; delegates entirely to MethodBase.
void MethodPyRandomForest::TestClassification()
{
   MethodBase::TestClassification();
}
382 
383 
384 //_______________________________________________________________________
386 {
387  // cannot determine error
388  NoErrorCalc(errLower, errUpper);
389 
391 
392  Double_t mvaValue;
393  const TMVA::Event *e = Data()->GetEvent();
394  UInt_t nvars = e->GetNVariables();
395  int dims[2];
396  dims[0] = 1;
397  dims[1] = nvars;
398  PyArrayObject *pEvent= (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
399  float *pValue = (float *)(PyArray_DATA(pEvent));
400 
401  for (UInt_t i = 0; i < nvars; i++) pValue[i] = e->GetValue(i);
402 
403  PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
404  double *proba = (double *)(PyArray_DATA(result));
405  mvaValue = proba[0]; //getting signal prob
406  Py_DECREF(result);
407  Py_DECREF(pEvent);
408  return mvaValue;
409 }
410 
411 //_______________________________________________________________________
413 {
414  if (!PyIsInitialized()) {
415  PyInitialize();
416  }
417 
418  TString path = GetWeightFileDir() + "/PyRFModel.PyData";
419  Log() << Endl;
420  Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
421  Log() << Endl;
422  UnSerialize(path,&fClassifier);
423  if(!fClassifier)
424  {
425  Log() << kFATAL << "Can't load RandomForestClassifier from Serialized data." << Endl;
426  Log() << Endl;
427  }
428 }
429 
//_______________________________________________________________________
/// Print a short user-level help text for this method to the logger.
void MethodPyRandomForest::GetHelpMessage() const
{
   // get help message text
   //
   // typical length of text line:
   // "|--------------------------------------------------------------|"
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "Decision Trees and Rule-Based Models " << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "<None>" << Endl;
}
449 
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
const TString & GetWeightFileDir() const
Definition: MethodBase.h:486
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
#define REGISTER_METHOD(CLASS)
for example
PyObject * fClassifier
Definition: PyMethodBase.h:121
const Event * GetTrainingEvent(Long64_t ievt) const
Definition: DataSet.h:99
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
DataSet * Data() const
Definition: MethodBase.h:405
EAnalysisType
Definition: Types.h:128
Basic string class.
Definition: TString.h:137
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
static void Serialize(TString file, PyObject *classifier)
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:378
PyArrayObject * fTrainDataClasses
Definition: PyMethodBase.h:125
static int PyIsInitialized()
static void PyInitialize()
Float_t GetValue(UInt_t ivar) const
return value of i&#39;th variable
Definition: Event.cxx:233
const char * Data() const
Definition: TString.h:349
Tools & gTools()
Definition: Tools.cxx:79
static PyObject * Eval(TString code)
PyArrayObject * fTrainDataWeights
Definition: PyMethodBase.h:124
Double_t GetMvaValue(Double_t *errLower=0, Double_t *errUpper=0)
UInt_t GetNVariables() const
accessor to the number of variables
Definition: Event.cxx:305
#define None
Definition: TGWin32.h:59
PyObject * fModule
Definition: PyMethodBase.h:120
MethodPyRandomForest(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
unsigned int UInt_t
Definition: RtypesCore.h:42
bool verbose
char * Form(const char *fmt,...)
PyArrayObject * fTrainData
Definition: PyMethodBase.h:123
const Event * GetEvent() const
Definition: DataSet.cxx:211
#define ClassImp(name)
Definition: Rtypes.h:279
double Double_t
Definition: RtypesCore.h:55
int type
Definition: TGX11.cxx:120
MsgLogger & Log() const
Definition: Configurable.h:128
UInt_t GetNVariables() const
access the number of variables through the datasetinfo
Definition: DataSet.cxx:225
you should not use this method at all Int_t Int_t Double_t Double_t Double_t e
Definition: TRolke.cxx:630
UInt_t GetClass() const
Definition: Event.h:89
const TString & Color(const TString &)
human readable color strings
Definition: Tools.cxx:837
Abstract ClassifierFactory template that handles arbitrary types.
virtual void DeclareCompatibilityOptions()
options that are used ONLY for the READER to ensure backward compatibility they are hence without any...
Definition: MethodBase.cxx:590
virtual void TestClassification()
initialization
Long64_t GetNTrainingEvents() const
Definition: DataSet.h:93
double result[121]
static void UnSerialize(TString file, PyObject **obj)
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual void TestClassification()
initialization
_object PyObject
Definition: TPyArg.h:22
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
Definition: MethodBase.cxx:819
Bool_t IsModelPersistence()
Definition: MethodBase.h:379