ROOT  6.06/09
Reference Guide
MethodPyRandomForest.cxx
Go to the documentation of this file.
1 // @(#)root/tmva/pymva $Id$
2 // Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : MethodPyRandomForest *
8  * Web : http://oproject.org *
9  * *
10  * Description: *
11  * Random Forest Classifier from scikit-learn *
12  * *
13  * *
14  * Redistribution and use in source and binary forms, with or without *
15  * modification, are permitted according to the terms listed in LICENSE *
16  * (http://tmva.sourceforge.net/LICENSE) *
17  * *
18  **********************************************************************************/
19 #pragma GCC diagnostic ignored "-Wunused-parameter"
20 #include <iomanip>
21 #include <fstream>
22 
23 #include <Python.h>
24 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
25 #include <numpy/arrayobject.h>
26 
27 #include "TMath.h"
28 #include "Riostream.h"
29 #include "TMatrix.h"
30 #include "TMatrixD.h"
31 #include "TVectorD.h"
32 
35 #include "TMVA/Tools.h"
36 #include "TMVA/Ranking.h"
37 #include "TMVA/Types.h"
38 #include "TMVA/Config.h"
39 #include "TMVA/PDF.h"
40 #include "TMVA/ClassifierFactory.h"
41 
42 #include "TMVA/Results.h"
43 
44 
45 
46 using namespace TMVA;
47 
// Registers PyRandomForest with TMVA's ClassifierFactory so the method
// can be booked by name (macro declared in TMVA/ClassifierFactory.h).
48 REGISTER_METHOD(PyRandomForest)
49 
51 
52 //_______________________________________________________________________
54  const TString &methodTitle,
55  DataSetInfo &dsi,
56  const TString &theOption,
57  TDirectory *theTargetDir) :
58  PyMethodBase(jobName, Types::kPyRandomForest, methodTitle, dsi, theOption, theTargetDir),
59  n_estimators(10),
60  criterion("gini"),
61  max_depth("None"),
62  min_samples_split(2),
63  min_samples_leaf(1),
64  min_weight_fraction_leaf(0),
65  max_features("'auto'"),
66  max_leaf_nodes("None"),
67  bootstrap(kTRUE),
68  oob_score(kFALSE),
69  n_jobs(1),
70  random_state("None"),
71  verbose(0),
72  warm_start(kFALSE),
73  class_weight("None")
74 {
75  // standard constructor for the PyRandomForest
76  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
77 
78 }
79 
80 //_______________________________________________________________________
// Constructor used when instantiating the method from a weight file
// (no job name / option string). The member initializers below mirror
// scikit-learn's RandomForestClassifier defaults, matching the defaults
// documented for the corresponding DeclareOptionRef options
// (e.g. NEstimators default=10, Criterion default='gini').
// Note: string-typed options ("None", "'auto'") are later passed to
// PyMethodBase::Eval() as Python expressions, hence the quoting.
81 MethodPyRandomForest::MethodPyRandomForest(DataSetInfo &theData, const TString &theWeightFile, TDirectory *theTargetDir)
82  : PyMethodBase(Types::kPyRandomForest, theData, theWeightFile, theTargetDir),
83  n_estimators(10),
84  criterion("gini"),
85  max_depth("None"),
86  min_samples_split(2),
87  min_samples_leaf(1),
88  min_weight_fraction_leaf(0),
89  max_features("'auto'"),
90  max_leaf_nodes("None"),
91  bootstrap(kTRUE),
92  oob_score(kFALSE),
93  n_jobs(1),
94  random_state("None"),
95  verbose(0),
96  warm_start(kFALSE),
97  class_weight("None")
98 {
   // Place weight files in the directory configured globally for TMVA I/O,
   // same as the primary constructor does.
99  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
100 }
101 
102 
103 //_______________________________________________________________________
105 {
106 }
107 
108 //_______________________________________________________________________
110 {
111  if (type == Types::kClassification && numberClasses == 2) return kTRUE;
112  return kFALSE;
113 }
114 
115 
116 //_______________________________________________________________________
118 {
120 
121  DeclareOptionRef(n_estimators, "NEstimators", "Integer, optional (default=10). The number of trees in the forest.");
122  DeclareOptionRef(criterion, "Criterion", "//string, optional (default='gini') \
123  The function to measure the quality of a split. Supported criteria are \
124  'gini' for the Gini impurity and 'entropy' for the information gain. \
125  Note: this parameter is tree-specific.");
126 
127  DeclareOptionRef(max_depth, "MaxDepth", "integer or None, optional (default=None) \
128  The maximum depth of the tree. If None, then nodes are expanded until \
129  all leaves are pure or until all leaves contain less than \
130  min_samples_split samples. \
131  Ignored if ``max_leaf_nodes`` is not None.");
132  DeclareOptionRef(min_samples_split, "MinSamplesSplit", "integer, optional (default=2)\
133  The minimum number of samples required to split an internal node.");
134 
135  DeclareOptionRef(min_samples_leaf, "MinSamplesLeaf", "integer, optional (default=1) \
136  The minimum number of samples in newly created leaves. A split is \
137  discarded if after the split, one of the leaves would contain less then \
138  ``min_samples_leaf`` samples.");
139  DeclareOptionRef(min_weight_fraction_leaf, "MinWeightFractionLeaf", "//float, optional (default=0.) \
140  The minimum weighted fraction of the input samples required to be at a \
141  leaf node.");
142  DeclareOptionRef(max_features, "MaxFeatures", "The number of features to consider when looking for the best split");
143  DeclareOptionRef(max_leaf_nodes, "MaxLeafNodes", "int or None, optional (default=None)\
144  Grow trees with ``max_leaf_nodes`` in best-first fashion.\
145  Best nodes are defined as relative reduction in impurity.\
146  If None then unlimited number of leaf nodes.\
147  If not None then ``max_depth`` will be ignored.");
148  DeclareOptionRef(bootstrap, "Bootstrap", "boolean, optional (default=True) \
149  Whether bootstrap samples are used when building trees.");
150  DeclareOptionRef(oob_score, "OoBScore", " bool Whether to use out-of-bag samples to estimate\
151  the generalization error.");
152  DeclareOptionRef(n_jobs, "NJobs", " integer, optional (default=1) \
153  The number of jobs to run in parallel for both `fit` and `predict`. \
154  If -1, then the number of jobs is set to the number of cores.");
155 
156  DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\
157  If int, random_state is the seed used by the random number generator;\
158  If RandomState instance, random_state is the random number generator;\
159  If None, the random number generator is the RandomState instance used\
160  by `np.random`.");
161  DeclareOptionRef(verbose, "Verbose", "int, optional (default=0)\
162  Controls the verbosity of the tree building process.");
163  DeclareOptionRef(warm_start, "WarmStart", "bool, optional (default=False)\
164  When set to ``True``, reuse the solution of the previous call to fit\
165  and add more estimators to the ensemble, otherwise, just fit a whole\
166  new forest.");
167  DeclareOptionRef(class_weight, "ClassWeight", "dict, list of dicts, \"auto\", \"subsample\" or None, optional\
168  Weights associated with classes in the form ``{class_label: weight}``.\
169  If not given, all classes are supposed to have weight one. For\
170  multi-output problems, a list of dicts can be provided in the same\
171  order as the columns of y.\
172  The \"auto\" mode uses the values of y to automatically adjust\
173  weights inversely proportional to class frequencies in the input data.\
174  The \"subsample\" mode is the same as \"auto\" except that weights are\
175  computed based on the bootstrap sample for every tree grown.\
176  For multi-output, the weights of each column of y will be multiplied.\
177  Note that these weights will be multiplied with sample_weight (passed\
178  through the fit method) if sample_weight is specified.");
179 }
180 
181 //_______________________________________________________________________
183 {
184  if (n_estimators <= 0) {
185  Log() << kERROR << " NEstimators <=0... that does not work !! "
186  << " I set it to 10 .. just so that the program does not crash"
187  << Endl;
188  n_estimators = 10;
189  }
190  if (criterion != "gini" && criterion != "entropy") {
191  Log() << kFATAL << Form(" Criterion = %s... that does not work !! ", criterion.Data())
192  << " The options are gini of entropy."
193  << Endl;
194  }
195  PyObject *pomax_depth = Eval(max_depth);
196  if (!pomax_depth) {
197  Log() << kFATAL << Form(" MaxDepth = %s... that does not work !! ", criterion.Data())
198  << " The options are None or integer."
199  << Endl;
200  }
201  Py_DECREF(pomax_depth);
202 
203  if (min_samples_split < 0) {
204  Log() << kERROR << " MinSamplesSplit < 0... that does not work !! "
205  << " I set it to 2 .. just so that the program does not crash"
206  << Endl;
207  min_samples_split = 2;
208  }
209  if (min_samples_leaf < 0) {
210  Log() << kERROR << " MinSamplesLeaf < 0... that does not work !! "
211  << " I set it to 1 .. just so that the program does not crash"
212  << Endl;
213  min_samples_leaf = 1;
214  }
215 
216  if (min_weight_fraction_leaf < 0) {
217  Log() << kERROR << " MinWeightFractionLeaf < 0... that does not work !! "
218  << " I set it to 0 .. just so that the program does not crash"
219  << Endl;
221  }
222  if (max_features == "auto" || max_features == "sqrt" || max_features == "log2")max_features = Form("'%s'", max_features.Data());
223  PyObject *pomax_features = Eval(max_features);
224  if (!pomax_features) {
225  Log() << kFATAL << Form(" MaxFeatures = %s... that does not work !! ", max_features.Data())
226  << "int, float, string or None, optional (default='auto')"
227  << "The number of features to consider when looking for the best split:"
228  << "If int, then consider `max_features` features at each split."
229  << "If float, then `max_features` is a percentage and"
230  << "`int(max_features * n_features)` features are considered at each split."
231  << "If 'auto', then `max_features=sqrt(n_features)`."
232  << "If 'sqrt', then `max_features=sqrt(n_features)`."
233  << "If 'log2', then `max_features=log2(n_features)`."
234  << "If None, then `max_features=n_features`."
235  << Endl;
236  }
237  Py_DECREF(pomax_features);
238 
239  PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
240  if (!pomax_leaf_nodes) {
241  Log() << kFATAL << Form(" MaxLeafNodes = %s... that does not work !! ", max_leaf_nodes.Data())
242  << " The options are None or integer."
243  << Endl;
244  }
245  Py_DECREF(pomax_leaf_nodes);
246 
247 // bootstrap(kTRUE),
248 // oob_score(kFALSE),
249 // n_jobs(1),
250 
251  PyObject *porandom_state = Eval(random_state);
252  if (!porandom_state) {
253  Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", random_state.Data())
254  << "If int, random_state is the seed used by the random number generator;"
255  << "If RandomState instance, random_state is the random number generator;"
256  << "If None, the random number generator is the RandomState instance used by `np.random`."
257  << Endl;
258  }
259  Py_DECREF(porandom_state);
260 
261 // verbose(0),
262 // warm_start(kFALSE),
263 // class_weight("None")
264  PyObject *poclass_weight = Eval(class_weight);
265  if (!poclass_weight) {
266  Log() << kFATAL << Form(" ClassWeight = %s... that does not work !! ", class_weight.Data())
267  << "dict, list of dicts, 'auto', 'subsample' or None, optional"
268  << Endl;
269  }
270  Py_DECREF(poclass_weight);
271 
272 }
273 
274 //_______________________________________________________________________
276 {
277  ProcessOptions();
278  _import_array();//require to use numpy arrays
279 
280  //Import sklearn
281  // Convert the file name to a Python string.
282  PyObject *pName = PyUnicode_FromString("sklearn.ensemble");
283  // Import the file as a Python module.
284  fModule = PyImport_Import(pName);
285  Py_DECREF(pName);
286 
287  if (!fModule) {
288  Log() << kFATAL << "Can't import sklearn.ensemble" << Endl;
289  Log() << Endl;
290  }
291 
292 
293  //Training data
294  UInt_t fNvars = Data()->GetNVariables();
295  int fNrowsTraining = Data()->GetNTrainingEvents(); //every row is an event, a class type and a weight
296  int *dims = new int[2];
297  dims[0] = fNrowsTraining;
298  dims[1] = fNvars;
299  fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
300  float *TrainData = (float *)(PyArray_DATA(fTrainData));
301 
302 
303  fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
304  float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));
305 
306  fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
307  float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));
308 
309  for (int i = 0; i < fNrowsTraining; i++) {
310  const TMVA::Event *e = Data()->GetTrainingEvent(i);
311  for (UInt_t j = 0; j < fNvars; j++) {
312  TrainData[j + i * fNvars] = e->GetValue(j);
313  }
314  if (e->GetClass() == TMVA::Types::kSignal) TrainDataClasses[i] = TMVA::Types::kSignal;
315  else TrainDataClasses[i] = TMVA::Types::kBackground;
316 
317  TrainDataWeights[i] = e->GetWeight();
318  }
319 
320  delete dims;
321 
322 }
323 
324 //_______________________________________________________________________
326 {
327 
328  //NOTE: max_features must have 3 defferents variables int, float and string
329  if (max_features == "auto" || max_features == "sqrt" || max_features == "log2")max_features = Form("'%s'", max_features.Data());
330  PyObject *pomax_features = Eval(max_features);
331  PyObject *pomax_depth = Eval(max_depth);
332  PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
333  PyObject *porandom_state = Eval(random_state);
334  PyObject *poclass_weight = Eval(class_weight);
335 
336  PyObject *args = Py_BuildValue("(isOiifOOiiiOiiO)", n_estimators, criterion.Data(), pomax_depth, min_samples_split, \
337  min_samples_leaf, min_weight_fraction_leaf, pomax_features, pomax_leaf_nodes, \
338  bootstrap, oob_score, n_jobs, porandom_state, verbose, warm_start, poclass_weight);
339  Py_DECREF(pomax_depth);
340  PyObject_Print(args, stdout, 0);
341  std::cout << std::endl;
342 
343  PyObject *pDict = PyModule_GetDict(fModule);
344  PyObject *fClassifierClass = PyDict_GetItemString(pDict, "RandomForestClassifier");
345  // Log() << kFATAL <<"Train =" <<n_jobs<<Endl;
346 
347  // Create an instance of the class
348  if (PyCallable_Check(fClassifierClass)) {
349  //instance
350  fClassifier = PyObject_CallObject(fClassifierClass , args);
351  PyObject_Print(fClassifier, stdout, 0);
352 
353  Py_DECREF(args);
354  } else {
355  PyErr_Print();
356  Py_DECREF(pDict);
357  Py_DECREF(fClassifierClass);
358  Log() << kFATAL << "Can't call function RandomForestClassifier" << Endl;
359  Log() << Endl;
360 
361  }
362 
363  fClassifier = PyObject_CallMethod(fClassifier, const_cast<char *>("fit"), const_cast<char *>("(OOO)"), fTrainData, fTrainDataClasses, fTrainDataWeights);
364 
365  if(!fClassifier)
366  {
367  Log() << kFATAL << "Can't create classifier object from RandomForestClassifier" << Endl;
368  Log() << Endl;
369  }
370 
371  TString path = GetWeightFileDir() + "/PyRFModel.PyData";
372  Log() << Endl;
373  Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
374  Log() << Endl;
375  Serialize(path,fClassifier);
376 }
377 
378 //_______________________________________________________________________
380 {
382 }
383 
384 
385 //_______________________________________________________________________
387 {
388  // cannot determine error
389  NoErrorCalc(errLower, errUpper);
390 
392 
393  Double_t mvaValue;
394  const TMVA::Event *e = Data()->GetEvent();
395  UInt_t nvars = e->GetNVariables();
396  int *dims = new int[2];
397  dims[0] = 1;
398  dims[1] = nvars;
399  PyArrayObject *pEvent= (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
400  float *pValue = (float *)(PyArray_DATA(pEvent));
401 
402  for (UInt_t i = 0; i < nvars; i++) pValue[i] = e->GetValue(i);
403 
404  PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
405  double *proba = (double *)(PyArray_DATA(result));
406  mvaValue = proba[0]; //getting signal prob
407  Py_DECREF(result);
408  Py_DECREF(pEvent);
409  delete dims;
410  return mvaValue;
411 }
412 
413 //_______________________________________________________________________
415 {
416  if (!PyIsInitialized()) {
417  PyInitialize();
418  }
419 
420  TString path = GetWeightFileDir() + "/PyRFModel.PyData";
421  Log() << Endl;
422  Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
423  Log() << Endl;
424  UnSerialize(path,&fClassifier);
425  if(!fClassifier)
426  {
427  Log() << kFATAL << "Can't load RandomForestClassifier from Serialized data." << Endl;
428  Log() << Endl;
429  }
430 }
431 
432 //_______________________________________________________________________
434 {
435  // get help message text
436  //
437  // typical length of text line:
438  // "|--------------------------------------------------------------|"
439  Log() << Endl;
440  Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
441  Log() << Endl;
442  Log() << "Decision Trees and Rule-Based Models " << Endl;
443  Log() << Endl;
444  Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
445  Log() << Endl;
446  Log() << Endl;
447  Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
448  Log() << Endl;
449  Log() << "<None>" << Endl;
450 }
451 
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
const TString & GetWeightFileDir() const
Definition: MethodBase.h:407
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
PyObject * fClassifier
Definition: PyMethodBase.h:114
const Event * GetTrainingEvent(Long64_t ievt) const
Definition: DataSet.h:96
Config & gConfig()
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
DataSet * Data() const
Definition: MethodBase.h:363
EAnalysisType
Definition: Types.h:124
Basic string class.
Definition: TString.h:137
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
static void Serialize(TString file, PyObject *classifier)
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:376
PyArrayObject * fTrainDataClasses
Definition: PyMethodBase.h:118
static int PyIsInitialized()
static void PyInitialize()
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
Definition: Event.cxx:231
const char * Data() const
Definition: TString.h:349
Tools & gTools()
Definition: Tools.cxx:79
static PyObject * Eval(TString code)
PyArrayObject * fTrainDataWeights
Definition: PyMethodBase.h:117
Double_t GetMvaValue(Double_t *errLower=0, Double_t *errUpper=0)
UInt_t GetNVariables() const
accessor to the number of variables
Definition: Event.cxx:303
#define None
Definition: TGWin32.h:68
PyObject * fModule
Definition: PyMethodBase.h:113
unsigned int UInt_t
Definition: RtypesCore.h:42
bool verbose
char * Form(const char *fmt,...)
PyArrayObject * fTrainData
Definition: PyMethodBase.h:116
const Event * GetEvent() const
Definition: DataSet.cxx:180
double Double_t
Definition: RtypesCore.h:55
Describe directory structure in memory.
Definition: TDirectory.h:41
int type
Definition: TGX11.cxx:120
MsgLogger & Log() const
Definition: Configurable.h:130
UInt_t GetNVariables() const
access the number of variables through the datasetinfo
Definition: DataSet.cxx:194
UInt_t GetClass() const
Definition: Event.h:86
const TString & Color(const TString &)
human readable color strings
Definition: Tools.cxx:837
ClassImp(MethodPyRandomForest) MethodPyRandomForest
#define REGISTER_METHOD(CLASS)
for example
Abstract ClassifierFactory template that handles arbitrary types.
MethodPyRandomForest(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="", TDirectory *theTargetDir=NULL)
virtual void DeclareCompatibilityOptions()
options that are used ONLY for the READER to ensure backward compatibility they are hence without any...
Definition: MethodBase.cxx:599
void SetWeightFileDir(TString fileDir)
set directory of weight file
virtual void TestClassification()
initialization
Long64_t GetNTrainingEvents() const
Definition: DataSet.h:90
double result[121]
static void UnSerialize(TString file, PyObject **obj)
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual void TestClassification()
initialization
_object PyObject
Definition: TPyArg.h:22
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
Definition: MethodBase.cxx:820