ROOT  6.07/01
Reference Guide
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
MethodPyRandomForest.cxx
Go to the documentation of this file.
1 // @(#)root/tmva/pymva $Id$
2 // Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015
3 
4 /**********************************************************************************
5  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6  * Package: TMVA *
7  * Class : MethodPyRandomForest *
8  * Web : http://oproject.org *
9  * *
10  * Description: *
 11  * Random Forest Classifier from scikit-learn *
12  * *
13  * *
14  * Redistribution and use in source and binary forms, with or without *
15  * modification, are permitted according to the terms listed in LICENSE *
16  * (http://tmva.sourceforge.net/LICENSE) *
17  * *
18  **********************************************************************************/
19 #pragma GCC diagnostic ignored "-Wunused-parameter"
20 #include <iomanip>
21 #include <fstream>
22 
23 #include <Python.h>
24 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
25 #include <numpy/arrayobject.h>
26 
27 #include "TMath.h"
28 #include "Riostream.h"
29 #include "TMatrix.h"
30 #include "TMatrixD.h"
31 #include "TVectorD.h"
32 
35 #include "TMVA/Tools.h"
36 #include "TMVA/Ranking.h"
37 #include "TMVA/Types.h"
38 #include "TMVA/Config.h"
39 #include "TMVA/PDF.h"
40 #include "TMVA/ClassifierFactory.h"
41 
42 #include "TMVA/Results.h"
43 
44 
45 
46 using namespace TMVA;
47 
48 REGISTER_METHOD(PyRandomForest)
49 
 51 
 52 //_______________________________________________________________________
// Standard constructor: forwards job name, method title, dataset info and the
// option string to PyMethodBase, and initializes every hyperparameter to the
// scikit-learn RandomForestClassifier default (n_estimators=10,
// criterion='gini', max_depth=None, ...).  String-typed options hold Python
// expressions evaluated later (note max_features is pre-quoted "'auto'").
// NOTE(review): the opening signature line (doxygen line 53, presumably
// "MethodPyRandomForest::MethodPyRandomForest(const TString &jobName,") was
// lost in this extraction -- confirm against the original source.
 54  const TString &methodTitle,
 55  DataSetInfo &dsi,
 56  const TString &theOption,
 57  TDirectory *theTargetDir) :
 58  PyMethodBase(jobName, Types::kPyRandomForest, methodTitle, dsi, theOption, theTargetDir),
 59  n_estimators(10),
 60  criterion("gini"),
 61  max_depth("None"),
 62  min_samples_split(2),
 63  min_samples_leaf(1),
 64  min_weight_fraction_leaf(0),
 65  max_features("'auto'"),
 66  max_leaf_nodes("None"),
 67  bootstrap(kTRUE),
 68  oob_score(kFALSE),
 69  n_jobs(1),
 70  random_state("None"),
 71  verbose(0),
 72  warm_start(kFALSE),
 73  class_weight("None")
 74 {
 75  // standard constructor for the PyRandomForest
 // Weight files go to the directory configured in TMVA's global I/O settings.
 76  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
 77 
 78 }
79 
 80 //_______________________________________________________________________
// Constructor used when the method is instantiated from an existing weight
// file (no job name / option string).  Hyperparameter defaults are identical
// to the standard constructor above.
 81 MethodPyRandomForest::MethodPyRandomForest(DataSetInfo &theData, const TString &theWeightFile, TDirectory *theTargetDir)
 82  : PyMethodBase(Types::kPyRandomForest, theData, theWeightFile, theTargetDir),
 83  n_estimators(10),
 84  criterion("gini"),
 85  max_depth("None"),
 86  min_samples_split(2),
 87  min_samples_leaf(1),
 88  min_weight_fraction_leaf(0),
 89  max_features("'auto'"),
 90  max_leaf_nodes("None"),
 91  bootstrap(kTRUE),
 92  oob_score(kFALSE),
 93  n_jobs(1),
 94  random_state("None"),
 95  verbose(0),
 96  warm_start(kFALSE),
 97  class_weight("None")
 98 {
 // Weight files go to the directory configured in TMVA's global I/O settings.
 99  SetWeightFileDir(gConfig().GetIONames().fWeightFileDir);
 100 }
101 
102 
 103 //_______________________________________________________________________
// Destructor (empty body).
// NOTE(review): the signature line (doxygen line 104, presumably
// "MethodPyRandomForest::~MethodPyRandomForest(void)") was lost in this
// extraction -- confirm against the original source.
 105 {
 106 }
107 
 108 //_______________________________________________________________________
// Tell the TMVA factory which analysis types this method supports: only
// two-class (binary) classification; everything else is rejected.
// NOTE(review): the signature line (doxygen line 109, the
// "Bool_t MethodPyRandomForest::HasAnalysisType(...)" declaration) was lost
// in this extraction -- confirm against the original source.
 110 {
 111  if (type == Types::kClassification && numberClasses == 2) return kTRUE;
 112  return kFALSE;
 113 }
114 
115 
 116 //_______________________________________________________________________
// Declare all user-configurable options.  Each option maps one-to-one onto a
// constructor argument of sklearn.ensemble.RandomForestClassifier; the help
// strings are lifted from the scikit-learn documentation.  The actual
// validation of the values happens later in ProcessOptions().
// NOTE(review): the signature line (doxygen line 117, presumably
// "void MethodPyRandomForest::DeclareOptions()") was lost in this
// extraction -- confirm against the original source.
 118 {
 120 
 121  DeclareOptionRef(n_estimators, "NEstimators", "Integer, optional (default=10). The number of trees in the forest.");
 122  DeclareOptionRef(criterion, "Criterion", "//string, optional (default='gini') \
 123  The function to measure the quality of a split. Supported criteria are \
 124  'gini' for the Gini impurity and 'entropy' for the information gain. \
 125  Note: this parameter is tree-specific.");
 126 
 127  DeclareOptionRef(max_depth, "MaxDepth", "integer or None, optional (default=None) \
 128  The maximum depth of the tree. If None, then nodes are expanded until \
 129  all leaves are pure or until all leaves contain less than \
 130  min_samples_split samples. \
 131  Ignored if ``max_leaf_nodes`` is not None.");
 132  DeclareOptionRef(min_samples_split, "MinSamplesSplit", "integer, optional (default=2)\
 133  The minimum number of samples required to split an internal node.");
 134 
 135  DeclareOptionRef(min_samples_leaf, "MinSamplesLeaf", "integer, optional (default=1) \
 136  The minimum number of samples in newly created leaves. A split is \
 137  discarded if after the split, one of the leaves would contain less then \
 138  ``min_samples_leaf`` samples.");
 139  DeclareOptionRef(min_weight_fraction_leaf, "MinWeightFractionLeaf", "//float, optional (default=0.) \
 140  The minimum weighted fraction of the input samples required to be at a \
 141  leaf node.");
 142  DeclareOptionRef(max_features, "MaxFeatures", "The number of features to consider when looking for the best split");
 143  DeclareOptionRef(max_leaf_nodes, "MaxLeafNodes", "int or None, optional (default=None)\
 144  Grow trees with ``max_leaf_nodes`` in best-first fashion.\
 145  Best nodes are defined as relative reduction in impurity.\
 146  If None then unlimited number of leaf nodes.\
 147  If not None then ``max_depth`` will be ignored.");
 148  DeclareOptionRef(bootstrap, "Bootstrap", "boolean, optional (default=True) \
 149  Whether bootstrap samples are used when building trees.");
 150  DeclareOptionRef(oob_score, "OoBScore", " bool Whether to use out-of-bag samples to estimate\
 151  the generalization error.");
 152  DeclareOptionRef(n_jobs, "NJobs", " integer, optional (default=1) \
 153  The number of jobs to run in parallel for both `fit` and `predict`. \
 154  If -1, then the number of jobs is set to the number of cores.");
 155 
 156  DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\
 157  If int, random_state is the seed used by the random number generator;\
 158  If RandomState instance, random_state is the random number generator;\
 159  If None, the random number generator is the RandomState instance used\
 160  by `np.random`.");
 161  DeclareOptionRef(verbose, "Verbose", "int, optional (default=0)\
 162  Controls the verbosity of the tree building process.");
 163  DeclareOptionRef(warm_start, "WarmStart", "bool, optional (default=False)\
 164  When set to ``True``, reuse the solution of the previous call to fit\
 165  and add more estimators to the ensemble, otherwise, just fit a whole\
 166  new forest.");
 167  DeclareOptionRef(class_weight, "ClassWeight", "dict, list of dicts, \"auto\", \"subsample\" or None, optional\
 168  Weights associated with classes in the form ``{class_label: weight}``.\
 169  If not given, all classes are supposed to have weight one. For\
 170  multi-output problems, a list of dicts can be provided in the same\
 171  order as the columns of y.\
 172  The \"auto\" mode uses the values of y to automatically adjust\
 173  weights inversely proportional to class frequencies in the input data.\
 174  The \"subsample\" mode is the same as \"auto\" except that weights are\
 175  computed based on the bootstrap sample for every tree grown.\
 176  For multi-output, the weights of each column of y will be multiplied.\
 177  Note that these weights will be multiplied with sample_weight (passed\
 178  through the fit method) if sample_weight is specified.");
 179 }
180 
181 //_______________________________________________________________________
183 {
184  if (n_estimators <= 0) {
185  Log() << kERROR << " NEstimators <=0... that does not work !! "
186  << " I set it to 10 .. just so that the program does not crash"
187  << Endl;
188  n_estimators = 10;
189  }
190  if (criterion != "gini" && criterion != "entropy") {
191  Log() << kFATAL << Form(" Criterion = %s... that does not work !! ", criterion.Data())
192  << " The options are gini of entropy."
193  << Endl;
194  }
195  PyObject *pomax_depth = Eval(max_depth);
196  if (!pomax_depth) {
197  Log() << kFATAL << Form(" MaxDepth = %s... that does not work !! ", criterion.Data())
198  << " The options are None or integer."
199  << Endl;
200  }
201  Py_DECREF(pomax_depth);
202 
203  if (min_samples_split < 0) {
204  Log() << kERROR << " MinSamplesSplit < 0... that does not work !! "
205  << " I set it to 2 .. just so that the program does not crash"
206  << Endl;
207  min_samples_split = 2;
208  }
209  if (min_samples_leaf < 0) {
210  Log() << kERROR << " MinSamplesLeaf < 0... that does not work !! "
211  << " I set it to 1 .. just so that the program does not crash"
212  << Endl;
213  min_samples_leaf = 1;
214  }
215 
216  if (min_weight_fraction_leaf < 0) {
217  Log() << kERROR << " MinWeightFractionLeaf < 0... that does not work !! "
218  << " I set it to 0 .. just so that the program does not crash"
219  << Endl;
221  }
222  if (max_features == "auto" || max_features == "sqrt" || max_features == "log2")max_features = Form("'%s'", max_features.Data());
223  PyObject *pomax_features = Eval(max_features);
224  if (!pomax_features) {
225  Log() << kFATAL << Form(" MaxFeatures = %s... that does not work !! ", max_features.Data())
226  << "int, float, string or None, optional (default='auto')"
227  << "The number of features to consider when looking for the best split:"
228  << "If int, then consider `max_features` features at each split."
229  << "If float, then `max_features` is a percentage and"
230  << "`int(max_features * n_features)` features are considered at each split."
231  << "If 'auto', then `max_features=sqrt(n_features)`."
232  << "If 'sqrt', then `max_features=sqrt(n_features)`."
233  << "If 'log2', then `max_features=log2(n_features)`."
234  << "If None, then `max_features=n_features`."
235  << Endl;
236  }
237  Py_DECREF(pomax_features);
238 
239  PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
240  if (!pomax_leaf_nodes) {
241  Log() << kFATAL << Form(" MaxLeafNodes = %s... that does not work !! ", max_leaf_nodes.Data())
242  << " The options are None or integer."
243  << Endl;
244  }
245  Py_DECREF(pomax_leaf_nodes);
246 
247 // bootstrap(kTRUE),
248 // oob_score(kFALSE),
249 // n_jobs(1),
250 
251  PyObject *porandom_state = Eval(random_state);
252  if (!porandom_state) {
253  Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", random_state.Data())
254  << "If int, random_state is the seed used by the random number generator;"
255  << "If RandomState instance, random_state is the random number generator;"
256  << "If None, the random number generator is the RandomState instance used by `np.random`."
257  << Endl;
258  }
259  Py_DECREF(porandom_state);
260 
261 // verbose(0),
262 // warm_start(kFALSE),
263 // class_weight("None")
264  PyObject *poclass_weight = Eval(class_weight);
265  if (!poclass_weight) {
266  Log() << kFATAL << Form(" ClassWeight = %s... that does not work !! ", class_weight.Data())
267  << "dict, list of dicts, 'auto', 'subsample' or None, optional"
268  << Endl;
269  }
270  Py_DECREF(poclass_weight);
271 
272 }
273 
274 //_______________________________________________________________________
276 {
277  ProcessOptions();
278  _import_array();//require to use numpy arrays
279 
280  //Import sklearn
281  // Convert the file name to a Python string.
282  PyObject *pName = PyString_FromString("sklearn.ensemble");
283  // Import the file as a Python module.
284  fModule = PyImport_Import(pName);
285  Py_DECREF(pName);
286 
287  if (!fModule) {
288  Log() << kFATAL << "Can't import sklearn.ensemble" << Endl;
289  Log() << Endl;
290  }
291 
292 
293  //Training data
294  UInt_t fNvars = Data()->GetNVariables();
295  int fNrowsTraining = Data()->GetNTrainingEvents(); //every row is an event, a class type and a weight
296  int *dims = new int[2];
297  dims[0] = fNrowsTraining;
298  dims[1] = fNvars;
299  fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
300  float *TrainData = (float *)(PyArray_DATA(fTrainData));
301 
302 
303  fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
304  float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));
305 
306  fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
307  float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));
308 
309  for (int i = 0; i < fNrowsTraining; i++) {
310  const TMVA::Event *e = Data()->GetTrainingEvent(i);
311  for (UInt_t j = 0; j < fNvars; j++) {
312  TrainData[j + i * fNvars] = e->GetValue(j);
313  }
314  if (e->GetClass() == TMVA::Types::kSignal) TrainDataClasses[i] = TMVA::Types::kSignal;
315  else TrainDataClasses[i] = TMVA::Types::kBackground;
316 
317  TrainDataWeights[i] = e->GetWeight();
318  }
319 }
320 
 321 //_______________________________________________________________________
// Build the Py_BuildValue argument tuple from the validated options,
// instantiate sklearn.ensemble.RandomForestClassifier, call fit() on the
// numpy arrays prepared in Init(), and pickle the fitted model to
// <weight dir>/PyRFModel.PyData via fPickleDumps.
// NOTE(review): the signature line (doxygen line 322, presumably
// "void MethodPyRandomForest::Train()") was lost in this extraction.
// NOTE(review): `args` is only Py_DECREF'd in the success branch, and the
// fit() return value overwrites fClassifier without releasing the previous
// reference; `model_data` is not NULL-checked before PyString_AsString.
// The PyObject_Print calls below look like leftover debug output.
 323 {
 324 // n_estimators(10),
 325 // criterion("gini"),
 326 // max_depth("None"),
 327 // min_samples_split(2),
 328 // min_samples_leaf(1),
 329 // min_weight_fraction_leaf(0.0),
 330 // max_features("'auto'"),
 331 // max_leaf_nodes("None"),
 332 // bootstrap(kTRUE),
 333 // oob_score(kFALSE),
 334 // n_jobs(1),
 335 // random_state("None"),
 336 // verbose(0),
 337 // warm_start(kFALSE),
 338 // class_weight("None")
 339 
 340  //NOTE: max_features must have 3 defferents variables int, float and string
 341  if (max_features == "auto" || max_features == "sqrt" || max_features == "log2")max_features = Form("'%s'", max_features.Data());
 342  PyObject *pomax_features = Eval(max_features);
 343  PyObject *pomax_depth = Eval(max_depth);
 344  PyObject *pomax_leaf_nodes = Eval(max_leaf_nodes);
 345  PyObject *porandom_state = Eval(random_state);
 346  PyObject *poclass_weight = Eval(class_weight);
 347 // PyObject_Print(pomax_features,stdout,0);
 348 // std::cout<<std::endl;
 349 //
 350 // PyObject_Print(pomax_depth,stdout,0);
 351 // std::cout<<std::endl;
 // Positional constructor arguments for RandomForestClassifier, in the order
 // declared by the format string "(isOiifOOiiiOiiO)".
 352  PyObject *args = Py_BuildValue("(isOiifOOiiiOiiO)", n_estimators, criterion.Data(), pomax_depth, min_samples_split, \
 353  min_samples_leaf, min_weight_fraction_leaf, pomax_features, pomax_leaf_nodes, \
 354  bootstrap, oob_score, n_jobs, porandom_state, verbose, warm_start, poclass_weight);
 355  Py_DECREF(pomax_depth);
 356  PyObject_Print(args, stdout, 0);
 357  std::cout << std::endl;
 358 
 // Look up the RandomForestClassifier class object in the imported module.
 359  PyObject *pDict = PyModule_GetDict(fModule);
 360  PyObject *fClassifierClass = PyDict_GetItemString(pDict, "RandomForestClassifier");
 361  // Log() << kFATAL <<"Train =" <<n_jobs<<Endl;
 362 
 363  // Create an instance of the class
 364  if (PyCallable_Check(fClassifierClass)) {
 365  //instance
 366  fClassifier = PyObject_CallObject(fClassifierClass , args);
 367  PyObject_Print(fClassifier, stdout, 0);
 368 
 369  Py_DECREF(args);
 370  } else {
 371  PyErr_Print();
 372  Py_DECREF(pDict);
 373  Py_DECREF(fClassifierClass);
 374  Log() << kFATAL << "Can't call function RandomForestClassifier" << Endl;
 375  Log() << Endl;
 376 
 377  }
 378 
 // fit() is called with (X, y, sample_weight); its return value (the fitted
 // estimator) replaces fClassifier.
 379  fClassifier = PyObject_CallMethod(fClassifier, const_cast<char *>("fit"), const_cast<char *>("(OOO)"), fTrainData, fTrainDataClasses, fTrainDataWeights);
 380  // PyObject_Print(fClassifier, stdout, 0);
 381  // pValue =PyObject_CallObject(fClassifier, PyString_FromString("classes_"));
 382  // PyObject_Print(pValue, stdout, 0);
 383 
 // Persist the trained model: pickle it and write the bytes to the weight dir.
 384  TString path = GetWeightFileDir() + "/PyRFModel.PyData";
 385  Log() << Endl;
 386  Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
 387  Log() << Endl;
 388 
 389  PyObject *model_arg = Py_BuildValue("(O)", fClassifier);
 390  PyObject *model_data = PyObject_CallObject(fPickleDumps , model_arg);
 391  std::ofstream PyData;
 392  PyData.open(path.Data());
 393  PyData << PyString_AsString(model_data);
 394  PyData.close();
 395  Py_DECREF(model_arg);
 396  Py_DECREF(model_data);
 397 }
398 
 399 //_______________________________________________________________________
// NOTE(review): both the signature line (doxygen line 400, presumably
// "void MethodPyRandomForest::TestClassification()") and the single body
// line (doxygen line 402, presumably "MethodBase::TestClassification();")
// were lost in this extraction -- confirm against the original source.
 401 {
 403 }
404 
405 
 406 //_______________________________________________________________________
// Return the MVA response for the current event: the signal-class
// probability (column 1) of the classifier's predict_proba() output.
// No error estimate is available (errLower/errUpper are cleared).
// NOTE(review): the signature line (doxygen line 407, presumably
// "Double_t MethodPyRandomForest::GetMvaValue(Double_t *errLower, Double_t *errUpper)")
// was lost in this extraction -- confirm against the original source.
 408 {
 409  // cannot determine error
 410  NoErrorCalc(errLower, errUpper);
 411 
 413 
 414  Double_t mvaValue;
 415  const TMVA::Event *e = Data()->GetEvent();
 416  UInt_t nvars = e->GetNVariables();
 // Pack the event's input variables into a Python tuple for predict_proba.
 417  PyObject *pEvent = PyTuple_New(nvars);
 418  for (UInt_t i = 0; i < nvars; i++) {
 419 
 420  PyObject *pValue = PyFloat_FromDouble(e->GetValue(i));
 421  if (!pValue) {
 422  Py_DECREF(pEvent);
 // NOTE(review): decref'ing fTrainData here looks unintended -- it releases
 // a reference to the training array, not anything allocated in this
 // function.  kFATAL aborts immediately after, so it is moot in practice.
 423  Py_DECREF(fTrainData);
 424  Log() << kFATAL << "Error Evaluating MVA " << Endl;
 425  }
 426  PyTuple_SetItem(pEvent, i, pValue);
 427  }
 // NOTE(review): `result` is not NULL-checked before PyArray_DATA; a failed
 // predict_proba call would crash here.
 428  PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
 429  double *proba = (double *)(PyArray_DATA(result));
 430  mvaValue = proba[1]; //getting signal prob
 431  Py_DECREF(result);
 432  Py_DECREF(pEvent);
 433  return mvaValue;
 434 }
435 
 436 //_______________________________________________________________________
// Restore a trained classifier: slurp the pickled model written by Train()
// from <weight dir>/PyRFModel.PyData and unpickle it into fClassifier via
// fPickleLoads.  Initializes the Python interpreter on demand.
// NOTE(review): the signature line (doxygen line 437) was lost in this
// extraction -- confirm the method name against the original source.
 438 {
 439  if (!PyIsInitialized()) {
 440  PyInitialize();
 441  }
 442 
 443  TString path = GetWeightFileDir() + "/PyRFModel.PyData";
 444  Log() << Endl;
 445  Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
 446  Log() << Endl;
 447  std::ifstream PyData;
 448  std::stringstream PyDataStream;
 449  std::string PyDataString;
 450 
 // Read the whole pickle file into one string.
 451  PyData.open(path.Data());
 452  PyDataStream << PyData.rdbuf();
 453  PyDataString = PyDataStream.str();
 454  PyData.close();
 455 
 456 // std::cout<<"-----------------------------------\n";
 457 // std::cout<<PyDataString.c_str();
 458 // std::cout<<"-----------------------------------\n";
 // Unpickle: pickle.loads(bytes) -> fitted classifier instance.
 459  PyObject *model_arg = Py_BuildValue("(s)", PyDataString.c_str());
 460  fClassifier = PyObject_CallObject(fPickleLoads , model_arg);
 461 
 462 
 463  Py_DECREF(model_arg);
 464 }
465 
 466 //_______________________________________________________________________
// Print the standard TMVA help text for this method to the logger.
// NOTE(review): the signature line (doxygen line 467, presumably
// "void MethodPyRandomForest::GetHelpMessage() const") was lost in this
// extraction -- confirm against the original source.
 468 {
 469  // get help message text
 470  //
 471  // typical length of text line:
 472  // "|--------------------------------------------------------------|"
 473  Log() << Endl;
 474  Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
 475  Log() << Endl;
 476  Log() << "Decision Trees and Rule-Based Models " << Endl;
 477  Log() << Endl;
 478  Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
 479  Log() << Endl;
 480  Log() << Endl;
 481  Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
 482  Log() << Endl;
 483  Log() << "<None>" << Endl;
 484 }
485 
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
const TString & GetWeightFileDir() const
Definition: MethodBase.h:407
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:162
PyObject * fClassifier
Definition: PyMethodBase.h:112
const Event * GetTrainingEvent(Long64_t ievt) const
Definition: DataSet.h:96
Config & gConfig()
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
DataSet * Data() const
Definition: MethodBase.h:363
EAnalysisType
Definition: Types.h:124
Basic string class.
Definition: TString.h:137
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is or not...
Definition: Event.cxx:376
PyArrayObject * fTrainDataClasses
Definition: PyMethodBase.h:116
static int PyIsInitialized()
static void PyInitialize()
const char * Data() const
Definition: TString.h:349
Tools & gTools()
Definition: Tools.cxx:79
static PyObject * Eval(TString code)
PyArrayObject * fTrainDataWeights
Definition: PyMethodBase.h:115
Double_t GetMvaValue(Double_t *errLower=0, Double_t *errUpper=0)
UInt_t GetNVariables() const
accessor to the number of variables
Definition: Event.cxx:303
#define None
Definition: TGWin32.h:68
PyObject * fModule
Definition: PyMethodBase.h:111
unsigned int UInt_t
Definition: RtypesCore.h:42
bool verbose
char * Form(const char *fmt,...)
PyArrayObject * fTrainData
Definition: PyMethodBase.h:114
const Event * GetEvent() const
Definition: DataSet.cxx:186
double Double_t
Definition: RtypesCore.h:55
Describe directory structure in memory.
Definition: TDirectory.h:44
int type
Definition: TGX11.cxx:120
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
Definition: Event.cxx:231
MsgLogger & Log() const
Definition: Configurable.h:130
UInt_t GetNVariables() const
access the number of variables through the datasetinfo
Definition: DataSet.cxx:200
UInt_t GetClass() const
Definition: Event.h:86
const TString & Color(const TString &)
human readable color strings
Definition: Tools.cxx:837
ClassImp(MethodPyRandomForest) MethodPyRandomForest
#define REGISTER_METHOD(CLASS)
for example
static PyObject * fPickleLoads
Definition: PyMethodBase.h:124
MethodPyRandomForest(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="", TDirectory *theTargetDir=NULL)
static PyObject * fPickleDumps
Definition: PyMethodBase.h:123
virtual void DeclareCompatibilityOptions()
options that are used ONLY for the READER to ensure backward compatibility they are hence without any...
Definition: MethodBase.cxx:606
void SetWeightFileDir(TString fileDir)
set directory of weight file
virtual void TestClassification()
initialization
Long64_t GetNTrainingEvents() const
Definition: DataSet.h:90
double result[121]
const Bool_t kTRUE
Definition: Rtypes.h:91
virtual void TestClassification()
initialization
_object PyObject
Definition: TPyArg.h:22
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
Definition: MethodBase.cxx:827