MethodPyAdaBoost.cxx
// @(#)root/tmva/pymva $Id$
// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : MethodPyAdaBoost                                                      *
 * Web    : http://oproject.org                                                   *
 *                                                                                *
 * Description:                                                                   *
 *      AdaBoost Classifier from Scikit learn                                     *
 *                                                                                *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without            *
 * modification, are permitted according to the terms listed in LICENSE          *
 * (http://tmva.sourceforge.net/LICENSE)                                          *
 *                                                                                *
 **********************************************************************************/

#include <Python.h> // Needs to be included first to avoid redefinition of _POSIX_C_SOURCE
#include "TMVA/MethodPyAdaBoost.h"

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>

#include "TMVA/Config.h"
#include "TMVA/Configurable.h"
#include "TMVA/ClassifierFactory.h"
#include "TMVA/DataSet.h"
#include "TMVA/Event.h"
#include "TMVA/IMethod.h"
#include "TMVA/MsgLogger.h"
#include "TMVA/PDF.h"
#include "TMVA/Ranking.h"
#include "TMVA/Tools.h"
#include "TMVA/Types.h"
#include "TMVA/Results.h"

#include "TMath.h"
#include "Riostream.h"
#include "TMatrix.h"
#include "TMatrixD.h"
#include "TVectorD.h"

#include <iomanip>
#include <fstream>

using namespace TMVA;

REGISTER_METHOD(PyAdaBoost)
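// The REGISTER_METHOD macro above makes PyAdaBoost known to the TMVA ClassifierFactory,
// so the method can be booked through the Factory under the enum Types::kPyAdaBoost.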

ClassImp(MethodPyAdaBoost);

//_______________________________________________________________________
MethodPyAdaBoost::MethodPyAdaBoost(const TString &jobName,
                                   const TString &methodTitle,
                                   DataSetInfo &dsi,
                                   const TString &theOption) :
   PyMethodBase(jobName, Types::kPyAdaBoost, methodTitle, dsi, theOption),
   fBaseEstimator("None"),
   fNestimators(50),
   fLearningRate(1.0),
   fAlgorithm("SAMME.R"),
   fRandomState("None")
{
}

//_______________________________________________________________________
MethodPyAdaBoost::MethodPyAdaBoost(DataSetInfo &theData,
                                   const TString &theWeightFile) :
   PyMethodBase(Types::kPyAdaBoost, theData, theWeightFile),
   fBaseEstimator("None"),
   fNestimators(50),
   fLearningRate(1.0),
   fAlgorithm("SAMME.R"),
   fRandomState("None")
{
}

//_______________________________________________________________________
MethodPyAdaBoost::~MethodPyAdaBoost(void)
{
}

//_______________________________________________________________________
Bool_t MethodPyAdaBoost::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/)
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   if (type == Types::kMulticlass && numberClasses >= 2) return kTRUE;
   return kFALSE;
}

//_______________________________________________________________________
void MethodPyAdaBoost::DeclareOptions()
{
   MethodBase::DeclareCompatibilityOptions();

   DeclareOptionRef(fBaseEstimator, "BaseEstimator", "object, optional (default=DecisionTreeClassifier)\
      The base estimator from which the boosted ensemble is built.\
      Support for sample weighting is required, as well as proper `classes_`\
      and `n_classes_` attributes.");

   DeclareOptionRef(fNestimators, "NEstimators", "integer, optional (default=50)\
      The maximum number of estimators at which boosting is terminated.\
      In case of perfect fit, the learning procedure is stopped early.");

   DeclareOptionRef(fLearningRate, "LearningRate", "float, optional (default=1.)\
      Learning rate shrinks the contribution of each classifier by\
      ``learning_rate``. There is a trade-off between ``learning_rate`` and\
      ``n_estimators``.");

   DeclareOptionRef(fAlgorithm, "Algorithm", "{'SAMME', 'SAMME.R'}, optional (default='SAMME.R')\
      If 'SAMME.R' then use the SAMME.R real boosting algorithm.\
      ``base_estimator`` must support calculation of class probabilities.\
      If 'SAMME' then use the SAMME discrete boosting algorithm.\
      The SAMME.R algorithm typically converges faster than SAMME,\
      achieving a lower test error with fewer boosting iterations.");

   DeclareOptionRef(fRandomState, "RandomState", "int, RandomState instance or None, optional (default=None)\
      If int, random_state is the seed used by the random number generator;\
      If RandomState instance, random_state is the random number generator;\
      If None, the random number generator is the RandomState instance used\
      by `np.random`.");

   DeclareOptionRef(fFilenameClassifier, "FilenameClassifier",
                    "Store trained classifier in this file");
}
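
// The option names declared above ("NEstimators", "LearningRate", "Algorithm", ...)
// are the ones a user puts into the booking string. A minimal booking sketch, assuming
// the usual Factory/DataLoader setup (the variable names and option values below are
// illustrative only, not part of this file):
//
//    factory->BookMethod(dataloader, TMVA::Types::kPyAdaBoost, "PyAdaBoost",
//                        "NEstimators=100:LearningRate=0.5:Algorithm=SAMME.R");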

//_______________________________________________________________________
// Check options and load them to local python namespace
void MethodPyAdaBoost::ProcessOptions()
{
   pBaseEstimator = Eval(Form("%s", fBaseEstimator.Data()));
   if (!pBaseEstimator) {
      Log() << kFATAL << Form("BaseEstimator = %s ... that does not work!", fBaseEstimator.Data())
            << " The options are Object or None." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "baseEstimator", pBaseEstimator);

   if (fNestimators <= 0) {
      Log() << kFATAL << "NEstimators <=0 ... that does not work!" << Endl;
   }
   pNestimators = Eval(Form("%i", fNestimators));
   PyDict_SetItemString(fLocalNS, "nEstimators", pNestimators);

   if (fLearningRate <= 0) {
      Log() << kFATAL << "LearningRate <=0 ... that does not work!" << Endl;
   }
   pLearningRate = Eval(Form("%f", fLearningRate));
   PyDict_SetItemString(fLocalNS, "learningRate", pLearningRate);

   if (fAlgorithm != "SAMME" && fAlgorithm != "SAMME.R") {
      Log() << kFATAL << Form("Algorithm = %s ... that does not work!", fAlgorithm.Data())
            << " The options are SAMME or SAMME.R." << Endl;
   }
   pAlgorithm = Eval(Form("'%s'", fAlgorithm.Data()));
   PyDict_SetItemString(fLocalNS, "algorithm", pAlgorithm);

   pRandomState = Eval(Form("%s", fRandomState.Data()));
   if (!pRandomState) {
      Log() << kFATAL << Form("RandomState = %s ... that does not work!", fRandomState.Data())
            << " If int, random_state is the seed used by the random number generator;"
            << " if RandomState instance, random_state is the random number generator;"
            << " if None, the random number generator is the RandomState instance used by `np.random`." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "randomState", pRandomState);

   // If no filename is given, set default
   if (fFilenameClassifier.IsNull()) {
      fFilenameClassifier = GetWeightFileDir() + "/PyAdaBoostModel_" + GetName() + ".PyData";
   }
}

//_______________________________________________________________________
void MethodPyAdaBoost::Init()
{
   _import_array(); //require to use numpy arrays

   // Check options and load them to local python namespace
   ProcessOptions();

   // Import module for AdaBoost classifier
   PyRunString("import sklearn.ensemble");

   // Get data properties
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
}

//_______________________________________________________________________
void MethodPyAdaBoost::Train()
{
   // Load training data (data, classes, weights) to python arrays
   int fNrowsTraining = Data()->GetNTrainingEvents(); //every row is an event, a class type and a weight
   npy_intp dimsData[2];
   dimsData[0] = fNrowsTraining;
   dimsData[1] = fNvars;
   fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainData", (PyObject*)fTrainData);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));

   npy_intp dimsClasses = (npy_intp) fNrowsTraining;
   fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataClasses", (PyObject*)fTrainDataClasses);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataWeights", (PyObject*)fTrainDataWeights);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

   for (int i = 0; i < fNrowsTraining; i++) {
      // Fill training data matrix
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }

      // Fill target classes
      TrainDataClasses[i] = e->GetClass();

      // Get event weight
      TrainDataWeights[i] = e->GetWeight();
   }

   // Create classifier object
   PyRunString("classifier = sklearn.ensemble.AdaBoostClassifier(base_estimator=baseEstimator, n_estimators=nEstimators, learning_rate=learningRate, algorithm=algorithm, random_state=randomState)",
               "Failed to setup classifier");

   // Fit classifier
   // NOTE: We dump the output to a variable so that the call does not pollute stdout
   PyRunString("dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)", "Failed to train classifier");

   // Store classifier
   fClassifier = PyDict_GetItemString(fLocalNS, "classifier");
   if (fClassifier == 0) {
      Log() << kFATAL << "Can't create classifier object from AdaBoostClassifier" << Endl;
      Log() << Endl;
   }

   if (IsModelPersistence()) {
      Log() << Endl;
      Log() << gTools().Color("bold") << "Saving state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
      Log() << Endl;
      Serialize(fFilenameClassifier, fClassifier);
   }
}
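
// Note on the fit call above: scikit-learn's AdaBoostClassifier.fit(X, y, sample_weight)
// takes the per-sample weights as its third argument, so the TMVA event weights filled
// into trainDataWeights are passed directly as scikit-learn sample weights.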

//_______________________________________________________________________
void MethodPyAdaBoost::TestClassification()
{
   MethodBase::TestClassification();
}

//_______________________________________________________________________
std::vector<Double_t> MethodPyAdaBoost::GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t)
{
   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Determine number of events
   Long64_t nEvents = Data()->GetNEvents();
   if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
   if (firstEvt < 0) firstEvt = 0;
   nEvents = lastEvt - firstEvt;

   // Get data
   npy_intp dims[2];
   dims[0] = nEvents;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));

   for (Int_t ievt = 0; ievt < nEvents; ievt++) {
      Data()->SetCurrentEvent(ievt);
      const TMVA::Event *e = Data()->GetEvent();
      for (UInt_t i = 0; i < fNvars; i++) {
         pValue[ievt * fNvars + i] = e->GetValue(i);
      }
   }

   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));
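   // predict_proba yields one row of fNoutputs class probabilities per event,
   // stored contiguously, so the probabilities of event i start at proba[fNoutputs * i].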

   // Return signal probabilities
   if (Long64_t(mvaValues.size()) != nEvents) mvaValues.resize(nEvents);
   for (int i = 0; i < nEvents; ++i) {
      mvaValues[i] = proba[fNoutputs * i + TMVA::Types::kSignal];
   }

   Py_DECREF(pEvent);
   Py_DECREF(result);

   return mvaValues;
}

//_______________________________________________________________________
Double_t MethodPyAdaBoost::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // cannot determine error
   NoErrorCalc(errLower, errUpper);

   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Get current event and load to python array
   const TMVA::Event *e = Data()->GetEvent();
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);

   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // Return MVA value
   Double_t mvaValue;
   mvaValue = proba[TMVA::Types::kSignal]; // getting signal probability

   Py_DECREF(result);
   Py_DECREF(pEvent);

   return mvaValue;
}
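
// For completeness, a minimal sketch of how a trained PyAdaBoost method is typically
// evaluated per event through the standard TMVA::Reader workflow (the file and variable
// names below are illustrative, not part of this class):
//
//    TMVA::Reader reader("!Color:!Silent");
//    Float_t var1; reader.AddVariable("var1", &var1);
//    reader.BookMVA("PyAdaBoost", "dataset/weights/TMVAClassification_PyAdaBoost.weights.xml");
//    // ... fill var1 for the current event ...
//    Double_t mva = reader.EvaluateMVA("PyAdaBoost");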

//_______________________________________________________________________
std::vector<Float_t>& MethodPyAdaBoost::GetMulticlassValues()
{
   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Get current event and load to python array
   const TMVA::Event *e = Data()->GetEvent();
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);

   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // Return MVA values
   if (UInt_t(classValues.size()) != fNoutputs) classValues.resize(fNoutputs);
   for (UInt_t i = 0; i < fNoutputs; i++) classValues[i] = proba[i];

   // Release the temporary python objects (the probabilities were copied into classValues)
   Py_DECREF(result);
   Py_DECREF(pEvent);

   return classValues;
}

//_______________________________________________________________________
void MethodPyAdaBoost::ReadModelFromFile()
{
   if (!PyIsInitialized()) {
      PyInitialize();
   }

   Log() << Endl;
   Log() << gTools().Color("bold") << "Loading state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
   Log() << Endl;

   // Load classifier from file
   Int_t err = UnSerialize(fFilenameClassifier, &fClassifier);
   if (err != 0) {
      Log() << kFATAL << Form("Failed to load classifier from file (error code: %i): %s", err, fFilenameClassifier.Data()) << Endl;
   }

   // Book classifier object in python dict
   PyDict_SetItemString(fLocalNS, "classifier", fClassifier);

   // Load data properties
   // NOTE: This has to be repeated here for the reader application
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
}

//_______________________________________________________________________
const Ranking* MethodPyAdaBoost::CreateRanking()
{
   // Get feature importance from classifier as an array with length equal
   // number of variables, higher value signals a higher importance
   PyArrayObject* pRanking = (PyArrayObject*) PyObject_GetAttrString(fClassifier, "feature_importances_");
   // The python object is null if the base estimator does not support
   // variable ranking. Then, return NULL, which disables ranking.
   if (pRanking == 0) return NULL;

   // Fill ranking object and return it
   fRanking = new Ranking(GetName(), "Variable Importance");
   Double_t* rankingData = (Double_t*) PyArray_DATA(pRanking);
   for (UInt_t iVar = 0; iVar < fNvars; iVar++) {
      fRanking->AddRank(Rank(GetInputLabel(iVar), rankingData[iVar]));
   }

   Py_DECREF(pRanking);

   return fRanking;
}

//_______________________________________________________________________
void MethodPyAdaBoost::GetHelpMessage() const
{
   // typical length of text line:
   //       "|--------------------------------------------------------------|"
   Log() << "An AdaBoost classifier is a meta-estimator that begins by fitting" << Endl;
   Log() << "a classifier on the original dataset and then fits additional copies" << Endl;
   Log() << "of the classifier on the same dataset but where the weights of incorrectly" << Endl;
   Log() << "classified instances are adjusted such that subsequent classifiers focus" << Endl;
   Log() << "more on difficult cases." << Endl;
   Log() << Endl;
   Log() << "Check out the scikit-learn documentation for more information." << Endl;
}