Logo ROOT   6.18/05
Reference Guide
MethodPyAdaBoost.cxx
Go to the documentation of this file.
1// @(#)root/tmva/pymva $Id$
2// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : MethodPyAdaBoost *
8 * Web : http://oproject.org *
9 * *
10 * Description: *
11 * AdaBoost Classifier from Scikit learn *
12 * *
13 * *
14 * Redistribution and use in source and binary forms, with or without *
15 * modification, are permitted according to the terms listed in LICENSE *
16 * (http://tmva.sourceforge.net/LICENSE) *
17 * *
18 **********************************************************************************/
19
20#include <Python.h> // Needs to be included first to avoid redefinition of _POSIX_C_SOURCE
22
23#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
24#include <numpy/arrayobject.h>
25
26#include "TMVA/Config.h"
27#include "TMVA/Configurable.h"
29#include "TMVA/DataSet.h"
30#include "TMVA/Event.h"
31#include "TMVA/IMethod.h"
32#include "TMVA/MsgLogger.h"
33#include "TMVA/PDF.h"
34#include "TMVA/Ranking.h"
35#include "TMVA/Tools.h"
36#include "TMVA/Types.h"
37#include "TMVA/Timer.h"
39#include "TMVA/Results.h"
40
41#include "TMath.h"
42#include "Riostream.h"
43#include "TMatrix.h"
44#include "TMatrixD.h"
45#include "TVectorD.h"
46
47#include <iomanip>
48#include <fstream>
49
50using namespace TMVA;
51
52namespace TMVA {
53namespace Internal {
54class PyGILRAII {
55 PyGILState_STATE m_GILState;
56
57public:
58 PyGILRAII() : m_GILState(PyGILState_Ensure()) {}
59 ~PyGILRAII() { PyGILState_Release(m_GILState); }
60};
61} // namespace Internal
62} // namespace TMVA
63
64REGISTER_METHOD(PyAdaBoost)
65
67
68//_______________________________________________________________________
69MethodPyAdaBoost::MethodPyAdaBoost(const TString &jobName,
70 const TString &methodTitle,
71 DataSetInfo &dsi,
72 const TString &theOption) :
73 PyMethodBase(jobName, Types::kPyAdaBoost, methodTitle, dsi, theOption),
74 fBaseEstimator("None"),
75 fNestimators(50),
76 fLearningRate(1.0),
77 fAlgorithm("SAMME.R"),
78 fRandomState("None")
79{
80}
81
82//_______________________________________________________________________
84 const TString &theWeightFile) :
85 PyMethodBase(Types::kPyAdaBoost, theData, theWeightFile),
86 fBaseEstimator("None"),
87 fNestimators(50),
88 fLearningRate(1.0),
89 fAlgorithm("SAMME.R"),
90 fRandomState("None")
91{
92}
93
94//_______________________________________________________________________
96{
97}
98
99//_______________________________________________________________________
101{
102 if (type == Types::kClassification && numberClasses == 2) return kTRUE;
103 if (type == Types::kMulticlass && numberClasses >= 2) return kTRUE;
104 return kFALSE;
105}
106
107//_______________________________________________________________________
109{
111
112 DeclareOptionRef(fBaseEstimator, "BaseEstimator", "object, optional (default=DecisionTreeClassifier)\
113 The base estimator from which the boosted ensemble is built.\
114 Support for sample weighting is required, as well as proper `classes_`\
115 and `n_classes_` attributes.");
116
117 DeclareOptionRef(fNestimators, "NEstimators", "integer, optional (default=50)\
118 The maximum number of estimators at which boosting is terminated.\
119 In case of perfect fit, the learning procedure is stopped early.");
120
121 DeclareOptionRef(fLearningRate, "LearningRate", "float, optional (default=1.)\
122 Learning rate shrinks the contribution of each classifier by\
123 ``learning_rate``. There is a trade-off between ``learning_rate`` and\
124 ``n_estimators``.");
125
126 DeclareOptionRef(fAlgorithm, "Algorithm", "{'SAMME', 'SAMME.R'}, optional (default='SAMME.R')\
127 If 'SAMME.R' then use the SAMME.R real boosting algorithm.\
128 ``base_estimator`` must support calculation of class probabilities.\
129 If 'SAMME' then use the SAMME discrete boosting algorithm.\
130 The SAMME.R algorithm typically converges faster than SAMME,\
131 achieving a lower test error with fewer boosting iterations.");
132
133 DeclareOptionRef(fRandomState, "RandomState", "int, RandomState instance or None, optional (default=None)\
134 If int, random_state is the seed used by the random number generator;\
135 If RandomState instance, random_state is the random number generator;\
136 If None, the random number generator is the RandomState instance used\
137 by `np.random`.");
138
139 DeclareOptionRef(fFilenameClassifier, "FilenameClassifier",
140 "Store trained classifier in this file");
141}
142
143//_______________________________________________________________________
144// Check options and load them to local python namespace
146{
148 if (!pBaseEstimator) {
149 Log() << kFATAL << Form("BaseEstimator = %s ... that does not work!", fBaseEstimator.Data())
150 << " The options are Object or None." << Endl;
151 }
152 PyDict_SetItemString(fLocalNS, "baseEstimator", pBaseEstimator);
153
154 if (fNestimators <= 0) {
155 Log() << kFATAL << "NEstimators <=0 ... that does not work!" << Endl;
156 }
158 PyDict_SetItemString(fLocalNS, "nEstimators", pNestimators);
159
160 if (fLearningRate <= 0) {
161 Log() << kFATAL << "LearningRate <=0 ... that does not work!" << Endl;
162 }
164 PyDict_SetItemString(fLocalNS, "learningRate", pLearningRate);
165
166 if (fAlgorithm != "SAMME" && fAlgorithm != "SAMME.R") {
167 Log() << kFATAL << Form("Algorithm = %s ... that does not work!", fAlgorithm.Data())
168 << " The options are SAMME of SAMME.R." << Endl;
169 }
170 pAlgorithm = Eval(Form("'%s'", fAlgorithm.Data()));
171 PyDict_SetItemString(fLocalNS, "algorithm", pAlgorithm);
172
174 if (!pRandomState) {
175 Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", fRandomState.Data())
176 << "If int, random_state is the seed used by the random number generator;"
177 << "If RandomState instance, random_state is the random number generator;"
178 << "If None, the random number generator is the RandomState instance used by `np.random`." << Endl;
179 }
180 PyDict_SetItemString(fLocalNS, "randomState", pRandomState);
181
182 // If no filename is given, set default
184 fFilenameClassifier = GetWeightFileDir() + "/PyAdaBoostModel_" + GetName() + ".PyData";
185 }
186}
187
188//_______________________________________________________________________
190{
191 TMVA::Internal::PyGILRAII raii;
192 _import_array(); //require to use numpy arrays
193
194 // Check options and load them to local python namespace
196
197 // Import module for ada boost classifier
198 PyRunString("import sklearn.ensemble");
199
200 // Get data properties
203}
204
205//_______________________________________________________________________
207{
208 // Load training data (data, classes, weights) to python arrays
209 int fNrowsTraining = Data()->GetNTrainingEvents(); //every row is an event, a class type and a weight
210 npy_intp dimsData[2];
211 dimsData[0] = fNrowsTraining;
212 dimsData[1] = fNvars;
213 fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
214 PyDict_SetItemString(fLocalNS, "trainData", (PyObject*)fTrainData);
215 float *TrainData = (float *)(PyArray_DATA(fTrainData));
216
217 npy_intp dimsClasses = (npy_intp) fNrowsTraining;
218 fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
219 PyDict_SetItemString(fLocalNS, "trainDataClasses", (PyObject*)fTrainDataClasses);
220 float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));
221
222 fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
223 PyDict_SetItemString(fLocalNS, "trainDataWeights", (PyObject*)fTrainDataWeights);
224 float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));
225
226 for (int i = 0; i < fNrowsTraining; i++) {
227 // Fill training data matrix
228 const TMVA::Event *e = Data()->GetTrainingEvent(i);
229 for (UInt_t j = 0; j < fNvars; j++) {
230 TrainData[j + i * fNvars] = e->GetValue(j);
231 }
232
233 // Fill target classes
234 TrainDataClasses[i] = e->GetClass();
235
236 // Get event weight
237 TrainDataWeights[i] = e->GetWeight();
238 }
239
240 // Create classifier object
241 PyRunString("classifier = sklearn.ensemble.AdaBoostClassifier(base_estimator=baseEstimator, n_estimators=nEstimators, learning_rate=learningRate, algorithm=algorithm, random_state=randomState)",
242 "Failed to setup classifier");
243
244 // Fit classifier
245 // NOTE: We dump the output to a variable so that the call does not pollute stdout
246 PyRunString("dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)", "Failed to train classifier");
247
248 // Store classifier
249 fClassifier = PyDict_GetItemString(fLocalNS, "classifier");
250 if(fClassifier == 0) {
251 Log() << kFATAL << "Can't create classifier object from AdaBoostClassifier" << Endl;
252 Log() << Endl;
253 }
254
255 if (IsModelPersistence()) {
256 Log() << Endl;
257 Log() << gTools().Color("bold") << "Saving state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
258 Log() << Endl;
260 }
261}
262
263//_______________________________________________________________________
265{
267}
268
269//_______________________________________________________________________
270std::vector<Double_t> MethodPyAdaBoost::GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t logProgress)
271{
272 // Load model if not already done
273 if (fClassifier == 0) ReadModelFromFile();
274
275 // Determine number of events
276 Long64_t nEvents = Data()->GetNEvents();
277 if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
278 if (firstEvt < 0) firstEvt = 0;
279 nEvents = lastEvt-firstEvt;
280
281 // use timer
282 Timer timer( nEvents, GetName(), kTRUE );
283
284 if (logProgress)
285 Log() << kHEADER << Form("[%s] : ",DataInfo().GetName())
286 << "Evaluation of " << GetMethodName() << " on "
287 << (Data()->GetCurrentType() == Types::kTraining ? "training" : "testing")
288 << " sample (" << nEvents << " events)" << Endl;
289
290 // Get data
291 npy_intp dims[2];
292 dims[0] = nEvents;
293 dims[1] = fNvars;
294 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
295 float *pValue = (float *)(PyArray_DATA(pEvent));
296
297 for (Int_t ievt=0; ievt<nEvents; ievt++) {
298 Data()->SetCurrentEvent(ievt);
299 const TMVA::Event *e = Data()->GetEvent();
300 for (UInt_t i = 0; i < fNvars; i++) {
301 pValue[ievt * fNvars + i] = e->GetValue(i);
302 }
303 }
304
305 // Get prediction from classifier
306 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
307 double *proba = (double *)(PyArray_DATA(result));
308
309 // Return signal probabilities
310 if(Long64_t(mvaValues.size()) != nEvents) mvaValues.resize(nEvents);
311 for (int i = 0; i < nEvents; ++i) {
313 }
314
315 Py_DECREF(pEvent);
316 Py_DECREF(result);
317
318 if (logProgress) {
319 Log() << kINFO
320 << "Elapsed time for evaluation of " << nEvents << " events: "
321 << timer.GetElapsedTime() << " " << Endl;
322 }
323
324 return mvaValues;
325}
326
327//_______________________________________________________________________
329{
330 // cannot determine error
331 NoErrorCalc(errLower, errUpper);
332
333 // Load model if not already done
334 if (fClassifier == 0) ReadModelFromFile();
335
336 // Get current event and load to python array
337 const TMVA::Event *e = Data()->GetEvent();
338 npy_intp dims[2];
339 dims[0] = 1;
340 dims[1] = fNvars;
341 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
342 float *pValue = (float *)(PyArray_DATA(pEvent));
343 for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);
344
345 // Get prediction from classifier
346 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
347 double *proba = (double *)(PyArray_DATA(result));
348
349 // Return MVA value
350 Double_t mvaValue;
351 mvaValue = proba[TMVA::Types::kSignal]; // getting signal probability
352
353 Py_DECREF(result);
354 Py_DECREF(pEvent);
355
356 return mvaValue;
357}
358
359//_______________________________________________________________________
361{
362 // Load model if not already done
363 if (fClassifier == 0) ReadModelFromFile();
364
365 // Get current event and load to python array
366 const TMVA::Event *e = Data()->GetEvent();
367 npy_intp dims[2];
368 dims[0] = 1;
369 dims[1] = fNvars;
370 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
371 float *pValue = (float *)(PyArray_DATA(pEvent));
372 for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);
373
374 // Get prediction from classifier
375 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
376 double *proba = (double *)(PyArray_DATA(result));
377
378 // Return MVA values
379 if(UInt_t(classValues.size()) != fNoutputs) classValues.resize(fNoutputs);
380 for(UInt_t i = 0; i < fNoutputs; i++) classValues[i] = proba[i];
381
382 return classValues;
383}
384
385//_______________________________________________________________________
387{
388 if (!PyIsInitialized()) {
389 PyInitialize();
390 }
391
392 Log() << Endl;
393 Log() << gTools().Color("bold") << "Loading state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
394 Log() << Endl;
395
396 // Load classifier from file
398 if(err != 0)
399 {
400 Log() << kFATAL << Form("Failed to load classifier from file (error code: %i): %s", err, fFilenameClassifier.Data()) << Endl;
401 }
402
403 // Book classifier object in python dict
404 PyDict_SetItemString(fLocalNS, "classifier", fClassifier);
405
406 // Load data properties
407 // NOTE: This has to be repeated here for the reader application
410}
411
412//_______________________________________________________________________
414{
415 // Get feature importance from classifier as an array with length equal
416 // number of variables, higher value signals a higher importance
417 PyArrayObject* pRanking = (PyArrayObject*) PyObject_GetAttrString(fClassifier, "feature_importances_");
418 // The python object is null if the base estimator does not support
419 // variable ranking. Then, return NULL, which disables ranking.
420 if(pRanking == 0) return NULL;
421
422 // Fill ranking object and return it
423 fRanking = new Ranking(GetName(), "Variable Importance");
424 Double_t* rankingData = (Double_t*) PyArray_DATA(pRanking);
425 for(UInt_t iVar=0; iVar<fNvars; iVar++){
426 fRanking->AddRank(Rank(GetInputLabel(iVar), rankingData[iVar]));
427 }
428
429 Py_DECREF(pRanking);
430
431 return fRanking;
432}
433
434//_______________________________________________________________________
436{
437 // typical length of text line:
438 // "|--------------------------------------------------------------|"
439 Log() << "An AdaBoost classifier is a meta-estimator that begins by fitting" << Endl;
440 Log() << "a classifier on the original dataset and then fits additional copies" << Endl;
441 Log() << "of the classifier on the same dataset but where the weights of incorrectly" << Endl;
442 Log() << "classified instances are adjusted such that subsequent classifiers focus" << Endl;
443 Log() << "more on difficult cases." << Endl;
444 Log() << Endl;
445 Log() << "Check out the scikit-learn documentation for more information." << Endl;
446}
#define REGISTER_METHOD(CLASS)
for example
#define e(i)
Definition: RSha256.hxx:103
int Int_t
Definition: RtypesCore.h:41
unsigned int UInt_t
Definition: RtypesCore.h:42
const Bool_t kFALSE
Definition: RtypesCore.h:88
bool Bool_t
Definition: RtypesCore.h:59
double Double_t
Definition: RtypesCore.h:55
long long Long64_t
Definition: RtypesCore.h:69
const Bool_t kTRUE
Definition: RtypesCore.h:87
#define ClassImp(name)
Definition: Rtypes.h:365
int type
Definition: TGX11.cxx:120
_object PyObject
Definition: TPyArg.h:20
char * Form(const char *fmt,...)
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
MsgLogger & Log() const
Definition: Configurable.h:122
Class that contains all the data information.
Definition: DataSetInfo.h:60
UInt_t GetNClasses() const
Definition: DataSetInfo.h:136
const Event * GetEvent() const
Definition: DataSet.cxx:202
Types::ETreeType GetCurrentType() const
Definition: DataSet.h:205
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
Definition: DataSet.h:217
Long64_t GetNTrainingEvents() const
Definition: DataSet.h:79
void SetCurrentEvent(Long64_t ievt) const
Definition: DataSet.h:99
const Event * GetTrainingEvent(Long64_t ievt) const
Definition: DataSet.h:85
virtual void DeclareCompatibilityOptions()
options that are used ONLY for the READER to ensure backward compatibility they are hence without any...
Definition: MethodBase.cxx:601
const char * GetName() const
Definition: MethodBase.h:325
const TString & GetWeightFileDir() const
Definition: MethodBase.h:481
const TString & GetMethodName() const
Definition: MethodBase.h:322
DataSetInfo & DataInfo() const
Definition: MethodBase.h:401
virtual void TestClassification()
initialization
UInt_t GetNVariables() const
Definition: MethodBase.h:336
Bool_t IsModelPersistence()
Definition: MethodBase.h:374
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
Definition: MethodBase.cxx:841
const TString & GetInputLabel(Int_t i) const
Definition: MethodBase.h:341
Ranking * fRanking
Definition: MethodBase.h:576
DataSet * Data() const
Definition: MethodBase.h:400
std::vector< Double_t > GetMvaValues(Long64_t firstEvt=0, Long64_t lastEvt=-1, Bool_t logProgress=false)
get all the MVA values for the events of the current Data type
Double_t GetMvaValue(Double_t *errLower=0, Double_t *errUpper=0)
std::vector< Double_t > mvaValues
const Ranking * CreateRanking()
std::vector< Float_t > classValues
MethodPyAdaBoost(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
virtual void TestClassification()
initialization
virtual void ReadModelFromFile()
std::vector< Float_t > & GetMulticlassValues()
static int PyIsInitialized()
Check Python interpreter initialization status.
PyArrayObject * fTrainData
Definition: PyMethodBase.h:122
PyObject * Eval(TString code)
Evaluate Python code.
static void PyInitialize()
Initialize Python interpreter.
static void Serialize(TString file, PyObject *classifier)
Serialize Python object.
PyArrayObject * fTrainDataWeights
Definition: PyMethodBase.h:123
static Int_t UnSerialize(TString file, PyObject **obj)
Unserialize Python object.
PyObject * fClassifier
Definition: PyMethodBase.h:120
PyArrayObject * fTrainDataClasses
Definition: PyMethodBase.h:124
void PyRunString(TString code, TString errorMessage="Failed to run python code", int start=Py_single_input)
Execute Python code from string.
PyObject * fLocalNS
Definition: PyMethodBase.h:143
Ranking for variables in method (implementation)
Definition: Ranking.h:48
virtual void AddRank(const Rank &rank)
Add a new rank take ownership of it.
Definition: Ranking.cxx:86
Timing information for training and evaluation of MVA methods.
Definition: Timer.h:58
TString GetElapsedTime(Bool_t Scientific=kTRUE)
returns pretty string with elapsed time
Definition: Timer.cxx:134
const TString & Color(const TString &)
human readable color strings
Definition: Tools.cxx:840
Singleton class for Global types used by TMVA.
Definition: Types.h:73
@ kSignal
Definition: Types.h:136
EAnalysisType
Definition: Types.h:127
@ kMulticlass
Definition: Types.h:130
@ kClassification
Definition: Types.h:128
@ kTraining
Definition: Types.h:144
Basic string class.
Definition: TString.h:131
const char * Data() const
Definition: TString.h:364
Bool_t IsNull() const
Definition: TString.h:402
create variable transformations
Tools & gTools()
MsgLogger & Endl(MsgLogger &ml)
Definition: MsgLogger.h:158