MethodPyAdaBoost.cxx
// @(#)root/tmva/pymva $Id$
// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis      *
 * Package: TMVA                                                                  *
 * Class  : MethodPyAdaBoost                                                      *
 * Web    : http://oproject.org                                                   *
 *                                                                                *
 * Description:                                                                   *
 *      AdaBoost Classifier from Scikit learn                                     *
 *                                                                                *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (see tmva/doc/LICENSE)                                                         *
 *                                                                                *
 **********************************************************************************/
#include <Python.h> // Needs to be included first to avoid redefinition of _POSIX_C_SOURCE
#include "TMVA/MethodPyAdaBoost.h"

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>

#include "TMVA/Config.h"
#include "TMVA/Configurable.h"
#include "TMVA/ClassifierFactory.h"
#include "TMVA/DataSet.h"
#include "TMVA/Event.h"
#include "TMVA/IMethod.h"
#include "TMVA/MsgLogger.h"
#include "TMVA/PDF.h"
#include "TMVA/Ranking.h"
#include "TMVA/Tools.h"
#include "TMVA/Types.h"
#include "TMVA/Timer.h"
#include "TMVA/Results.h"

#include "TMatrix.h"

using namespace TMVA;

namespace TMVA {
namespace Internal {
// RAII helper: acquire the Python GIL on construction and release it on destruction.
class PyGILRAII {
   PyGILState_STATE m_GILState;

public:
   PyGILRAII() : m_GILState(PyGILState_Ensure()) {}
   ~PyGILRAII() { PyGILState_Release(m_GILState); }
};
} // namespace Internal
} // namespace TMVA

REGISTER_METHOD(PyAdaBoost)

ClassImp(MethodPyAdaBoost);
//_______________________________________________________________________
MethodPyAdaBoost::MethodPyAdaBoost(const TString &jobName,
                                   const TString &methodTitle,
                                   DataSetInfo &dsi,
                                   const TString &theOption) :
   PyMethodBase(jobName, Types::kPyAdaBoost, methodTitle, dsi, theOption),
   fBaseEstimator("None"),
   fNestimators(50),
   fLearningRate(1.0),
   fAlgorithm("SAMME"),
   fRandomState("None")
{
}

//_______________________________________________________________________
MethodPyAdaBoost::MethodPyAdaBoost(DataSetInfo &theData,
                                   const TString &theWeightFile) :
   PyMethodBase(Types::kPyAdaBoost, theData, theWeightFile),
   fBaseEstimator("None"),
   fNestimators(50),
   fLearningRate(1.0),
   fAlgorithm("SAMME"),
   fRandomState("None")
{
}

//_______________________________________________________________________
MethodPyAdaBoost::~MethodPyAdaBoost(void)
{
}

//_______________________________________________________________________
Bool_t MethodPyAdaBoost::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/)
{
   // Binary classification and multiclass classification are supported
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   if (type == Types::kMulticlass && numberClasses >= 2) return kTRUE;
   return kFALSE;
}

//_______________________________________________________________________
void MethodPyAdaBoost::DeclareOptions()
{
   MethodBase::DeclareCompatibilityOptions();

   DeclareOptionRef(fBaseEstimator, "BaseEstimator", "object, optional (default=DecisionTreeClassifier)\
      The base estimator from which the boosted ensemble is built.\
      Support for sample weighting is required, as well as proper `classes_`\
      and `n_classes_` attributes.");

   DeclareOptionRef(fNestimators, "NEstimators", "integer, optional (default=50)\
      The maximum number of estimators at which boosting is terminated.\
      In case of perfect fit, the learning procedure is stopped early.");

   DeclareOptionRef(fLearningRate, "LearningRate", "float, optional (default=1.)\
      Learning rate shrinks the contribution of each classifier by\
      ``learning_rate``. There is a trade-off between ``learning_rate`` and\
      ``n_estimators``.");

   DeclareOptionRef(fAlgorithm, "Algorithm", "{'SAMME'}, optional (default='SAMME')\
      If 'SAMME.R' then use the SAMME.R real boosting algorithm.\
      ``base_estimator`` must support calculation of class probabilities.\
      If 'SAMME' then use the SAMME discrete boosting algorithm.\
      The SAMME.R algorithm typically converges faster than SAMME,\
      achieving a lower test error with fewer boosting iterations.\
      'SAMME.R' is deprecated since scikit-learn 1.4 and removed in 1.6.");

   DeclareOptionRef(fRandomState, "RandomState", "int, RandomState instance or None, optional (default=None)\
      If int, random_state is the seed used by the random number generator;\
      If RandomState instance, random_state is the random number generator;\
      If None, the random number generator is the RandomState instance used\
      by `np.random`.");

   DeclareOptionRef(fFilenameClassifier, "FilenameClassifier",
                    "Store trained classifier in this file");
}
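
// Illustration (not part of the original source): the options declared above are set
// through the usual TMVA option string when the method is booked from user code. A
// minimal, hypothetical example (the `factory` and `dataloader` objects are assumed
// to exist):
//
//    factory->BookMethod(dataloader, TMVA::Types::kPyAdaBoost, "PyAdaBoost",
//                        "!V:NEstimators=100:LearningRate=0.5:Algorithm=SAMME:RandomState=42");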

//_______________________________________________________________________
// Check options and load them to local python namespace
void MethodPyAdaBoost::ProcessOptions()
{
   // Evaluate the option strings as Python objects and make them available in the
   // local Python namespace used by PyRunString
   pBaseEstimator = Eval(fBaseEstimator);
   if (!pBaseEstimator) {
      Log() << kFATAL << Form("BaseEstimator = %s ... that does not work!", fBaseEstimator.Data())
            << " The options are Object or None." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "baseEstimator", pBaseEstimator);

   if (fNestimators <= 0) {
      Log() << kFATAL << "NEstimators <=0 ... that does not work!" << Endl;
   }
   pNestimators = Eval(Form("%i", fNestimators));
   PyDict_SetItemString(fLocalNS, "nEstimators", pNestimators);

   if (fLearningRate <= 0) {
      Log() << kFATAL << "LearningRate <=0 ... that does not work!" << Endl;
   }
   pLearningRate = Eval(Form("%f", fLearningRate));
   PyDict_SetItemString(fLocalNS, "learningRate", pLearningRate);

   if (fAlgorithm != "SAMME") {
      if (fAlgorithm != "SAMME.R")
         Log() << kFATAL << Form("Algorithm = %s ... that does not work!", fAlgorithm.Data())
               << " The only option is SAMME." << Endl;
      else
         Log() << kWARNING << Form("Algorithm = %s is deprecated for scikit-learn versions > 1.5 - use SAMME", fAlgorithm.Data()) << Endl;
   }

   pRandomState = Eval(fRandomState);
   if (!pRandomState) {
      Log() << kFATAL << Form("RandomState = %s ... that does not work!", fRandomState.Data())
            << " If int, random_state is the seed used by the random number generator;"
            << " If RandomState instance, random_state is the random number generator;"
            << " If None, the random number generator is the RandomState instance used by `np.random`." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "randomState", pRandomState);

   // If no filename is given, set default
   if (fFilenameClassifier.IsNull()) {
      fFilenameClassifier = GetWeightFileDir() + "/PyAdaBoostModel_" + GetName() + ".PyData";
   }
}

//_______________________________________________________________________
void MethodPyAdaBoost::Init()
{
   TMVA::Internal::PyGILRAII raii;
   _import_array(); // required to use numpy arrays

   // Check options and load them to local python namespace
   ProcessOptions();

   // Import module for ada boost classifier
   PyRunString("import sklearn.ensemble");

   // Get data properties
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
}

//_______________________________________________________________________
void MethodPyAdaBoost::Train()
{
   // Load training data (data, classes, weights) to python arrays
   int fNrowsTraining = Data()->GetNTrainingEvents(); // every row is an event, a class type and a weight
   npy_intp dimsData[2];
   dimsData[0] = fNrowsTraining;
   dimsData[1] = fNvars;
   fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainData", (PyObject *)fTrainData);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));

   npy_intp dimsClasses = (npy_intp)fNrowsTraining;
   fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataClasses", (PyObject *)fTrainDataClasses);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataWeights", (PyObject *)fTrainDataWeights);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

   for (int i = 0; i < fNrowsTraining; i++) {
      // Fill training data matrix
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }

      // Fill target classes
      TrainDataClasses[i] = e->GetClass();

      // Get event weight
      TrainDataWeights[i] = e->GetWeight();
   }

   // Create classifier object
   PyRunString("classifier = sklearn.ensemble.AdaBoostClassifier(estimator=baseEstimator, n_estimators=nEstimators, learning_rate=learningRate, random_state=randomState)",
               "Failed to setup classifier");

   // Fit classifier
   // NOTE: We dump the output to a variable so that the call does not pollute stdout
   PyRunString("dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)", "Failed to train classifier");

   // Store classifier
   fClassifier = PyDict_GetItemString(fLocalNS, "classifier");
   if (fClassifier == 0) {
      Log() << kFATAL << "Can't create classifier object from AdaBoostClassifier" << Endl;
      Log() << Endl;
   }

   if (IsModelPersistence()) {
      Log() << Endl;
      Log() << gTools().Color("bold") << "Saving state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
      Log() << Endl;
      Serialize(fFilenameClassifier, fClassifier);
   }
}
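
// For reference, the Python executed through the PyRunString calls above amounts to the
// following sketch; the trainData/trainDataClasses/trainDataWeights arrays and the option
// bindings are placed into the interpreter namespace by Train() and ProcessOptions():
//
//    import sklearn.ensemble
//    classifier = sklearn.ensemble.AdaBoostClassifier(estimator=baseEstimator,
//                                                     n_estimators=nEstimators,
//                                                     learning_rate=learningRate,
//                                                     random_state=randomState)
//    dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)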

//_______________________________________________________________________
void MethodPyAdaBoost::TestClassification()
{
   MethodBase::TestClassification();
}

//_______________________________________________________________________
std::vector<Double_t> MethodPyAdaBoost::GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t /* logProgress */)
{
   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Determine number of events
   Long64_t nEvents = Data()->GetNEvents();
   if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
   if (firstEvt < 0) firstEvt = 0;
   nEvents = lastEvt - firstEvt;

   // Get data
   npy_intp dims[2];
   dims[0] = nEvents;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));

   for (Int_t ievt = 0; ievt < nEvents; ievt++) {
      Data()->SetCurrentEvent(ievt);
      const TMVA::Event *e = Data()->GetEvent();
      for (UInt_t i = 0; i < fNvars; i++) {
         pValue[ievt * fNvars + i] = e->GetValue(i);
      }
   }

   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // Return signal probabilities
   if (Long64_t(mvaValues.size()) != nEvents) mvaValues.resize(nEvents);
   for (int i = 0; i < nEvents; ++i) {
      mvaValues[i] = proba[fNoutputs * i + TMVA::Types::kSignal];
   }

   Py_DECREF(pEvent);
   Py_DECREF(result);

   // if (logProgress) {
   //    Log() << kINFO
   //          << "Elapsed time for evaluation of " << nEvents << " events: "
   //          << timer.GetElapsedTime() << " " << Endl;
   // }

   return mvaValues;
}
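
// Note on the predict_proba output used above and below: scikit-learn returns an
// (nEvents x nClasses) array of doubles, stored row by row in the numpy buffer, so the
// probability of class c for event i sits at proba[fNoutputs * i + c]. Since
// TMVA::Types::kSignal is 0, the signal column is picked for each event. A tiny sketch
// of the indexing for two classes (hypothetical values):
//
//    // proba       = { p0_sig, p0_bkg,   p1_sig, p1_bkg, ... }
//    // mvaValues[0] = proba[2*0 + 0];    mvaValues[1] = proba[2*1 + 0];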

//_______________________________________________________________________
Double_t MethodPyAdaBoost::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // cannot determine error
   NoErrorCalc(errLower, errUpper);

   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Get current event and load to python array
   const TMVA::Event *e = Data()->GetEvent();
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);

   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // Return MVA value
   Double_t mvaValue;
   mvaValue = proba[TMVA::Types::kSignal]; // getting signal probability

   Py_DECREF(result);
   Py_DECREF(pEvent);

   return mvaValue;
}

//_______________________________________________________________________
std::vector<Float_t> &MethodPyAdaBoost::GetMulticlassValues()
{
   // Load model if not already done
   if (fClassifier == 0) ReadModelFromFile();

   // Get current event and load to python array
   const TMVA::Event *e = Data()->GetEvent();
   npy_intp dims[2];
   dims[0] = 1;
   dims[1] = fNvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);

   // Get prediction from classifier
   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // Return MVA values
   if (UInt_t(classValues.size()) != fNoutputs) classValues.resize(fNoutputs);
   for (UInt_t i = 0; i < fNoutputs; i++) classValues[i] = proba[i];

   return classValues;
}

//_______________________________________________________________________
void MethodPyAdaBoost::ReadModelFromFile()
{
   if (!PyIsInitialized()) {
      PyInitialize();
   }

   Log() << Endl;
   Log() << gTools().Color("bold") << "Loading state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
   Log() << Endl;

   // Load classifier from file
   Int_t err = UnSerialize(fFilenameClassifier, &fClassifier);
   if (err != 0) {
      Log() << kFATAL << Form("Failed to load classifier from file (error code: %i): %s", err, fFilenameClassifier.Data()) << Endl;
   }

   // Book classifier object in python dict
   PyDict_SetItemString(fLocalNS, "classifier", fClassifier);

   // Load data properties
   // NOTE: This has to be repeated here for the reader application
   fNvars = GetNVariables();
   fNoutputs = DataInfo().GetNClasses();
}

//_______________________________________________________________________
const Ranking *MethodPyAdaBoost::CreateRanking()
{
   // Get feature importance from classifier as an array with length equal
   // to the number of variables; a higher value signals a higher importance
   PyObject *pRanking = PyObject_GetAttrString(fClassifier, "feature_importances_");
   // The python object is null if the base estimator does not support
   // variable ranking. Then, return NULL, which disables ranking.
   if (pRanking == 0) return NULL;

   // Fill ranking object and return it
   fRanking = new Ranking(GetName(), "Variable Importance");
   Double_t *rankingData = (Double_t *)(PyArray_DATA((PyArrayObject *)pRanking));
   for (UInt_t iVar = 0; iVar < fNvars; iVar++) {
      fRanking->AddRank(Rank(GetInputLabel(iVar), rankingData[iVar]));
   }

   Py_DECREF(pRanking);

   return fRanking;
}
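
// Note: the ranking above relies on the scikit-learn attribute `feature_importances_`,
// which AdaBoostClassifier exposes as a numpy array with one entry per input variable
// (it may be unavailable if the base estimator does not provide importances). A rough
// Python equivalent of what gets filled into the Ranking object (variableNames is a
// hypothetical list of the input variable names):
//
//    for name, importance in zip(variableNames, classifier.feature_importances_):
//        print(name, importance)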

//_______________________________________________________________________
void MethodPyAdaBoost::GetHelpMessage() const
{
   // typical length of text line:
   //         "|--------------------------------------------------------------|"
   Log() << "An AdaBoost classifier is a meta-estimator that begins by fitting" << Endl;
   Log() << "a classifier on the original dataset and then fits additional copies" << Endl;
   Log() << "of the classifier on the same dataset but where the weights of incorrectly" << Endl;
   Log() << "classified instances are adjusted such that subsequent classifiers focus" << Endl;
   Log() << "more on difficult cases." << Endl;
   Log() << Endl;
   Log() << "Check out the scikit-learn documentation for more information." << Endl;
}
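
// ---------------------------------------------------------------------------------------
// Usage illustration (not part of the original file): a minimal sketch of how this method
// could be booked and trained from a ROOT macro. It assumes a working PyMVA setup
// (Python, numpy, scikit-learn) and hypothetical inputs (outputFile, signalTree,
// backgroundTree, variable names).
//
//    TMVA::Tools::Instance();
//    TMVA::PyMethodBase::PyInitialize();
//
//    auto *factory    = new TMVA::Factory("TMVAClassification", outputFile,
//                                         "!V:AnalysisType=Classification");
//    auto *dataloader = new TMVA::DataLoader("dataset");
//    dataloader->AddVariable("var1", 'F');
//    dataloader->AddVariable("var2", 'F');
//    dataloader->AddSignalTree(signalTree, 1.0);
//    dataloader->AddBackgroundTree(backgroundTree, 1.0);
//    dataloader->PrepareTrainingAndTestTree("", "SplitMode=Random:NormMode=NumEvents");
//
//    factory->BookMethod(dataloader, TMVA::Types::kPyAdaBoost, "PyAdaBoost",
//                        "NEstimators=100:LearningRate=0.5");
//
//    factory->TrainAllMethods();
//    factory->TestAllMethods();
//    factory->EvaluateAllMethods();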