Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
MethodPyGTB.cxx
Go to the documentation of this file.
1// @(#)root/tmva/pymva $Id$
2// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015
3
4/**********************************************************************************
5 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis *
6 * Package: TMVA *
7 * Class : MethodPyGTB *
8 * Web : http://oproject.org *
9 * *
10 * Description: *
11 * GradientBoostingClassifier Classifiear from Scikit learn *
12 * *
13 * *
14 * Redistribution and use in source and binary forms, with or without *
15 * modification, are permitted according to the terms listed in LICENSE *
16 * (see tmva/doc/LICENSE) *
17 * *
18 **********************************************************************************/
19
20#include <Python.h> // Needs to be included first to avoid redefinition of _POSIX_C_SOURCE
21#include "TMVA/MethodPyGTB.h"
22
23#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
24#include <numpy/arrayobject.h>
25
26#include "TMVA/Configurable.h"
28#include "TMVA/Config.h"
29#include "TMVA/DataSet.h"
30#include "TMVA/Event.h"
31#include "TMVA/IMethod.h"
32#include "TMVA/MsgLogger.h"
33#include "TMVA/PDF.h"
34#include "TMVA/Ranking.h"
35#include "TMVA/Results.h"
37#include "TMVA/Tools.h"
38#include "TMVA/Types.h"
39#include "TMVA/Timer.h"
41
42#include "TMatrix.h"
43
44using namespace TMVA;
45
46namespace TMVA {
47namespace Internal {
48class PyGILRAII {
49 PyGILState_STATE m_GILState;
50
51public:
54};
55} // namespace Internal
56} // namespace TMVA
57
59
60
61//_______________________________________________________________________
63 const TString &methodTitle,
66 PyMethodBase(jobName, Types::kPyGTB, methodTitle, dsi, theOption),
67 fLoss("log_loss"),
68 fLearningRate(0.1),
69 fNestimators(100),
70 fSubsample(1.0),
71 fMinSamplesSplit(2),
72 fMinSamplesLeaf(1),
73 fMinWeightFractionLeaf(0.0),
74 fMaxDepth(3),
75 fInit("None"),
76 fRandomState("None"),
77 fMaxFeatures("None"),
78 fVerbose(0),
79 fMaxLeafNodes("None"),
80 fWarmStart(kFALSE)
81{
82}
83
84//_______________________________________________________________________
87 fLoss("log_loss"),
88 fLearningRate(0.1),
89 fNestimators(100),
90 fSubsample(1.0),
91 fMinSamplesSplit(2),
92 fMinSamplesLeaf(1),
93 fMinWeightFractionLeaf(0.0),
94 fMaxDepth(3),
95 fInit("None"),
96 fRandomState("None"),
97 fMaxFeatures("None"),
98 fVerbose(0),
99 fMaxLeafNodes("None"),
100 fWarmStart(kFALSE)
101{
102}
103
104
105//_______________________________________________________________________
109
110//_______________________________________________________________________
117
118
119//_______________________________________________________________________
121{
123
124 DeclareOptionRef(fLoss, "Loss", "{'log_loss', 'exponential'}, optional (default='log_loss')\
125 loss function to be optimized. 'log_loss' refers to\
126 logistic loss for classification\
127 with probabilistic outputs. For loss 'exponential' gradient\
128 boosting recovers the AdaBoost algorithm.");
129
130 DeclareOptionRef(fLearningRate, "LearningRate", "float, optional (default=0.1)\
131 learning rate shrinks the contribution of each tree by `learning_rate`.\
132 There is a trade-off between learning_rate and n_estimators.");
133
134 DeclareOptionRef(fNestimators, "NEstimators", "int (default=100)\
135 The number of boosting stages to perform. Gradient boosting\
136 is fairly robust to over-fitting so a large number usually\
137 results in better performance.");
138
139 DeclareOptionRef(fSubsample, "Subsample", "float, optional (default=1.0)\
140 The fraction of samples to be used for fitting the individual base\
141 learners. If smaller than 1.0 this results in Stochastic Gradient\
142 Boosting. `subsample` interacts with the parameter `n_estimators`.\
143 Choosing `subsample < 1.0` leads to a reduction of variance\
144 and an increase in bias.");
145
146 DeclareOptionRef(fMinSamplesSplit, "MinSamplesSplit", "integer, optional (default=2)\
147 The minimum number of samples required to split an internal node.");
148
149 DeclareOptionRef(fMinSamplesLeaf, "MinSamplesLeaf", "integer, optional (default=1) \
150 The minimum number of samples in newly created leaves. A split is \
151 discarded if after the split, one of the leaves would contain less then \
152 ``min_samples_leaf`` samples.");
153
154 DeclareOptionRef(fMinWeightFractionLeaf, "MinWeightFractionLeaf", "//float, optional (default=0.) \
155 The minimum weighted fraction of the input samples required to be at a \
156 leaf node.");
157
158 DeclareOptionRef(fMaxDepth, "MaxDepth", "integer or None, optional (default=None) \
159 The maximum depth of the tree. If None, then nodes are expanded until \
160 all leaves are pure or until all leaves contain less than \
161 min_samples_split samples. \
162 Ignored if ``max_leaf_nodes`` is not None.");
163
164 DeclareOptionRef(fInit, "Init", "BaseEstimator, None, optional (default=None)\
165 An estimator object that is used to compute the initial\
166 predictions. ``init`` has to provide ``fit`` and ``predict``.\
167 If None it uses ``loss.init_estimator`");
168
169 DeclareOptionRef(fRandomState, "RandomState", "int, RandomState instance or None, optional (default=None)\
170 If int, random_state is the seed used by the random number generator;\
171 If RandomState instance, random_state is the random number generator;\
172 If None, the random number generator is the RandomState instance used\
173 by `np.random`.");
174
175 DeclareOptionRef(fMaxFeatures, "MaxFeatures", "The number of features to consider when looking for the best split");
176
177 DeclareOptionRef(fVerbose, "Verbose", "int, optional (default=0)\
178 Controls the verbosity of the tree building process.");
179
180 DeclareOptionRef(fMaxLeafNodes, "MaxLeafNodes", "int or None, optional (default=None)\
181 Grow trees with ``max_leaf_nodes`` in best-first fashion.\
182 Best nodes are defined as relative reduction in impurity.\
183 If None then unlimited number of leaf nodes.\
184 If not None then ``max_depth`` will be ignored.");
185
186 DeclareOptionRef(fWarmStart, "WarmStart", "bool, optional (default=False)\
187 When set to ``True``, reuse the solution of the previous call to fit\
188 and add more estimators to the ensemble, otherwise, just fit a whole\
189 new forest.");
190
191 DeclareOptionRef(fFilenameClassifier, "FilenameClassifier",
192 "Store trained classifier in this file");
193}
194
195//_______________________________________________________________________
196// Check options and load them to local python namespace
198{
199 if (fLoss != "log_loss" && fLoss != "exponential") {
200 Log() << kFATAL << Form("Loss = %s ... that does not work!", fLoss.Data())
201 << " The options are 'log_loss' or 'exponential'." << Endl;
202 }
203 pLoss = Eval(Form("'%s'", fLoss.Data()));
205
206 if (fLearningRate <= 0) {
207 Log() << kFATAL << "LearningRate <= 0 ... that does not work!" << Endl;
208 }
211
212 if (fNestimators <= 0) {
213 Log() << kFATAL << "NEstimators <= 0 ... that does not work!" << Endl;
214 }
217
218 if (fMinSamplesSplit < 0) {
219 Log() << kFATAL << "MinSamplesSplit < 0 ... that does not work!" << Endl;
220 }
223
224 if (fSubsample < 0) {
225 Log() << kFATAL << "Subsample < 0 ... that does not work!" << Endl;
226 }
227 pSubsample = Eval(Form("%f", fSubsample));
229
230 if (fMinSamplesLeaf < 0) {
231 Log() << kFATAL << "MinSamplesLeaf < 0 ... that does not work!" << Endl;
232 }
235
236 if (fMinSamplesSplit < 0) {
237 Log() << kFATAL << "MinSamplesSplit < 0 ... that does not work!" << Endl;
238 }
241
242 if (fMinWeightFractionLeaf < 0) {
243 Log() << kFATAL << "MinWeightFractionLeaf < 0 ... that does not work !" << Endl;
244 }
246 PyDict_SetItemString(fLocalNS, "minWeightFractionLeaf", pMinWeightFractionLeaf);
247
248 if (fMaxDepth <= 0) {
249 Log() << kFATAL << " MaxDepth <= 0 ... that does not work !! " << Endl;
250 }
251 pMaxDepth = Eval(Form("%i", fMaxDepth));
253
254 pInit = Eval(fInit);
255 if (!pInit) {
256 Log() << kFATAL << Form("Init = %s ... that does not work!", fInit.Data())
257 << " The options are None or BaseEstimator, which is an estimator object that"
258 << "is used to compute the initial predictions. "
259 << "'init' has to provide 'fit' and 'predict' methods."
260 << " If None it uses 'loss.init_estimator'." << Endl;
261 }
263
265 if (!pRandomState) {
266 Log() << kFATAL << Form(" RandomState = %s ... that does not work! ", fRandomState.Data())
267 << " If int, random_state is the seed used by the random number generator;"
268 << " If RandomState instance, random_state is the random number generator;"
269 << " If None, the random number generator is the RandomState instance used by 'np.random'."
270 << Endl;
271 }
273
274 if (fMaxFeatures == "auto" || fMaxFeatures == "sqrt" || fMaxFeatures == "log2"){
275 fMaxFeatures = Form("'%s'", fMaxFeatures.Data());
276 }
279
280 if (!pMaxFeatures) {
281 Log() << kFATAL << Form(" MaxFeatures = %s... that does not work !! ", fMaxFeatures.Data())
282 << "int, float, string or None, optional (default='auto')"
283 << "The number of features to consider when looking for the best split:"
284 << "If int, then consider `max_features` features at each split."
285 << "If float, then `max_features` is a percentage and"
286 << "`int(max_features * n_features)` features are considered at each split."
287 << "If 'auto', then `max_features=sqrt(n_features)`."
288 << "If 'sqrt', then `max_features=sqrt(n_features)`."
289 << "If 'log2', then `max_features=log2(n_features)`."
290 << "If None, then `max_features=n_features`." << Endl;
291 }
292
294 if (!pMaxLeafNodes) {
295 Log() << kFATAL << Form(" MaxLeafNodes = %s... that does not work!", fMaxLeafNodes.Data())
296 << " The options are None or integer." << Endl;
297 }
299
300 pVerbose = Eval(Form("%i", fVerbose));
302
303 pWarmStart = (fWarmStart) ? Eval("True") : Eval("False");
305
306 // If no filename is given, set default
308 fFilenameClassifier = GetWeightFileDir() + "/PyGTBModel_" + GetName() + ".PyData";
309 }
310}
311
312//_______________________________________________________________________
314{
316 _import_array(); //require to use numpy arrays
317
318 // Check options and load them to local python namespace
320
321 // Import module for gradient tree boosting classifier
322 PyRunString("import sklearn.ensemble");
323
324 // Get data properties
327}
328
330{
331 // Load training data (data, classes, weights) to python arrays
332 int fNrowsTraining = Data()->GetNTrainingEvents(); //every row is an event, a class type and a weight
335 dimsData[1] = fNvars;
338 float *TrainData = (float *)(PyArray_DATA(fTrainData));
339
344
348
349 for (int i = 0; i < fNrowsTraining; i++) {
350 // Fill training data matrix
351 const TMVA::Event *e = Data()->GetTrainingEvent(i);
352 for (UInt_t j = 0; j < fNvars; j++) {
353 TrainData[j + i * fNvars] = e->GetValue(j);
354 }
355
356 // Fill target classes
357 TrainDataClasses[i] = e->GetClass();
358
359 // Get event weight
360 TrainDataWeights[i] = e->GetWeight();
361 }
362
363 // Create classifier object
364 PyRunString("classifier = sklearn.ensemble.GradientBoostingClassifier(loss=loss, learning_rate=learningRate, n_estimators=nEstimators, max_depth=maxDepth, min_samples_split=minSamplesSplit, min_samples_leaf=minSamplesLeaf, min_weight_fraction_leaf=minWeightFractionLeaf, subsample=subsample, max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, init=init, verbose=verbose, warm_start=warmStart, random_state=randomState)",
365 "Failed to setup classifier");
366
367 // Fit classifier
368 // NOTE: We dump the output to a variable so that the call does not pollute stdout
369 PyRunString("dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)", "Failed to train classifier");
370
371 // Store classifier
373 if(fClassifier == 0) {
374 Log() << kFATAL << "Can't create classifier object from GradientBoostingClassifier" << Endl;
375 Log() << Endl;
376 }
377
378 if (IsModelPersistence()) {
379 Log() << Endl;
380 Log() << gTools().Color("bold") << "Saving state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
381 Log() << Endl;
383 }
384}
385
386//_______________________________________________________________________
391
392//_______________________________________________________________________
393std::vector<Double_t> MethodPyGTB::GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t /* logProgress */)
394{
395 // Load model if not already done
396 if (fClassifier == 0) ReadModelFromFile();
397
398 // Determine number of events
399 Long64_t nEvents = Data()->GetNEvents();
400 if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
401 if (firstEvt < 0) firstEvt = 0;
402 nEvents = lastEvt-firstEvt;
403
404
405 // Get data
406 npy_intp dims[2];
407 dims[0] = nEvents;
408 dims[1] = fNvars;
410 float *pValue = (float *)(PyArray_DATA(pEvent));
411
412 for (Int_t ievt=0; ievt<nEvents; ievt++) {
414 const TMVA::Event *e = Data()->GetEvent();
415 for (UInt_t i = 0; i < fNvars; i++) {
416 pValue[ievt * fNvars + i] = e->GetValue(i);
417 }
418 }
419
420 // Get prediction from classifier
421 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
422 double *proba = (double *)(PyArray_DATA(result));
423
424 // Return signal probabilities
425 if(Long64_t(mvaValues.size()) != nEvents) mvaValues.resize(nEvents);
426 for (int i = 0; i < nEvents; ++i) {
428 }
429
432
433
434 return mvaValues;
435}
436
437//_______________________________________________________________________
439{
440 // cannot determine error
442
443 // Load model if not already done
444 if (fClassifier == 0) ReadModelFromFile();
445
446 // Get current event and load to python array
447 const TMVA::Event *e = Data()->GetEvent();
448 npy_intp dims[2];
449 dims[0] = 1;
450 dims[1] = fNvars;
452 float *pValue = (float *)(PyArray_DATA(pEvent));
453 for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);
454
455 // Get prediction from classifier
456 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
457 double *proba = (double *)(PyArray_DATA(result));
458
459 // Return MVA value
461 mvaValue = proba[TMVA::Types::kSignal]; // getting signal probability
462
465
466 return mvaValue;
467}
468
469//_______________________________________________________________________
470std::vector<Float_t>& MethodPyGTB::GetMulticlassValues()
471{
472 // Load model if not already done
473 if (fClassifier == 0) ReadModelFromFile();
474
475 // Get current event and load to python array
476 const TMVA::Event *e = Data()->GetEvent();
477 npy_intp dims[2];
478 dims[0] = 1;
479 dims[1] = fNvars;
481 float *pValue = (float *)(PyArray_DATA(pEvent));
482 for (UInt_t i = 0; i < fNvars; i++) pValue[i] = e->GetValue(i);
483
484 // Get prediction from classifier
485 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
486 double *proba = (double *)(PyArray_DATA(result));
487
488 // Return MVA values
489 if(UInt_t(classValues.size()) != fNoutputs) classValues.resize(fNoutputs);
490 for(UInt_t i = 0; i < fNoutputs; i++) classValues[i] = proba[i];
491
494
495 return classValues;
496}
497
498//_______________________________________________________________________
500{
501 if (!PyIsInitialized()) {
502 PyInitialize();
503 }
504
505 Log() << Endl;
506 Log() << gTools().Color("bold") << "Loading state file: " << gTools().Color("reset") << fFilenameClassifier << Endl;
507 Log() << Endl;
508
509 // Load classifier from file
511 if(err != 0)
512 {
513 Log() << kFATAL << Form("Failed to load classifier from file (error code: %i): %s", err, fFilenameClassifier.Data()) << Endl;
514 }
515
516 // Book classifier object in python dict
518
519 // Load data properties
520 // NOTE: This has to be repeated here for the reader application
523}
524
525//_______________________________________________________________________
527{
528 // Get feature importance from classifier as an array with length equal
529 // number of variables, higher value signals a higher importance
531 if(pRanking == 0) Log() << kFATAL << "Failed to get ranking from classifier" << Endl;
532
533 // Fill ranking object and return it
534 fRanking = new Ranking(GetName(), "Variable Importance");
536 for(UInt_t iVar=0; iVar<fNvars; iVar++){
538 }
539
541
542 return fRanking;
543}
544
545//_______________________________________________________________________
547{
548 // typical length of text line:
549 // "|--------------------------------------------------------------|"
550 Log() << "A gradient tree boosting classifier builds a model from an ensemble" << Endl;
551 Log() << "of decision trees, which are adapted each boosting step to fit better" << Endl;
552 Log() << "to previously misclassified events." << Endl;
553 Log() << Endl;
554 Log() << "Check out the scikit-learn documentation for more information." << Endl;
555}
556
557
#define REGISTER_METHOD(CLASS)
for example
_object PyObject
#define e(i)
Definition RSha256.hxx:103
unsigned int UInt_t
Unsigned integer 4 bytes (unsigned int)
Definition RtypesCore.h:60
constexpr Bool_t kFALSE
Definition RtypesCore.h:108
long long Long64_t
Portable signed long integer 8 bytes.
Definition RtypesCore.h:83
constexpr Bool_t kTRUE
Definition RtypesCore.h:107
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
char * Form(const char *fmt,...)
Formats a string in a circular formatting buffer.
Definition TString.cxx:2495
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
MsgLogger & Log() const
Class that contains all the data information.
Definition DataSetInfo.h:62
UInt_t GetNClasses() const
const Event * GetEvent() const
returns event without transformations
Definition DataSet.cxx:202
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
Definition DataSet.h:206
Long64_t GetNTrainingEvents() const
Definition DataSet.h:68
void SetCurrentEvent(Long64_t ievt) const
Definition DataSet.h:88
const Event * GetTrainingEvent(Long64_t ievt) const
Definition DataSet.h:74
const char * GetName() const override
Definition MethodBase.h:337
virtual void DeclareCompatibilityOptions()
options that are used ONLY for the READER to ensure backward compatibility they are hence without any...
Bool_t IsModelPersistence() const
Definition MethodBase.h:386
const TString & GetWeightFileDir() const
Definition MethodBase.h:495
DataSetInfo & DataInfo() const
Definition MethodBase.h:413
virtual void TestClassification()
initialization
UInt_t GetNVariables() const
Definition MethodBase.h:348
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
const TString & GetInputLabel(Int_t i) const
Definition MethodBase.h:353
Ranking * fRanking
Definition MethodBase.h:590
DataSet * Data() const
Definition MethodBase.h:412
Double_t fSubsample
Definition MethodPyGTB.h:99
void Init() override
void ProcessOptions() override
std::vector< Float_t > & GetMulticlassValues() override
PyObject * pMinSamplesLeaf
Double_t fMinWeightFractionLeaf
const Ranking * CreateRanking() override
std::vector< Double_t > mvaValues
Definition MethodPyGTB.h:73
PyObject * pMaxFeatures
PyObject * pMaxDepth
PyObject * pMaxLeafNodes
std::vector< Float_t > classValues
Definition MethodPyGTB.h:74
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets) override
Double_t fLearningRate
Definition MethodPyGTB.h:90
MethodPyGTB(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
Double_t GetMvaValue(Double_t *errLower=nullptr, Double_t *errUpper=nullptr) override
PyObject * pLearningRate
Definition MethodPyGTB.h:89
void Train() override
PyObject * pNestimators
Definition MethodPyGTB.h:94
void DeclareOptions() override
PyObject * pVerbose
std::vector< Double_t > GetMvaValues(Long64_t firstEvt=0, Long64_t lastEvt=-1, Bool_t logProgress=false) override
get all the MVA values for the events of the current Data type
void TestClassification() override
initialization
void GetHelpMessage() const override
TString fFilenameClassifier
Definition MethodPyGTB.h:78
PyObject * pMinSamplesSplit
PyObject * pLoss
Definition MethodPyGTB.h:82
PyObject * pSubsample
Definition MethodPyGTB.h:98
PyObject * pRandomState
PyObject * pWarmStart
PyObject * pMinWeightFractionLeaf
void ReadModelFromFile() override
Virtual base class for all TMVA method based on Python.
static int PyIsInitialized()
Check Python interpreter initialization status.
PyObject * Eval(TString code)
Evaluate Python code.
static void PyInitialize()
Initialize Python interpreter.
static void Serialize(TString file, PyObject *classifier)
Serialize Python object.
static Int_t UnSerialize(TString file, PyObject **obj)
Unserialize Python object.
PyObject * fClassifier
void PyRunString(TString code, TString errorMessage="Failed to run python code", int start=256)
Execute Python code from string.
Ranking for variables in method (implementation)
Definition Ranking.h:48
virtual void AddRank(const Rank &rank)
Add a new rank take ownership of it.
Definition Ranking.cxx:85
const TString & Color(const TString &)
human readable color strings
Definition Tools.cxx:803
Singleton class for Global types used by TMVA.
Definition Types.h:71
@ kSignal
Never change this number - it is elsewhere assumed to be zero !
Definition Types.h:135
@ kMulticlass
Definition Types.h:129
@ kClassification
Definition Types.h:127
Basic string class.
Definition TString.h:138
const char * Data() const
Definition TString.h:384
Bool_t IsNull() const
Definition TString.h:422
create variable transformations
Tools & gTools()
MsgLogger & Endl(MsgLogger &ml)
Definition MsgLogger.h:148