22#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
23#include <numpy/arrayobject.h>
56MethodPyRandomForest::MethodPyRandomForest(
const TString &jobName,
57 const TString &methodTitle,
59 const TString &theOption) :
66 fMinWeightFractionLeaf(0),
67 fMaxFeatures(
"'auto'"),
68 fMaxLeafNodes(
"None"),
87 fMinWeightFractionLeaf(0),
88 fMaxFeatures(
"'auto'"),
89 fMaxLeafNodes(
"None"),
121 The function to measure the quality of a split. Supported criteria are \
122 'gini' for the Gini impurity and 'entropy' for the information gain. \
123 Note: this parameter is tree-specific.");
126 The maximum depth of the tree. If None, then nodes are expanded until \
127 all leaves are pure or until all leaves contain less than \
128 min_samples_split samples. \
129 Ignored if ``max_leaf_nodes`` is not None.");
132 The minimum number of samples required to split an internal node.");
135 The minimum number of samples in newly created leaves. A split is \
136 discarded if after the split, one of the leaves would contain less then \
137 ``min_samples_leaf`` samples.");
139 The minimum weighted fraction of the input samples required to be at a \
144 Grow trees with ``max_leaf_nodes`` in best-first fashion.\
145 Best nodes are defined as relative reduction in impurity.\
146 If None then unlimited number of leaf nodes.\
147 If not None then ``max_depth`` will be ignored.");
150 Whether bootstrap samples are used when building trees.");
153 the generalization error.");
156 The number of jobs to run in parallel for both `fit` and `predict`. \
157 If -1, then the number of jobs is set to the number of cores.");
160 If int, random_state is the seed used by the random number generator;\
161 If RandomState instance, random_state is the random number generator;\
162 If None, the random number generator is the RandomState instance used\
166 Controls the verbosity of the tree building process.");
169 When set to ``True``, reuse the solution of the previous call to fit\
170 and add more estimators to the ensemble, otherwise, just fit a whole\
174 Weights associated with classes in the form ``{class_label: weight}``.\
175 If not given, all classes are supposed to have weight one. For\
176 multi-output problems, a list of dicts can be provided in the same\
177 order as the columns of y.\
178 The \"auto\" mode uses the values of y to automatically adjust\
179 weights inversely proportional to class frequencies in the input data.\
180 The \"subsample\" mode is the same as \"auto\" except that weights are\
181 computed based on the bootstrap sample for every tree grown.\
182 For multi-output, the weights of each column of y will be multiplied.\
183 Note that these weights will be multiplied with sample_weight (passed\
184 through the fit method) if sample_weight is specified.");
187 "Store trained classifier in this file");
195 Log() << kFATAL <<
" NEstimators <=0... that does not work !! " <<
Endl;
201 Log() << kFATAL <<
Form(
" Criterion = %s... that does not work !! ",
fCriterion.Data())
202 <<
" The options are `gini` or `entropy`." <<
Endl;
210 Log() << kFATAL <<
Form(
" MaxDepth = %s... that does not work !! ",
fMaxDepth.Data())
211 <<
" The options are None or integer." <<
Endl;
215 Log() << kFATAL <<
" MinSamplesSplit < 0... that does not work !! " <<
Endl;
221 Log() << kFATAL <<
" MinSamplesLeaf < 0... that does not work !! " <<
Endl;
227 Log() << kERROR <<
" MinWeightFractionLeaf < 0... that does not work !! " <<
Endl;
239 Log() << kFATAL <<
Form(
" MaxFeatures = %s... that does not work !! ",
fMaxFeatures.Data())
240 <<
"int, float, string or None, optional (default='auto')"
241 <<
"The number of features to consider when looking for the best split:"
242 <<
"If int, then consider `max_features` features at each split."
243 <<
"If float, then `max_features` is a percentage and"
244 <<
"`int(max_features * n_features)` features are considered at each split."
245 <<
"If 'auto', then `max_features=sqrt(n_features)`."
246 <<
"If 'sqrt', then `max_features=sqrt(n_features)`."
247 <<
"If 'log2', then `max_features=log2(n_features)`."
248 <<
"If None, then `max_features=n_features`." <<
Endl;
254 <<
" The options are None or integer." <<
Endl;
260 Log() << kFATAL <<
Form(
" RandomState = %s... that does not work !! ",
fRandomState.Data())
261 <<
"If int, random_state is the seed used by the random number generator;"
262 <<
"If RandomState instance, random_state is the random number generator;"
263 <<
"If None, the random number generator is the RandomState instance used by `np.random`." <<
Endl;
269 Log() << kFATAL <<
Form(
" ClassWeight = %s... that does not work !! ",
fClassWeight.Data())
270 <<
"dict, list of dicts, 'auto', 'subsample' or None, optional" <<
Endl;
275 Log() << kFATAL <<
Form(
" NJobs = %i... that does not work !! ",
fNjobs)
276 <<
"Value has to be greater than zero." <<
Endl;
318 npy_intp dimsData[2];
319 dimsData[0] = fNrowsTraining;
321 fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
323 float *TrainData = (
float *)(PyArray_DATA(
fTrainData));
325 npy_intp dimsClasses = (npy_intp) fNrowsTraining;
326 fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
330 fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
334 for (
int i = 0; i < fNrowsTraining; i++) {
338 TrainData[j + i *
fNvars] =
e->GetValue(j);
342 TrainDataClasses[i] =
e->GetClass();
345 TrainDataWeights[i] =
e->GetWeight();
349 PyRunString(
"classifier = sklearn.ensemble.RandomForestClassifier(bootstrap=bootstrap, class_weight=classWeight, criterion=criterion, max_depth=maxDepth, max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, min_samples_leaf=minSamplesLeaf, min_samples_split=minSamplesSplit, min_weight_fraction_leaf=minWeightFractionLeaf, n_estimators=nEstimators, n_jobs=nJobs, oob_score=oobScore, random_state=randomState, verbose=verbose, warm_start=warmStart)",
350 "Failed to setup classifier");
354 PyRunString(
"dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)",
"Failed to train classifier");
359 Log() << kFATAL <<
"Can't create classifier object from RandomForestClassifier" <<
Endl;
385 if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
386 if (firstEvt < 0) firstEvt = 0;
387 nEvents = lastEvt-firstEvt;
396 <<
" sample (" << nEvents <<
" events)" <<
Endl;
402 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
403 float *pValue = (
float *)(PyArray_DATA(pEvent));
405 for (
Int_t ievt=0; ievt<nEvents; ievt++) {
409 pValue[ievt *
fNvars + i] =
e->GetValue(i);
414 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
fClassifier,
const_cast<char *
>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
415 double *proba = (
double *)(PyArray_DATA(result));
419 for (
int i = 0; i < nEvents; ++i) {
428 <<
"Elapsed time for evaluation of " << nEvents <<
" events: "
449 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
450 float *pValue = (
float *)(PyArray_DATA(pEvent));
451 for (
UInt_t i = 0; i <
fNvars; i++) pValue[i] =
e->GetValue(i);
454 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
fClassifier,
const_cast<char *
>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
455 double *proba = (
double *)(PyArray_DATA(result));
478 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
479 float *pValue = (
float *)(PyArray_DATA(pEvent));
480 for (
UInt_t i = 0; i <
fNvars; i++) pValue[i] =
e->GetValue(i);
483 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
fClassifier,
const_cast<char *
>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
484 double *proba = (
double *)(PyArray_DATA(result));
528 PyArrayObject* pRanking = (PyArrayObject*) PyObject_GetAttrString(
fClassifier,
"feature_importances_");
529 if(pRanking == 0)
Log() << kFATAL <<
"Failed to get ranking from classifier" <<
Endl;
548 Log() <<
"A random forest is a meta estimator that fits a number of decision" <<
Endl;
549 Log() <<
"tree classifiers on various sub-samples of the dataset and use" <<
Endl;
550 Log() <<
"averaging to improve the predictive accuracy and control over-fitting." <<
Endl;
552 Log() <<
"Check out the scikit-learn documentation for more information." <<
Endl;
#define REGISTER_METHOD(CLASS)
for example
char * Form(const char *fmt,...)
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
Class that contains all the data information.
UInt_t GetNClasses() const
const Event * GetEvent() const
Types::ETreeType GetCurrentType() const
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
Long64_t GetNTrainingEvents() const
void SetCurrentEvent(Long64_t ievt) const
const Event * GetTrainingEvent(Long64_t ievt) const
virtual void DeclareCompatibilityOptions()
Options that are used ONLY for the READER to ensure backward compatibility; they are hence without any...
const char * GetName() const
const TString & GetWeightFileDir() const
const TString & GetMethodName() const
DataSetInfo & DataInfo() const
virtual void TestClassification()
initialization
UInt_t GetNVariables() const
Bool_t IsModelPersistence()
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
const TString & GetInputLabel(Int_t i) const
PyObject * pMinWeightFractionLeaf
MethodPyRandomForest(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
std::vector< Float_t > & GetMulticlassValues()
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
std::vector< Double_t > GetMvaValues(Long64_t firstEvt=0, Long64_t lastEvt=-1, Bool_t logProgress=false)
Get all the MVA values for the events of the current data type.
~MethodPyRandomForest(void)
std::vector< Float_t > classValues
PyObject * pMinSamplesLeaf
TString fFilenameClassifier
void GetHelpMessage() const
std::vector< Double_t > mvaValues
virtual void TestClassification()
initialization
const Ranking * CreateRanking()
Double_t fMinWeightFractionLeaf
PyObject * pMinSamplesSplit
Double_t GetMvaValue(Double_t *errLower=0, Double_t *errUpper=0)
static int PyIsInitialized()
Check Python interpreter initialization status.
PyArrayObject * fTrainData
PyObject * Eval(TString code)
Evaluate Python code.
static void PyInitialize()
Initialize Python interpreter.
static void Serialize(TString file, PyObject *classifier)
Serialize Python object.
PyArrayObject * fTrainDataWeights
static Int_t UnSerialize(TString file, PyObject **obj)
Unserialize Python object.
PyArrayObject * fTrainDataClasses
void PyRunString(TString code, TString errorMessage="Failed to run python code", int start=Py_single_input)
Execute Python code from string.
Ranking for variables in method (implementation)
virtual void AddRank(const Rank &rank)
Add a new rank take ownership of it.
Timing information for training and evaluation of MVA methods.
TString GetElapsedTime(Bool_t Scientific=kTRUE)
returns pretty string with elapsed time
Singleton class for Global types used by TMVA.
Abstract ClassifierFactory template that handles arbitrary types.
MsgLogger & Endl(MsgLogger &ml)