22#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
23#include <numpy/arrayobject.h>
71 fMinWeightFractionLeaf(0),
72 fMaxFeatures(
"'sqrt'"),
73 fMaxLeafNodes(
"None"),
92 fMinWeightFractionLeaf(0),
93 fMaxFeatures(
"'sqrt'"),
94 fMaxLeafNodes(
"None"),
126 The function to measure the quality of a split. Supported criteria are \
127 'gini' for the Gini impurity and 'entropy' for the information gain. \
128 Note: this parameter is tree-specific.");
131 The maximum depth of the tree. If None, then nodes are expanded until \
132 all leaves are pure or until all leaves contain less than \
133 min_samples_split samples. \
134 Ignored if ``max_leaf_nodes`` is not None.");
137 The minimum number of samples required to split an internal node.");
140 The minimum number of samples in newly created leaves. A split is \
141 discarded if after the split, one of the leaves would contain less then \
142 ``min_samples_leaf`` samples.");
144 The minimum weighted fraction of the input samples required to be at a \
149 Grow trees with ``max_leaf_nodes`` in best-first fashion.\
150 Best nodes are defined as relative reduction in impurity.\
151 If None then unlimited number of leaf nodes.\
152 If not None then ``max_depth`` will be ignored.");
155 Whether bootstrap samples are used when building trees.");
158 the generalization error.");
161 The number of jobs to run in parallel for both `fit` and `predict`. \
162 If -1, then the number of jobs is set to the number of cores.");
165 If int, random_state is the seed used by the random number generator;\
166 If RandomState instance, random_state is the random number generator;\
167 If None, the random number generator is the RandomState instance used\
171 Controls the verbosity of the tree building process.");
174 When set to ``True``, reuse the solution of the previous call to fit\
175 and add more estimators to the ensemble, otherwise, just fit a whole\
179 Weights associated with classes in the form ``{class_label: weight}``.\
180 If not given, all classes are supposed to have weight one. For\
181 multi-output problems, a list of dicts can be provided in the same\
182 order as the columns of y.\
183 The \"auto\" mode uses the values of y to automatically adjust\
184 weights inversely proportional to class frequencies in the input data.\
185 The \"subsample\" mode is the same as \"auto\" except that weights are\
186 computed based on the bootstrap sample for every tree grown.\
187 For multi-output, the weights of each column of y will be multiplied.\
188 Note that these weights will be multiplied with sample_weight (passed\
189 through the fit method) if sample_weight is specified.");
192 "Store trained classifier in this file");
200 Log() << kFATAL <<
" NEstimators <=0... that does not work !! " <<
Endl;
207 <<
" The options are `gini` or `entropy`." <<
Endl;
216 <<
" The options are None or integer." <<
Endl;
220 Log() << kFATAL <<
" MinSamplesSplit < 0... that does not work !! " <<
Endl;
226 Log() << kFATAL <<
" MinSamplesLeaf < 0... that does not work !! " <<
Endl;
232 Log() << kERROR <<
" MinWeightFractionLeaf < 0... that does not work !! " <<
Endl;
246 <<
"int, float, string or None, optional (default='auto')"
247 <<
"The number of features to consider when looking for the best split:"
248 <<
"If int, then consider `max_features` features at each split."
249 <<
"If float, then `max_features` is a percentage and"
250 <<
"`int(max_features * n_features)` features are considered at each split."
251 <<
"If 'auto', then `max_features=sqrt(n_features)`."
252 <<
"If 'sqrt', then `max_features=sqrt(n_features)`."
253 <<
"If 'log2', then `max_features=log2(n_features)`."
254 <<
"If None, then `max_features=n_features`." <<
Endl;
260 <<
" The options are None or integer." <<
Endl;
267 <<
"If int, random_state is the seed used by the random number generator;"
268 <<
"If RandomState instance, random_state is the random number generator;"
269 <<
"If None, the random number generator is the RandomState instance used by `np.random`." <<
Endl;
276 <<
"dict, list of dicts, 'auto', 'subsample' or None, optional" <<
Endl;
281 Log() << kFATAL <<
Form(
" NJobs = %i... that does not work !! ",
fNjobs)
282 <<
"Value has to be greater than zero." <<
Endl;
325 npy_intp dimsData[2];
326 dimsData[0] = fNrowsTraining;
328 PyArrayObject * fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
330 float *TrainData = (
float *)(PyArray_DATA(fTrainData));
332 npy_intp dimsClasses = (npy_intp) fNrowsTraining;
333 PyArrayObject * fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
334 PyDict_SetItemString(
fLocalNS,
"trainDataClasses", (
PyObject*)fTrainDataClasses);
335 float *TrainDataClasses = (
float *)(PyArray_DATA(fTrainDataClasses));
337 PyArrayObject * fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
338 PyDict_SetItemString(
fLocalNS,
"trainDataWeights", (
PyObject*)fTrainDataWeights);
339 float *TrainDataWeights = (
float *)(PyArray_DATA(fTrainDataWeights));
341 for (
int i = 0; i < fNrowsTraining; i++) {
349 TrainDataClasses[i] =
e->GetClass();
352 TrainDataWeights[i] =
e->GetWeight();
356 PyRunString(
"classifier = sklearn.ensemble.RandomForestClassifier(bootstrap=bootstrap, class_weight=classWeight, criterion=criterion, max_depth=maxDepth, max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, min_samples_leaf=minSamplesLeaf, min_samples_split=minSamplesSplit, min_weight_fraction_leaf=minWeightFractionLeaf, n_estimators=nEstimators, n_jobs=nJobs, oob_score=oobScore, random_state=randomState, verbose=verbose, warm_start=warmStart)",
357 "Failed to setup classifier");
361 PyRunString(
"dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)",
"Failed to train classifier");
366 Log() << kFATAL <<
"Can't create classifier object from RandomForestClassifier" <<
Endl;
392 if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
393 if (firstEvt < 0) firstEvt = 0;
394 nEvents = lastEvt-firstEvt;
403 <<
" sample (" << nEvents <<
" events)" <<
Endl;
409 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
410 float *pValue = (
float *)(PyArray_DATA(pEvent));
412 for (
Int_t ievt=0; ievt<nEvents; ievt++) {
421 PyArrayObject *
result = (PyArrayObject *)PyObject_CallMethod(
fClassifier,
const_cast<char *
>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
422 double *proba = (
double *)(PyArray_DATA(
result));
426 for (
int i = 0; i < nEvents; ++i) {
435 <<
"Elapsed time for evaluation of " << nEvents <<
" events: "
456 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
457 float *pValue = (
float *)(PyArray_DATA(pEvent));
458 for (
UInt_t i = 0; i <
fNvars; i++) pValue[i] =
e->GetValue(i);
461 PyArrayObject *
result = (PyArrayObject *)PyObject_CallMethod(
fClassifier,
const_cast<char *
>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
462 double *proba = (
double *)(PyArray_DATA(
result));
485 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
486 float *pValue = (
float *)(PyArray_DATA(pEvent));
487 for (
UInt_t i = 0; i <
fNvars; i++) pValue[i] =
e->GetValue(i);
490 PyArrayObject *
result = (PyArrayObject *)PyObject_CallMethod(
fClassifier,
const_cast<char *
>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
491 double *proba = (
double *)(PyArray_DATA(
result));
535 PyArrayObject* pRanking = (PyArrayObject*) PyObject_GetAttrString(
fClassifier,
"feature_importances_");
536 if(pRanking == 0)
Log() << kFATAL <<
"Failed to get ranking from classifier" <<
Endl;
555 Log() <<
"A random forest is a meta estimator that fits a number of decision" <<
Endl;
556 Log() <<
"tree classifiers on various sub-samples of the dataset and use" <<
Endl;
557 Log() <<
"averaging to improve the predictive accuracy and control over-fitting." <<
Endl;
559 Log() <<
"Check out the scikit-learn documentation for more information." <<
Endl;
#define REGISTER_METHOD(CLASS)
for example
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t result
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t Float_t Float_t Float_t Int_t Int_t UInt_t UInt_t Rectangle_t Int_t Int_t Window_t TString Int_t GCValues_t GetPrimarySelectionOwner GetDisplay GetScreen GetColormap GetNativeEvent const char const char dpyName wid window const char font_name cursor keysym reg const char only_if_exist regb h Point_t winding char text const char depth char const char Int_t count const char ColorStruct_t color const char Pixmap_t Pixmap_t PictureAttributes_t attr const char char ret_data h unsigned char height h Atom_t Int_t ULong_t ULong_t unsigned char prop_list Atom_t Atom_t Atom_t Time_t type
char * Form(const char *fmt,...)
Formats a string in a circular formatting buffer.
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
Class that contains all the data information.
UInt_t GetNClasses() const
const Event * GetEvent() const
returns event without transformations
Types::ETreeType GetCurrentType() const
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
Long64_t GetNTrainingEvents() const
void SetCurrentEvent(Long64_t ievt) const
const Event * GetTrainingEvent(Long64_t ievt) const
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
PyGILState_STATE m_GILState
virtual void DeclareCompatibilityOptions()
Options that are used ONLY for the READER to ensure backward compatibility; they are hence without any...
const char * GetName() const
Bool_t IsModelPersistence() const
const TString & GetWeightFileDir() const
const TString & GetMethodName() const
DataSetInfo & DataInfo() const
virtual void TestClassification()
initialization
UInt_t GetNVariables() const
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
const TString & GetInputLabel(Int_t i) const
PyObject * pMinWeightFractionLeaf
MethodPyRandomForest(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
std::vector< Float_t > & GetMulticlassValues()
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
std::vector< Double_t > GetMvaValues(Long64_t firstEvt=0, Long64_t lastEvt=-1, Bool_t logProgress=false)
get all the MVA values for the events of the current Data type
~MethodPyRandomForest(void)
std::vector< Float_t > classValues
PyObject * pMinSamplesLeaf
TString fFilenameClassifier
void GetHelpMessage() const
std::vector< Double_t > mvaValues
Double_t GetMvaValue(Double_t *errLower=nullptr, Double_t *errUpper=nullptr)
virtual void TestClassification()
initialization
const Ranking * CreateRanking()
Double_t fMinWeightFractionLeaf
PyObject * pMinSamplesSplit
static int PyIsInitialized()
Check Python interpreter initialization status.
PyObject * Eval(TString code)
Evaluate Python code.
static void PyInitialize()
Initialize Python interpreter.
static void Serialize(TString file, PyObject *classifier)
Serialize Python object.
static Int_t UnSerialize(TString file, PyObject **obj)
Unserialize Python object.
void PyRunString(TString code, TString errorMessage="Failed to run python code", int start=256)
Execute Python code from string.
Ranking for variables in method (implementation)
virtual void AddRank(const Rank &rank)
Add a new rank and take ownership of it.
Timing information for training and evaluation of MVA methods.
TString GetElapsedTime(Bool_t Scientific=kTRUE)
returns pretty string with elapsed time
Singleton class for Global types used by TMVA.
@ kSignal
Never change this number - it is elsewhere assumed to be zero !
const char * Data() const
create variable transformations
MsgLogger & Endl(MsgLogger &ml)