22#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
23#include <numpy/arrayobject.h>
125 The function to measure the quality of a split. Supported criteria are \
126 'gini' for the Gini impurity and 'entropy' for the information gain. \
127 Note: this parameter is tree-specific.");
130 The maximum depth of the tree. If None, then nodes are expanded until \
131 all leaves are pure or until all leaves contain less than \
132 min_samples_split samples. \
133 Ignored if ``max_leaf_nodes`` is not None.");
136 The minimum number of samples required to split an internal node.");
139 The minimum number of samples in newly created leaves. A split is \
140 discarded if after the split, one of the leaves would contain less then \
141 ``min_samples_leaf`` samples.");
143 The minimum weighted fraction of the input samples required to be at a \
148 Grow trees with ``max_leaf_nodes`` in best-first fashion.\
149 Best nodes are defined as relative reduction in impurity.\
150 If None then unlimited number of leaf nodes.\
151 If not None then ``max_depth`` will be ignored.");
154 Whether bootstrap samples are used when building trees.");
157 the generalization error.");
160 The number of jobs to run in parallel for both `fit` and `predict`. \
161 If -1, then the number of jobs is set to the number of cores.");
164 If int, random_state is the seed used by the random number generator;\
165 If RandomState instance, random_state is the random number generator;\
166 If None, the random number generator is the RandomState instance used\
170 Controls the verbosity of the tree building process.");
173 When set to ``True``, reuse the solution of the previous call to fit\
174 and add more estimators to the ensemble, otherwise, just fit a whole\
178 Weights associated with classes in the form ``{class_label: weight}``.\
179 If not given, all classes are supposed to have weight one. For\
180 multi-output problems, a list of dicts can be provided in the same\
181 order as the columns of y.\
182 The \"auto\" mode uses the values of y to automatically adjust\
183 weights inversely proportional to class frequencies in the input data.\
184 The \"subsample\" mode is the same as \"auto\" except that weights are\
185 computed based on the bootstrap sample for every tree grown.\
186 For multi-output, the weights of each column of y will be multiplied.\
187 Note that these weights will be multiplied with sample_weight (passed\
188 through the fit method) if sample_weight is specified.");
191 "Store trained classifier in this file");
199 Log() << kFATAL <<
" NEstimators <=0... that does not work !! " <<
Endl;
205 Log() << kFATAL <<
Form(
" Criterion = %s... that does not work !! ",
fCriterion.Data())
206 <<
" The options are `gini` or `entropy`." <<
Endl;
214 Log() << kFATAL <<
Form(
" MaxDepth = %s... that does not work !! ",
fMaxDepth.Data())
215 <<
" The options are None or integer." <<
Endl;
219 Log() << kFATAL <<
" MinSamplesSplit < 0... that does not work !! " <<
Endl;
225 Log() << kFATAL <<
" MinSamplesLeaf < 0... that does not work !! " <<
Endl;
231 Log() << kERROR <<
" MinWeightFractionLeaf < 0... that does not work !! " <<
Endl;
244 Log() << kFATAL <<
Form(
" MaxFeatures = %s... that does not work !! ",
fMaxFeatures.Data())
245 <<
"int, float, string or None, optional (default='auto')"
246 <<
"The number of features to consider when looking for the best split:"
247 <<
"If int, then consider `max_features` features at each split."
248 <<
"If float, then `max_features` is a percentage and"
249 <<
"`int(max_features * n_features)` features are considered at each split."
250 <<
"If 'auto', then `max_features=sqrt(n_features)`."
251 <<
"If 'sqrt', then `max_features=sqrt(n_features)`."
252 <<
"If 'log2', then `max_features=log2(n_features)`."
253 <<
"If None, then `max_features=n_features`." <<
Endl;
259 <<
" The options are None or integer." <<
Endl;
265 Log() << kFATAL <<
Form(
" RandomState = %s... that does not work !! ",
fRandomState.Data())
266 <<
"If int, random_state is the seed used by the random number generator;"
267 <<
"If RandomState instance, random_state is the random number generator;"
268 <<
"If None, the random number generator is the RandomState instance used by `np.random`." <<
Endl;
274 Log() << kFATAL <<
Form(
" ClassWeight = %s... that does not work !! ",
fClassWeight.Data())
275 <<
"dict, list of dicts, 'auto', 'subsample' or None, optional" <<
Endl;
280 Log() << kFATAL <<
Form(
" NJobs = %i... that does not work !! ",
fNjobs)
281 <<
"Value has to be greater than zero." <<
Endl;
324 npy_intp dimsData[2];
325 dimsData[0] = fNrowsTraining;
327 PyArrayObject * fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
329 float *TrainData = (
float *)(PyArray_DATA(fTrainData));
331 npy_intp dimsClasses = (npy_intp) fNrowsTraining;
332 PyArrayObject * fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
333 PyDict_SetItemString(
fLocalNS,
"trainDataClasses", (
PyObject*)fTrainDataClasses);
334 float *TrainDataClasses = (
float *)(PyArray_DATA(fTrainDataClasses));
336 PyArrayObject * fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
337 PyDict_SetItemString(
fLocalNS,
"trainDataWeights", (
PyObject*)fTrainDataWeights);
338 float *TrainDataWeights = (
float *)(PyArray_DATA(fTrainDataWeights));
340 for (
int i = 0; i < fNrowsTraining; i++) {
343 for (UInt_t j = 0; j <
fNvars; j++) {
344 TrainData[j + i *
fNvars] =
e->GetValue(j);
348 TrainDataClasses[i] =
e->GetClass();
351 TrainDataWeights[i] =
e->GetWeight();
355 PyRunString(
"classifier = sklearn.ensemble.RandomForestClassifier(bootstrap=bootstrap, class_weight=classWeight, criterion=criterion, max_depth=maxDepth, max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, min_samples_leaf=minSamplesLeaf, min_samples_split=minSamplesSplit, min_weight_fraction_leaf=minWeightFractionLeaf, n_estimators=nEstimators, n_jobs=nJobs, oob_score=oobScore, random_state=randomState, verbose=verbose, warm_start=warmStart)",
356 "Failed to setup classifier");
360 PyRunString(
"dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)",
"Failed to train classifier");
365 Log() << kFATAL <<
"Can't create classifier object from RandomForestClassifier" <<
Endl;
391 if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
392 if (firstEvt < 0) firstEvt = 0;
393 nEvents = lastEvt-firstEvt;
400 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
401 float *pValue = (
float *)(PyArray_DATA(pEvent));
403 for (
Int_t ievt=0; ievt<nEvents; ievt++) {
406 for (UInt_t i = 0; i <
fNvars; i++) {
407 pValue[ievt *
fNvars + i] =
e->GetValue(i);
412 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
fClassifier,
const_cast<char *
>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
413 double *proba = (
double *)(PyArray_DATA(result));
417 for (
int i = 0; i < nEvents; ++i) {
442 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
443 float *pValue = (
float *)(PyArray_DATA(pEvent));
444 for (UInt_t i = 0; i <
fNvars; i++) pValue[i] =
e->GetValue(i);
447 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
fClassifier,
const_cast<char *
>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
448 double *proba = (
double *)(PyArray_DATA(result));
471 PyArrayObject *pEvent= (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
472 float *pValue = (
float *)(PyArray_DATA(pEvent));
473 for (UInt_t i = 0; i <
fNvars; i++) pValue[i] =
e->GetValue(i);
476 PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(
fClassifier,
const_cast<char *
>(
"predict_proba"),
const_cast<char *
>(
"(O)"), pEvent);
477 double *proba = (
double *)(PyArray_DATA(result));
521 PyArrayObject* pRanking = (PyArrayObject*) PyObject_GetAttrString(
fClassifier,
"feature_importances_");
522 if(pRanking == 0)
Log() << kFATAL <<
"Failed to get ranking from classifier" <<
Endl;
527 for(UInt_t iVar=0; iVar<
fNvars; iVar++){
541 Log() <<
"A random forest is a meta estimator that fits a number of decision" <<
Endl;
542 Log() <<
"tree classifiers on various sub-samples of the dataset and use" <<
Endl;
543 Log() <<
"averaging to improve the predictive accuracy and control over-fitting." <<
Endl;
545 Log() <<
"Check out the scikit-learn documentation for more information." <<
Endl;
#define REGISTER_METHOD(CLASS)
for example
int Int_t
Signed integer 4 bytes (int).
unsigned int UInt_t
Unsigned integer 4 bytes (unsigned int).
bool Bool_t
Boolean (0=false, 1=true) (bool).
double Double_t
Double 8 bytes.
long long Long64_t
Portable signed long integer 8 bytes.
char * Form(const char *fmt,...)
Formats a string in a circular formatting buffer.
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
Class that contains all the data information.
UInt_t GetNClasses() const
const Event * GetEvent() const
returns event without transformations
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
Long64_t GetNTrainingEvents() const
void SetCurrentEvent(Long64_t ievt) const
const Event * GetTrainingEvent(Long64_t ievt) const
PyGILState_STATE m_GILState
const char * GetName() const override
virtual void DeclareCompatibilityOptions()
options that are used ONLY for the READER to ensure backward compatibility they are hence without any...
Bool_t IsModelPersistence() const
const TString & GetWeightFileDir() const
DataSetInfo & DataInfo() const
virtual void TestClassification()
initialization
UInt_t GetNVariables() const
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
const TString & GetInputLabel(Int_t i) const
PyObject * pMinWeightFractionLeaf
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets) override
void DeclareOptions() override
MethodPyRandomForest(const TString &jobName, const TString &methodTitle, DataSetInfo &theData, const TString &theOption="")
std::vector< Double_t > GetMvaValues(Long64_t firstEvt=0, Long64_t lastEvt=-1, Bool_t logProgress=false) override
get all the MVA values for the events of the current Data type
std::vector< Float_t > & GetMulticlassValues() override
~MethodPyRandomForest(void)
std::vector< Float_t > classValues
PyObject * pMinSamplesLeaf
TString fFilenameClassifier
std::vector< Double_t > mvaValues
void GetHelpMessage() const override
void TestClassification() override
initialization
const Ranking * CreateRanking() override
Double_t fMinWeightFractionLeaf
Double_t GetMvaValue(Double_t *errLower=nullptr, Double_t *errUpper=nullptr) override
void ReadModelFromFile() override
void ProcessOptions() override
PyObject * pMinSamplesSplit
static int PyIsInitialized()
Check Python interpreter initialization status.
PyObject * Eval(TString code)
Evaluate Python code.
static void PyInitialize()
Initialize Python interpreter.
static void Serialize(TString file, PyObject *classifier)
Serialize Python object.
static Int_t UnSerialize(TString file, PyObject **obj)
Unserialize Python object.
PyMethodBase(const TString &jobName, Types::EMVA methodType, const TString &methodTitle, DataSetInfo &dsi, const TString &theOption="")
void PyRunString(TString code, TString errorMessage="Failed to run python code", int start=256)
Execute Python code from string.
Ranking for variables in method (implementation).
Singleton class for Global types used by TMVA.
@ kSignal
Never change this number - it is elsewhere assumed to be zero !
create variable transformations
MsgLogger & Endl(MsgLogger &ml)