#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>

// ...

MethodPyGTB::MethodPyGTB(const TString &jobName,
                         const TString &methodTitle,
                         DataSetInfo &theData,
                         const TString &theOption) :
   // ...
   fMinWeightFractionLeaf(0.0),
   // ...
   fMaxLeafNodes("None"),
   // ...
void MethodPyGTB::DeclareOptions()
{
   // ...
   DeclareOptionRef(fLoss, "Loss", "{'deviance', 'exponential'}, optional (default='deviance')\
      loss function to be optimized. 'deviance' refers to\
      deviance (= logistic regression) for classification\
      with probabilistic outputs. For loss 'exponential', gradient\
      boosting recovers the AdaBoost algorithm.");

   DeclareOptionRef(fLearningRate, "LearningRate", "learning rate shrinks the contribution of each tree by `learning_rate`.\
      There is a trade-off between learning_rate and n_estimators.");

   DeclareOptionRef(fNestimators, "NEstimators", "The number of boosting stages to perform. Gradient boosting\
      is fairly robust to over-fitting, so a large number usually\
      results in better performance.");

   DeclareOptionRef(fSubsample, "Subsample", "The fraction of samples to be used for fitting the individual base\
      learners. If smaller than 1.0, this results in Stochastic Gradient\
      Boosting. `subsample` interacts with the parameter `n_estimators`.\
      Choosing `subsample < 1.0` leads to a reduction of variance\
      and an increase in bias.");

   DeclareOptionRef(fMinSamplesSplit, "MinSamplesSplit", "The minimum number of samples required to split an internal node.");

   DeclareOptionRef(fMinSamplesLeaf, "MinSamplesLeaf", "The minimum number of samples in newly created leaves. A split is\
      discarded if, after the split, one of the leaves would contain fewer than\
      ``min_samples_leaf`` samples.");

   DeclareOptionRef(fMinWeightFractionLeaf, "MinWeightFractionLeaf", "The minimum weighted fraction of the input samples required to be at a\
      leaf node.");

   DeclareOptionRef(fMaxDepth, "MaxDepth", "The maximum depth of the tree. If None, then nodes are expanded until\
      all leaves are pure or until all leaves contain less than\
      min_samples_split samples.\
      Ignored if ``max_leaf_nodes`` is not None.");

   DeclareOptionRef(fInit, "Init", "An estimator object that is used to compute the initial\
      predictions. ``init`` has to provide ``fit`` and ``predict``.\
      If None, it uses ``loss.init_estimator``.");

   DeclareOptionRef(fRandomState, "RandomState", "If int, random_state is the seed used by the random number generator;\
      If RandomState instance, random_state is the random number generator;\
      If None, the random number generator is the RandomState instance used\
      by `np.random`.");

   // ...

   DeclareOptionRef(fVerbose, "Verbose", "Controls the verbosity of the tree building process.");

   DeclareOptionRef(fMaxLeafNodes, "MaxLeafNodes", "Grow trees with ``max_leaf_nodes`` in best-first fashion.\
      Best nodes are defined as relative reduction in impurity.\
      If None, then the number of leaf nodes is unlimited.\
      If not None, then ``max_depth`` will be ignored.");

   DeclareOptionRef(fWarmStart, "WarmStart", "When set to ``True``, reuse the solution of the previous call to fit\
      and add more estimators to the ensemble; otherwise, just fit a whole\
      new ensemble.");

   // ...

   DeclareOptionRef(fFilenameClassifier, "FilenameClassifier", "Store trained classifier in this file");
}
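// A minimal booking sketch for this method (illustrative, not part of this
// file: the tree/variable names and option values are assumptions; Factory,
// DataLoader and PyMethodBase::PyInitialize() are the standard TMVA API):
#include "TFile.h"
#include "TTree.h"
#include "TMVA/Factory.h"
#include "TMVA/DataLoader.h"
#include "TMVA/PyMethodBase.h"
#include "TMVA/Types.h"

void bookPyGTB(TTree *signal, TTree *background)
{
   TMVA::PyMethodBase::PyInitialize();   // start the embedded Python interpreter

   TFile *out = TFile::Open("TMVA_PyGTB.root", "RECREATE");
   TMVA::Factory factory("TMVAClassification", out, "AnalysisType=Classification");

   TMVA::DataLoader loader("dataset");
   loader.AddVariable("var1");           // hypothetical input variables
   loader.AddVariable("var2");
   loader.AddSignalTree(signal);
   loader.AddBackgroundTree(background);
   loader.PrepareTrainingAndTestTree("", "SplitMode=Random");

   // Option names match the DeclareOptionRef() calls above.
   factory.BookMethod(&loader, TMVA::Types::kPyGTB, "PyGTB",
                      "NEstimators=200:LearningRate=0.1:MaxDepth=3:Loss=deviance");

   factory.TrainAllMethods();
   out->Close();
}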
void MethodPyGTB::ProcessOptions()
{
   if (fLoss != "deviance" && fLoss != "exponential") {
      Log() << kFATAL << Form("Loss = %s ... that does not work!", fLoss.Data())
            << " The options are 'deviance' or 'exponential'." << Endl;
   }
   // ...
   PyDict_SetItemString(fLocalNS, "loss", pLoss);
   if (fLearningRate <= 0) {
      Log() << kFATAL << "LearningRate <= 0 ... that does not work!" << Endl;
   }
   // ...
   if (fNestimators <= 0) {
      Log() << kFATAL << "NEstimators <= 0 ... that does not work!" << Endl;
   }
   // ...
   if (fMinSamplesSplit < 0) {
      Log() << kFATAL << "MinSamplesSplit < 0 ... that does not work!" << Endl;
   }
   // ...
   if (fSubsample < 0) {
      Log() << kFATAL << "Subsample < 0 ... that does not work!" << Endl;
   }
   // ...
   if (fMinSamplesLeaf < 0) {
      Log() << kFATAL << "MinSamplesLeaf < 0 ... that does not work!" << Endl;
   }
   // ...
   if (fMinWeightFractionLeaf < 0) {
      Log() << kFATAL << "MinWeightFractionLeaf < 0 ... that does not work!" << Endl;
   }
   // ...
   if (fMaxDepth <= 0) {
      Log() << kFATAL << "MaxDepth <= 0 ... that does not work!" << Endl;
   }
   // ...
   pInit = Eval(fInit);
   if (!pInit) {
      Log() << kFATAL << Form("Init = %s ... that does not work!", fInit.Data())
            << " The options are None or BaseEstimator, which is an estimator object that"
            << " is used to compute the initial predictions."
            << " 'init' has to provide 'fit' and 'predict' methods."
            << " If None, it uses 'loss.init_estimator'." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "init", pInit);
   pRandomState = Eval(fRandomState);
   if (!pRandomState) {
      Log() << kFATAL << Form("RandomState = %s ... that does not work!", fRandomState.Data())
            << " If int, random_state is the seed used by the random number generator;"
            << " If RandomState instance, random_state is the random number generator;"
            << " If None, the random number generator is the RandomState instance used by 'np.random'."
            << Endl;
   }
   PyDict_SetItemString(fLocalNS, "randomState", pRandomState);

   // ...
   PyDict_SetItemString(fLocalNS, "maxFeatures", pMaxFeatures);
   if (!pMaxFeatures) {
      Log() << kFATAL << Form("MaxFeatures = %s ... that does not work!", fMaxFeatures.Data())
            << " int, float, string or None, optional (default='auto')."
            << " The number of features to consider when looking for the best split:"
            << " If int, then consider `max_features` features at each split."
            << " If float, then `max_features` is a percentage and"
            << " `int(max_features * n_features)` features are considered at each split."
            << " If 'auto', then `max_features=sqrt(n_features)`."
            << " If 'sqrt', then `max_features=sqrt(n_features)`."
            << " If 'log2', then `max_features=log2(n_features)`."
            << " If None, then `max_features=n_features`." << Endl;
   }
   if (!pMaxLeafNodes) {
      Log() << kFATAL << Form("MaxLeafNodes = %s ... that does not work!", fMaxLeafNodes.Data())
            << " The options are None or integer." << Endl;
   }
   PyDict_SetItemString(fLocalNS, "maxLeafNodes", pMaxLeafNodes);
   // ...
}
void MethodPyGTB::Train()
{
   // ...
   npy_intp dimsData[2];
   dimsData[0] = fNrowsTraining;
   dimsData[1] = fNvars;
   fTrainData = (PyArrayObject *)PyArray_SimpleNew(2, dimsData, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainData", (PyObject *)fTrainData);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));

   npy_intp dimsClasses = (npy_intp)fNrowsTraining;
   fTrainDataClasses = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataClasses", (PyObject *)fTrainDataClasses);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   fTrainDataWeights = (PyArrayObject *)PyArray_SimpleNew(1, &dimsClasses, NPY_FLOAT);
   PyDict_SetItemString(fLocalNS, "trainDataWeights", (PyObject *)fTrainDataWeights);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

   for (int i = 0; i < fNrowsTraining; i++) {
      const Event *e = Data()->GetTrainingEvent(i);
      // ... copy the event's input variables into TrainData
      TrainDataClasses[i] = e->GetClass();
      TrainDataWeights[i] = e->GetWeight();
   }
   PyRunString("classifier = sklearn.ensemble.GradientBoostingClassifier(loss=loss, learning_rate=learningRate, n_estimators=nEstimators, max_depth=maxDepth, min_samples_split=minSamplesSplit, min_samples_leaf=minSamplesLeaf, min_weight_fraction_leaf=minWeightFractionLeaf, subsample=subsample, max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, init=init, verbose=verbose, warm_start=warmStart, random_state=randomState)",
               "Failed to setup classifier");

   PyRunString("dump = classifier.fit(trainData, trainDataClasses, trainDataWeights)",
               "Failed to train classifier");
   // ...
   if (fClassifier == 0) {
      Log() << kFATAL << "Can't create classifier object from GradientBoostingClassifier" << Endl;
   }
   // ...
}
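// After a successful fit the trained estimator is typically persisted so that
// ReadModelFromFile() can restore it later. A sketch using the PyMethodBase
// helpers available to this class (the exact call site inside Train() is an
// assumption):
if (IsModelPersistence()) {
   Serialize(fFilenameClassifier, fClassifier);   // pickle `classifier` to the FilenameClassifier path
}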
std::vector<Double_t> MethodPyGTB::GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t logProgress)
{
   // ...
   if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
   if (firstEvt < 0) firstEvt = 0;
   nEvents = lastEvt - firstEvt;

   // ...
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));

   for (Int_t ievt = 0; ievt < nEvents; ievt++) {
      // ... fetch event number (ievt + firstEvt) as `e`, then copy its inputs
      for (UInt_t i = 0; i < fNvars; i++) {
         pValue[ievt * fNvars + i] = e->GetValue(i);
      }
   }

   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier,
                                                                const_cast<char *>("predict_proba"),
                                                                const_cast<char *>("(O)"),
                                                                pEvent);
   double *proba = (double *)(PyArray_DATA(result));

   // ...
   for (int i = 0; i < nEvents; ++i) {
      // ... fill mvaValues[i] from proba (see the sketch below)
   }
   // ...
}
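// predict_proba returns one row of per-class probabilities per event; the MVA
// value TMVA reports is the signal-class column. A sketch of the loop body
// elided just above, assuming two output classes laid out per row in class
// order (TMVA::Types::kSignal is the signal class index, matching the labels
// filled from e->GetClass() during training):
for (int i = 0; i < nEvents; ++i) {
   mvaValues[i] = proba[2 * i + TMVA::Types::kSignal];   // signal probability of event i
}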
Double_t MethodPyGTB::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // ...
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   // ... copy the current event's input variables into pValue

   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier,
                                                                const_cast<char *>("predict_proba"),
                                                                const_cast<char *>("(O)"),
                                                                pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   // ...
}
std::vector<Float_t> &MethodPyGTB::GetMulticlassValues()
{
   // ...
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));
   // ... copy the current event's input variables into pValue

   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier,
                                                                const_cast<char *>("predict_proba"),
                                                                const_cast<char *>("(O)"),
                                                                pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   // ...
}
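// For multiclass output, every class probability is kept rather than just the
// signal column. A sketch of how the elided tail could fill the classValues
// member (the exact loop is an assumption; GetNClasses() gives the number of
// columns in the single predict_proba row):
classValues.clear();
for (UInt_t i = 0; i < DataInfo().GetNClasses(); i++) {
   classValues.push_back(proba[i]);   // probability of class i for the current event
}
return classValues;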
const Ranking *MethodPyGTB::CreateRanking()
{
   // ...
   PyArrayObject *pRanking = (PyArrayObject *)PyObject_GetAttrString(fClassifier, "feature_importances_");
   if (pRanking == 0) Log() << kFATAL << "Failed to get ranking from classifier" << Endl;
   // ... (see the sketch below for how the importances feed the ranking)
}
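// A sketch of how the importances array could feed the TMVA ranking (the loop
// is illustrative, not the verbatim tail of CreateRanking(); Ranking, Rank,
// AddRank() and GetInputLabel() are the standard TMVA ranking primitives, and
// pRanking is assumed to be a 1-D array with one double per input variable):
fRanking = new Ranking(GetName(), "Variable Importance");
Double_t *rankValues = (Double_t *)PyArray_DATA(pRanking);
for (UInt_t i = 0; i < fNvars; i++) {
   fRanking->AddRank(Rank(GetInputLabel(i), rankValues[i]));
}
return fRanking;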
void MethodPyGTB::GetHelpMessage() const
{
   // ...
   Log() << "A gradient tree boosting classifier builds a model from an ensemble" << Endl;
   Log() << "of decision trees, which are adapted at each boosting step to better fit" << Endl;
   Log() << "previously misclassified events." << Endl;
   // ...
   Log() << "Check out the scikit-learn documentation for more information." << Endl;
}