ROOT 6.07/09 Reference Guide
MethodPyAdaBoost.cxx
// @(#)root/tmva/pymva $Id$
// Authors: Omar Zapata, Lorenzo Moneta, Sergei Gleyzer 2015

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis
 * Package: TMVA
 * Class  : MethodPyAdaBoost
 * Web    : http://oproject.org
 *
 * Description:
 *      AdaBoost Classifier from scikit-learn
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted according to the terms listed in LICENSE
 * (http://tmva.sourceforge.net/LICENSE)
 *
 **********************************************************************************/

#include <Python.h> // Needs to be included first to avoid redefinition of _POSIX_C_SOURCE
#include "TMVA/MethodPyAdaBoost.h"

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>

#pragma GCC diagnostic ignored "-Wunused-parameter"

#include "TMVA/Config.h"
#include "TMVA/Configurable.h"
#include "TMVA/ClassifierFactory.h"
#include "TMVA/DataSet.h"
#include "TMVA/Event.h"
#include "TMVA/IMethod.h"
#include "TMVA/MsgLogger.h"
#include "TMVA/PDF.h"
#include "TMVA/Ranking.h"
#include "TMVA/Tools.h"
#include "TMVA/Types.h"
#include "TMVA/Results.h"

#include "TMath.h"
#include "Riostream.h"
#include "TMatrix.h"
#include "TMatrixD.h"
#include "TVectorD.h"

#include <iomanip>
#include <fstream>

using namespace TMVA;

REGISTER_METHOD(PyAdaBoost)

ClassImp(MethodPyAdaBoost)

//_______________________________________________________________________
MethodPyAdaBoost::MethodPyAdaBoost(const TString &jobName,
                                   const TString &methodTitle,
                                   DataSetInfo &dsi,
                                   const TString &theOption) :
   PyMethodBase(jobName, Types::kPyAdaBoost, methodTitle, dsi, theOption),
   base_estimator("None"),
   n_estimators(50),
   learning_rate(1.0),
   algorithm("SAMME.R"),
   random_state("None")
{
}

//_______________________________________________________________________
MethodPyAdaBoost::MethodPyAdaBoost(DataSetInfo &theData, const TString &theWeightFile)
   : PyMethodBase(Types::kPyAdaBoost, theData, theWeightFile),
     base_estimator("None"),
     n_estimators(50),
     learning_rate(1.0),
     algorithm("SAMME.R"),
     random_state("None")
{
}


//_______________________________________________________________________
MethodPyAdaBoost::~MethodPyAdaBoost(void)
{
}

//_______________________________________________________________________
Bool_t MethodPyAdaBoost::HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets)
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   return kFALSE;
}


//_______________________________________________________________________
void MethodPyAdaBoost::DeclareOptions()
{
   MethodBase::DeclareCompatibilityOptions();

   DeclareOptionRef(base_estimator, "BaseEstimator", "object, optional (default=DecisionTreeClassifier)\
      The base estimator from which the boosted ensemble is built.\
      Support for sample weighting is required, as well as proper `classes_`\
      and `n_classes_` attributes.");

   DeclareOptionRef(n_estimators, "NEstimators", "integer, optional (default=50)\
      The maximum number of estimators at which boosting is terminated.\
      In case of perfect fit, the learning procedure is stopped early.");

   DeclareOptionRef(learning_rate, "LearningRate", "float, optional (default=1.)\
      Learning rate shrinks the contribution of each classifier by\
      ``learning_rate``. There is a trade-off between ``learning_rate`` and\
      ``n_estimators``.");

   DeclareOptionRef(algorithm, "Algorithm", "{'SAMME', 'SAMME.R'}, optional (default='SAMME.R')\
      If 'SAMME.R' then use the SAMME.R real boosting algorithm.\
      ``base_estimator`` must support calculation of class probabilities.\
      If 'SAMME' then use the SAMME discrete boosting algorithm.\
      The SAMME.R algorithm typically converges faster than SAMME,\
      achieving a lower test error with fewer boosting iterations.");

   DeclareOptionRef(random_state, "RandomState", "int, RandomState instance or None, optional (default=None)\
      If int, random_state is the seed used by the random number generator;\
      If RandomState instance, random_state is the random number generator;\
      If None, the random number generator is the RandomState instance used\
      by `np.random`.");
}

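// Example (added for illustration, not part of the original file): the options
// declared above are set through the usual TMVA booking string. A minimal
// sketch, assuming a TMVA::Factory* named factory exists and PyMVA is
// available; the option values are illustrative, and newer ROOT versions also
// expect a TMVA::DataLoader* as the first argument of BookMethod:
//
//    factory->BookMethod(TMVA::Types::kPyAdaBoost, "PyAdaBoost",
//                        "NEstimators=100:LearningRate=0.5:Algorithm=SAMME.R");
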
//_______________________________________________________________________
void MethodPyAdaBoost::ProcessOptions()
{
   PyObject *pobase_estimator = Eval(base_estimator);
   if (!pobase_estimator) {
      Log() << kFATAL << Form(" BaseEstimator = %s... that does not work !! ", base_estimator.Data())
            << " The options are Object or None."
            << Endl;
   }
   Py_DECREF(pobase_estimator);

   if (n_estimators <= 0) {
      Log() << kERROR << " NEstimators <=0... that does not work !! "
            << " I set it to 10 .. just so that the program does not crash"
            << Endl;
      n_estimators = 10;
   }
   if (learning_rate <= 0) {
      Log() << kERROR << " LearningRate <=0... that does not work !! "
            << " I set it to 1.0 .. just so that the program does not crash"
            << Endl;
      learning_rate = 1.0;
   }

   if (algorithm != "SAMME" && algorithm != "SAMME.R") {
      Log() << kFATAL << Form(" Algorithm = %s... that does not work !! ", algorithm.Data())
            << " The options are SAMME or SAMME.R."
            << Endl;
   }
   PyObject *porandom_state = Eval(random_state);
   if (!porandom_state) {
      Log() << kFATAL << Form(" RandomState = %s... that does not work !! ", random_state.Data())
            << "If int, random_state is the seed used by the random number generator;"
            << "If RandomState instance, random_state is the random number generator;"
            << "If None, the random number generator is the RandomState instance used by `np.random`."
            << Endl;
   }
   Py_DECREF(porandom_state);
}


//_______________________________________________________________________
void MethodPyAdaBoost::Init()
{
   ProcessOptions();
   _import_array(); // required to use numpy arrays

   // Import sklearn
   // Convert the module name to a Python string.
   PyObject *pName = PyUnicode_FromString("sklearn.ensemble");
   // Import it as a Python module.
   fModule = PyImport_Import(pName);
   Py_DECREF(pName);

   if (!fModule) {
      Log() << kFATAL << "Can't import sklearn.ensemble" << Endl;
      Log() << Endl;
   }


   // Training data
   UInt_t fNvars = Data()->GetNVariables();
   int fNrowsTraining = Data()->GetNTrainingEvents(); // every row is an event, a class type and a weight
   int *dims = new int[2];
   dims[0] = fNrowsTraining;
   dims[1] = fNvars;
   fTrainData = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
   float *TrainData = (float *)(PyArray_DATA(fTrainData));
   delete[] dims; // PyArray_FromDims copies the dimensions, so the temporary buffer can be released

   fTrainDataClasses = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataClasses = (float *)(PyArray_DATA(fTrainDataClasses));

   fTrainDataWeights = (PyArrayObject *)PyArray_FromDims(1, &fNrowsTraining, NPY_FLOAT);
   float *TrainDataWeights = (float *)(PyArray_DATA(fTrainDataWeights));

   for (int i = 0; i < fNrowsTraining; i++) {
      const TMVA::Event *e = Data()->GetTrainingEvent(i);
      for (UInt_t j = 0; j < fNvars; j++) {
         TrainData[j + i * fNvars] = e->GetValue(j);
      }
      if (e->GetClass() == TMVA::Types::kSignal) TrainDataClasses[i] = TMVA::Types::kSignal;
      else TrainDataClasses[i] = TMVA::Types::kBackground;

      TrainDataWeights[i] = e->GetWeight();
   }
}

void MethodPyAdaBoost::Train()
{
   PyObject *pobase_estimator = Eval(base_estimator);
   PyObject *porandom_state = Eval(random_state);

   PyObject *args = Py_BuildValue("(OifsO)", pobase_estimator, n_estimators, learning_rate, algorithm.Data(), porandom_state);
   PyObject_Print(args, stdout, 0);
   std::cout << std::endl;
   PyObject *pDict = PyModule_GetDict(fModule);
   PyObject *fClassifierClass = PyDict_GetItemString(pDict, "AdaBoostClassifier");

   // Create an instance of the class
   if (PyCallable_Check(fClassifierClass)) {
      // instance
      fClassifier = PyObject_CallObject(fClassifierClass, args);
      PyObject_Print(fClassifier, stdout, 0);

      Py_DECREF(args);
   } else {
      PyErr_Print();
      Py_DECREF(pDict);
      Py_DECREF(fClassifierClass);
      Log() << kFATAL << "Can't call function AdaBoostClassifier" << Endl;
      Log() << Endl;
   }

   fClassifier = PyObject_CallMethod(fClassifier, (char *)"fit", (char *)"(OOO)", fTrainData, fTrainDataClasses, fTrainDataWeights);

   if (IsModelPersistence())
   {
      TString path = GetWeightFileDir() + "/PyAdaBoostModel.PyData";
      Log() << Endl;
      Log() << gTools().Color("bold") << "--- Saving State File In:" << gTools().Color("reset") << path << Endl;
      Log() << Endl;
      Serialize(path, fClassifier);
   }
}

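// Note (added for clarity): the sequence in Train() above is effectively the
// Python C-API equivalent of constructing
// sklearn.ensemble.AdaBoostClassifier(base_estimator, n_estimators,
// learning_rate, algorithm, random_state) and calling its fit method with the
// training matrix, the class labels and the per-event weights, followed by an
// optional serialization of the fitted classifier.
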
//_______________________________________________________________________
void MethodPyAdaBoost::TestClassification()
{
   MethodBase::TestClassification();
}


//_______________________________________________________________________
Double_t MethodPyAdaBoost::GetMvaValue(Double_t *errLower, Double_t *errUpper)
{
   // cannot determine error
   NoErrorCalc(errLower, errUpper);

   if (!fClassifier) ReadModelFromFile();

   Double_t mvaValue;
   const TMVA::Event *e = Data()->GetEvent();
   UInt_t nvars = e->GetNVariables();
   int dims[2];
   dims[0] = 1;
   dims[1] = nvars;
   PyArrayObject *pEvent = (PyArrayObject *)PyArray_FromDims(2, dims, NPY_FLOAT);
   float *pValue = (float *)(PyArray_DATA(pEvent));

   for (UInt_t i = 0; i < nvars; i++) pValue[i] = e->GetValue(i);

   PyArrayObject *result = (PyArrayObject *)PyObject_CallMethod(fClassifier, const_cast<char *>("predict_proba"), const_cast<char *>("(O)"), pEvent);
   double *proba = (double *)(PyArray_DATA(result));
   mvaValue = proba[0]; // getting the signal probability
   Py_DECREF(result);
   Py_DECREF(pEvent);
   return mvaValue;
}

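// Example (added for illustration, not part of the original file): in the
// application phase this value is normally obtained through a TMVA::Reader.
// A minimal sketch, assuming a single input variable "var1" and a weight file
// produced by the training (the file name below is hypothetical); var1 must be
// set to the event's value before each call to EvaluateMVA:
//
//    TMVA::Reader reader("!Color:Silent");
//    Float_t var1 = 0;
//    reader.AddVariable("var1", &var1);
//    reader.BookMVA("PyAdaBoost", "dataset/weights/TMVAClassification_PyAdaBoost.weights.xml");
//    Double_t mva = reader.EvaluateMVA("PyAdaBoost");
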
//_______________________________________________________________________
void MethodPyAdaBoost::ReadModelFromFile()
{
   if (!PyIsInitialized()) {
      PyInitialize();
   }

   TString path = GetWeightFileDir() + "/PyAdaBoostModel.PyData";
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Loading State File From:" << gTools().Color("reset") << path << Endl;
   Log() << Endl;
   UnSerialize(path, &fClassifier);
}

//_______________________________________________________________________
void MethodPyAdaBoost::GetHelpMessage() const
{
   // get help message text
   //
   // typical length of text line:
   // "|--------------------------------------------------------------|"
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "Decision Trees and Rule-Based Models " << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << Endl;
   Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "<None>" << Endl;
}