doc/hackathon/TMVA__RNN__Classification_8C.html

/***


    # TMVA Classification Example Using a Recurrent Neural Network


    This is an example of using a RNN in TMVA.

    We do the classification using a toy data set made of time series with `ntime` time steps,

    each step having dimension `ndim`. The data set is generated by the provided function `MakeTimeData(nevents, ntime, ndim)`


**/


#include<TROOT.h>


#include "TMVA/Factory.h"

#include "TMVA/DataLoader.h"

#include "TMVA/DataSetInfo.h"

#include "TMVA/Config.h"

#include "TMVA/MethodDL.h"


#include "TFile.h"

#include "TTree.h"


///  Helper function to generate the time data set

///  make some time data but not of fixed length.

///  use a poisson with mu = 5 and truncated at 10

///

void MakeTimeData(int n, int ntime, int ndim )

{


   // const int ntime = 10;

   // const int ndim = 30; // number of dim/time

   TString fname = TString::Format("time_data_t%d_d%d.root", ntime, ndim);

   std::vector<TH1 *> v1(ntime);

   std::vector<TH1 *> v2(ntime);

   int i = 0;

   for (int i = 0; i < ntime; ++i) {

      v1[i] = new TH1D(TString::Format("h1_%d", i), "h1", ndim, 0, 10);

      v2[i] = new TH1D(TString::Format("h2_%d", i), "h2", ndim, 0, 10);

   }


   auto f1 = new TF1("f1", "gaus");

   auto f2 = new TF1("f2", "gaus");


   TFile f(fname, "RECREATE");

   TTree sgn("sgn", "sgn");

   TTree bkg("bkg", "bkg");


   std::vector<std::vector<float>> x1(ntime);

   std::vector<std::vector<float>> x2(ntime);


   for (int i = 0; i < ntime; ++i) {

      x1[i] = std::vector<float>(ndim);

      x2[i] = std::vector<float>(ndim);

   }


   for (auto i = 0; i < ntime; i++) {

      bkg.Branch(Form("vars_time%d", i), "std::vector<float>", &x1[i]);

      sgn.Branch(Form("vars_time%d", i), "std::vector<float>", &x2[i]);

   }


   sgn.SetDirectory(&f);

   bkg.SetDirectory(&f);

   gRandom->SetSeed(0);


   std::vector<double> mean1(ntime);

   std::vector<double> mean2(ntime);

   std::vector<double> sigma1(ntime);

   std::vector<double> sigma2(ntime);

   for (int j = 0; j < ntime; ++j) {

      mean1[j] = 5. + 0.2 * sin(TMath::Pi() * j / double(ntime));

      mean2[j] = 5. + 0.2 * cos(TMath::Pi() * j / double(ntime));

      sigma1[j] = 4 + 0.3 * sin(TMath::Pi() * j / double(ntime));

      sigma2[j] = 4 + 0.3 * cos(TMath::Pi() * j / double(ntime));

   }

   for (int i = 0; i < n; ++i) {


      if (i % 1000 == 0)

         std::cout << "Generating  event ... " << i << std::endl;


      for (int j = 0; j < ntime; ++j) {

         auto h1 = v1[j];

         auto h2 = v2[j];

         h1->Reset();

         h2->Reset();


         f1->SetParameters(1, mean1[j], sigma1[j]);

         f2->SetParameters(1, mean2[j], sigma2[j]);


         h1->FillRandom("f1", 1000);

         h2->FillRandom("f2", 1000);


         for (int k = 0; k < ndim; ++k) {

            // std::cout << j*10+k << "   ";

            x1[j][k] = h1->GetBinContent(k + 1) + gRandom->Gaus(0, 10);

            x2[j][k] = h2->GetBinContent(k + 1) + gRandom->Gaus(0, 10);

         }

      }

      // std::cout << std::endl;

      sgn.Fill();

      bkg.Fill();


      if (n == 1) {

         auto c1 = new TCanvas();

         c1->Divide(ntime, 2);

         for (int j = 0; j < ntime; ++j) {

            c1->cd(j + 1);

            v1[j]->Draw();

         }

         for (int j = 0; j < ntime; ++j) {

            c1->cd(ntime + j + 1);

            v2[j]->Draw();

         }

         gPad->Update();

      }

   }

   if (n > 1) {

      sgn.Write();

      bkg.Write();

      sgn.Print();

      bkg.Print();

      f.Close();

   }

}

/// macro for performing a classification using a Recurrent Neural Network

/// @param nevts = 2000  Number of events used. (increase for better classification results)

/// @param use_type

///    use_type = 0    use Simple RNN network

///    use_type = 1    use LSTM network

///    use_type = 2    use GRU

///    use_type = 3    build 3 different networks with RNN, LSTM and GRU


void TMVA_RNN_Classification(int nevts = 2000, int use_type = 1)

{


   const int ninput = 30;

   const int ntime = 10;

   const int batchSize = 100;

   const int maxepochs = 20;


   int nTotEvts = nevts; // total events to be generated for signal or background


   bool useKeras = true;


   bool useTMVA_RNN = true;

   bool useTMVA_DNN = true;

   bool useTMVA_BDT = false;


   std::vector<std::string> rnn_types = {"RNN", "LSTM", "GRU"};

   std::vector<bool> use_rnn_type = {1, 1, 1};

   if (use_type >=0 && use_type < 3) {

      use_rnn_type = {0,0,0};

      use_rnn_type[use_type] = 1;

   }

   bool useGPU = true;   // use GPU for TMVA if available


#ifndef R__HAS_TMVAGPU

   useGPU = false;

#ifndef R__HAS_TMVACPU

   Warning("TMVA_RNN_Classification", "TMVA is not build with GPU or CPU multi-thread support. Cannot use TMVA Deep Learning for RNN");

   useTMVA_RNN = false;

#endif

#endif


   TString archString = (useGPU) ? "GPU" : "CPU";


   bool writeOutputFile = true;


   const char *rnn_type = "RNN";


#ifdef R__HAS_PYMVA

   TMVA::PyMethodBase::PyInitialize();

#else

   useKeras = false;

#endif


#ifdef R__USE_IMT

   int num_threads = 4; // use max 4 threads

   // switch off MT in OpenBLAS to avoid conflict with tbb

   gSystem->Setenv("OMP_NUM_THREADS", "1");


   // do enable MT running

   if (num_threads >= 0) {

      ROOT::EnableImplicitMT(num_threads);

   }

#endif


   TMVA::Config::Instance();


   std::cout << "Running with nthreads  = " << ROOT::GetThreadPoolSize() << std::endl;


   TString inputFileName = "time_data_t10_d30.root";


   bool fileExist = !gSystem->AccessPathName(inputFileName);


   // if file does not exists create it

   if (!fileExist) {

      MakeTimeData(nTotEvts,ntime, ninput);

   }


   auto inputFile = TFile::Open(inputFileName);

   if (!inputFile) {

      Error("TMVA_RNN_Classification", "Error opening input file %s - exit", inputFileName.Data());

      return;

   }


   std::cout << "--- RNNClassification  : Using input file: " << inputFile->GetName() << std::endl;


   // Create a ROOT output file where TMVA will store ntuples, histograms, etc.

   TString outfileName(TString::Format("data_RNN_%s.root", archString.Data()));

   TFile *outputFile = nullptr;

   if (writeOutputFile) outputFile = TFile::Open(outfileName, "RECREATE");


   /**

    ## Declare Factory


    Create the Factory class. Later you can choose the methods

    whose performance you'd like to investigate.


    The factory is the major TMVA object you have to interact with. Here is the list of parameters you need to

pass


    - The first argument is the base of the name of all the output

    weightfiles in the directory weight/ that will be created with the

    method parameters


    - The second argument is the output file for the training results


    - The third argument is a string option defining some general configuration for the TMVA session.

      For example all TMVA output can be suppressed by removing the "!" (not) in front of the "Silent" argument in

the option string


    **/


   // Creating the factory object

   TMVA::Factory *factory = new TMVA::Factory("TMVAClassification", outputFile,

                                              "!V:!Silent:Color:DrawProgressBar:Transformations=None:!Correlations:"

                                              "AnalysisType=Classification:ModelPersistence");

   TMVA::DataLoader *dataloader = new TMVA::DataLoader("dataset");


   TTree *signalTree = (TTree *)inputFile->Get("sgn");

   TTree *background = (TTree *)inputFile->Get("bkg");


   const int nvar = ninput * ntime;


   /// add variables - use new AddVariablesArray function

   for (auto i = 0; i < ntime; i++) {

      dataloader->AddVariablesArray(Form("vars_time%d", i), ninput);

   }


   dataloader->AddSignalTree(signalTree, 1.0);

   dataloader->AddBackgroundTree(background, 1.0);


   // check given input

   auto &datainfo = dataloader->GetDataSetInfo();

   auto vars = datainfo.GetListOfVariables();

   std::cout << "number of variables is " << vars.size() << std::endl;

   for (auto &v : vars)

      std::cout << v << ",";

   std::cout << std::endl;


   int nTrainSig = 0.8 * nTotEvts;

   int nTrainBkg = 0.8 *  nTotEvts;


   // build the string options for DataLoader::PrepareTrainingAndTestTree

   TString prepareOptions = TString::Format("nTrain_Signal=%d:nTrain_Background=%d:SplitMode=Random:SplitSeed=100:NormMode=NumEvents:!V:!CalcCorrelations", nTrainSig, nTrainBkg);


   // Apply additional cuts on the signal and background samples (can be different)

   TCut mycuts = ""; // for example: TCut mycuts = "abs(var1)<0.5 && abs(var2-0.5)<1";

   TCut mycutb = "";


   dataloader->PrepareTrainingAndTestTree(mycuts, mycutb, prepareOptions);


   std::cout << "prepared DATA LOADER " << std::endl;


   /**

       ## Book TMVA  recurrent models


      Book the different types of recurrent models in TMVA  (SimpleRNN, LSTM or GRU)


 **/


   if (useTMVA_RNN) {


      for (int i = 0; i < 3; ++i) {


         if (!use_rnn_type[i])

            continue;


         const char *rnn_type = rnn_types[i].c_str();


         /// define the inputlayout string for RNN

         /// the input data should be organize as   following:

         //// input layout for RNN:    time x ndim


         TString inputLayoutString = TString::Format("InputLayout=%d|%d", ntime, ninput);


         /// Define RNN layer layout

         ///  it should be   LayerType (RNN or LSTM or GRU) |  number of units | number of inputs | time steps | remember output (typically no=0 | return full sequence

         TString rnnLayout = TString::Format("%s|10|%d|%d|0|1", rnn_type, ninput, ntime);


         /// add after RNN a reshape layer (needed top flatten the output) and a dense layer with 64 units and a last one

         /// Note the last layer is linear because  when using Crossentropy a Sigmoid is applied already

         TString layoutString = TString("Layout=") + rnnLayout + TString(",RESHAPE|FLAT,DENSE|64|TANH,LINEAR");


         /// Defining Training strategies. Different training strings can be concatenate. Use however only one

         TString trainingString1 = TString::Format("LearningRate=1e-3,Momentum=0.0,Repetitions=1,"

                                             "ConvergenceSteps=5,BatchSize=%d,TestRepetitions=1,"

                                             "WeightDecay=1e-2,Regularization=None,MaxEpochs=%d,"

                                             "Optimizer=ADAM,DropConfig=0.0+0.+0.+0.",

                                             batchSize,maxepochs);


         TString trainingStrategyString("TrainingStrategy=");

         trainingStrategyString += trainingString1; // + "|" + trainingString2


         /// Define the full RNN Noption string adding the final options for all network

         TString rnnOptions("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=None:"

                            "WeightInitialization=XAVIERUNIFORM:ValidationSize=0.2:RandomSeed=1234");


         rnnOptions.Append(":");

         rnnOptions.Append(inputLayoutString);

         rnnOptions.Append(":");

         rnnOptions.Append(layoutString);

         rnnOptions.Append(":");

         rnnOptions.Append(trainingStrategyString);

         rnnOptions.Append(":");

         rnnOptions.Append(TString::Format("Architecture=%s", archString.Data()));


         TString rnnName = "TMVA_" + TString(rnn_type);

         factory->BookMethod(dataloader, TMVA::Types::kDL, rnnName, rnnOptions);


      }

   }


   /**

      ## Book TMVA  fully connected dense layer  models


   **/


   if (useTMVA_DNN) {

      // Method DL with Dense Layer

      TString inputLayoutString = TString::Format("InputLayout=1|1|%d", ntime * ninput);


      TString layoutString("Layout=DENSE|64|TANH,DENSE|TANH|64,DENSE|TANH|64,LINEAR");

      // Training strategies.

      TString trainingString1("LearningRate=1e-3,Momentum=0.0,Repetitions=1,"

                        "ConvergenceSteps=10,BatchSize=256,TestRepetitions=1,"

                        "WeightDecay=1e-4,Regularization=None,MaxEpochs=20"

                        "DropConfig=0.0+0.+0.+0.,Optimizer=ADAM");

      TString trainingStrategyString("TrainingStrategy=");

      trainingStrategyString += trainingString1; // + "|" + trainingString2


      // General Options.

      TString dnnOptions("!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=None:"

                         "WeightInitialization=XAVIER:RandomSeed=0");


      dnnOptions.Append(":");

      dnnOptions.Append(inputLayoutString);

      dnnOptions.Append(":");

      dnnOptions.Append(layoutString);

      dnnOptions.Append(":");

      dnnOptions.Append(trainingStrategyString);

      dnnOptions.Append(":");

      dnnOptions.Append(archString);


      TString dnnName = "TMVA_DNN";

      factory->BookMethod(dataloader, TMVA::Types::kDL, dnnName, dnnOptions);

   }


   /**

    ## Book Keras recurrent models


     Book the different types of recurrent models in Keras  (SimpleRNN, LSTM or GRU)


   **/


   if (useKeras) {


      for (int i = 0; i < 3; i++) {


         if (use_rnn_type[i]) {


            TString modelName = TString::Format("model_%s.keras", rnn_types[i].c_str());

            TString trainedModelName = TString::Format("trained_model_%s.keras", rnn_types[i].c_str());


            Info("TMVA_RNN_Classification", "Building recurrent keras model using a %s layer", rnn_types[i].c_str());

            // create python script which can be executed

            // create 2 conv2d layer + maxpool + dense

            TMacro m;

            m.AddLine("import tensorflow");

            m.AddLine("from tensorflow.keras.models import Sequential");

            m.AddLine("from tensorflow.keras.optimizers import Adam");

            m.AddLine("from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, SimpleRNN, GRU, LSTM, Reshape, "

                      "BatchNormalization");

            m.AddLine("");

            m.AddLine("model = Sequential() ");

            m.AddLine("model.add(Reshape((10, 30), input_shape = (10*30, )))");

            // add recurrent neural network depending on type / Use option to return the full output

            if (rnn_types[i] == "LSTM")

               m.AddLine("model.add(LSTM(units=10, return_sequences=True) )");

            else if (rnn_types[i] == "GRU")

               m.AddLine("model.add(GRU(units=10, return_sequences=True) )");

            else

               m.AddLine("model.add(SimpleRNN(units=10, return_sequences=True) )");


            // m.AddLine("model.add(BatchNormalization())");

            m.AddLine("model.add(Flatten())"); // needed if returning the full time output sequence

            m.AddLine("model.add(Dense(64, activation = 'tanh')) ");

            m.AddLine("model.add(Dense(2, activation = 'sigmoid')) ");

            m.AddLine(

               "model.compile(loss = 'binary_crossentropy', optimizer = Adam(learning_rate = 0.001), weighted_metrics = ['accuracy'])");

            m.AddLine(TString::Format("modelName = '%s'", modelName.Data()));

            m.AddLine("model.save(modelName)");

            m.AddLine("model.summary()");


            m.SaveSource("make_rnn_model.py");

            // execute python script to make the model

            auto ret = (TString *)gROOT->ProcessLine("TMVA::Python_Executable()");

            TString python_exe = (ret) ? *(ret) : "python";

            gSystem->Exec(python_exe + " make_rnn_model.py");


            if (gSystem->AccessPathName(modelName)) {

               Warning("TMVA_RNN_Classification", "Error creating Keras recurrent model file - Skip using Keras");

               useKeras = false;

            } else {

               // book PyKeras method only if Keras model could be created

               Info("TMVA_RNN_Classification", "Booking Keras %s model", rnn_types[i].c_str());

               factory->BookMethod(dataloader, TMVA::Types::kPyKeras,

                                   TString::Format("PyKeras_%s", rnn_types[i].c_str()),

                                   TString::Format("!H:!V:VarTransform=None:FilenameModel=%s:tf.keras:"

                                                   "FilenameTrainedModel=%s:NumEpochs=%d:BatchSize=%d",

                                                   modelName.Data(), trainedModelName.Data(), maxepochs, batchSize));

            }

         }

      }

   }


   // use BDT in case not using Keras or TMVA DL

   if (!useKeras || !useTMVA_BDT)

      useTMVA_BDT = true;


   /**

         ## Book TMVA BDT

   **/


   if (useTMVA_BDT) {


      factory->BookMethod(dataloader, TMVA::Types::kBDT, "BDTG",

                          "!H:!V:NTrees=100:MinNodeSize=2.5%:BoostType=Grad:Shrinkage=0.10:UseBaggedBoost:"

                          "BaggedSampleFraction=0.5:nCuts=20:"

                          "MaxDepth=2");


   }


   /// Train all methods

   factory->TrainAllMethods();


   std::cout << "nthreads  = " << ROOT::GetThreadPoolSize() << std::endl;


   // ---- Evaluate all MVAs using the set of test events

   factory->TestAllMethods();


   // ----- Evaluate and compare performance of all configured MVAs

   factory->EvaluateAllMethods();


   // check method


   // plot ROC curve

   auto c1 = factory->GetROCCurve(dataloader);

   c1->Draw();


   if (outputFile) outputFile->Close();

}

DataLoader.h

DataSetInfo.h

MethodDL.h

f
#define f(i)
Definition RSha256.hxx:104

ret
char * ret
Definition Rotated.cxx:221

Error
Error("WriteTObject","The current directory (%s) is not associated with a file. The object (%s) has not been written.", GetName(), objname)

Info
void Info(const char *location, const char *msgfmt,...)
Use this function for informational messages.
Definition TError.cxx:241

Warning
void Warning(const char *location, const char *msgfmt,...)
Use this function in warning situations.
Definition TError.cxx:252

TFile.h

TROOT.h

gROOT
#define gROOT
Definition TROOT.h:417

gRandom
externTRandom * gRandom
Definition TRandom.h:62

Form
char * Form(const char *fmt,...)
Formats a string in a circular formatting buffer.
Definition TString.cxx:2496

gSystem
externTSystem * gSystem
Definition TSystem.h:582

TTree.h

gPad
#define gPad
Definition TVirtualPad.h:322

TCanvas
The Canvas class.
Definition TCanvas.h:23

TCut
A specialized string object used for TTree selections.
Definition TCut.h:25

TF1
Definition TF1.h:182

TFile
A file, usually with extension .root, that stores data and code in the form of serialized objects in ...
Definition TFile.h:130

TFile::Open
static TFile * Open(const char *name, Option_t *option="", const char *ftitle="", Int_t compress=ROOT::RCompressionSetting::EDefaults::kUseCompiledDefault, Int_t netopt=0)
Create / open a file.
Definition TFile.cxx:3787

TFile::Close
void Close(Option_t *option="") override
Close a file.
Definition TFile.cxx:981

TH1D
1-D histogram with a double per channel (see TH1 documentation)
Definition TH1.h:926

TMVA::Config::Instance
static Config & Instance()
static function: returns TMVA instance
Definition Config.cxx:97

TMVA::DataLoader
Definition DataLoader.h:50

TMVA::DataLoader::AddVariablesArray
void AddVariablesArray(const TString &expression, int size, char type='F', Double_t min=0, Double_t max=0)
user inserts discriminating array of variables in data set info in case input tree provides an array ...
Definition DataLoader.cxx:511

TMVA::DataLoader::AddSignalTree
void AddSignalTree(TTree *signal, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
number of signal events (used to compute significance)
Definition DataLoader.cxx:378

TMVA::DataLoader::PrepareTrainingAndTestTree
void PrepareTrainingAndTestTree(const TCut &cut, const TString &splitOpt)
prepare the training and test trees -> same cuts for signal and background
Definition DataLoader.cxx:639

TMVA::DataLoader::AddBackgroundTree
void AddBackgroundTree(TTree *background, Double_t weight=1.0, Types::ETreeType treetype=Types::kMaxTreeType)
number of signal events (used to compute significance)
Definition DataLoader.cxx:409

TMVA::DataLoader::GetDataSetInfo
DataSetInfo & GetDataSetInfo()
Definition DataLoader.cxx:136

TMVA::DataSetInfo::GetListOfVariables
std::vector< TString > GetListOfVariables() const
returns list of variables
Definition DataSetInfo.cxx:406

TMVA::Factory
This is the main MVA steering class.
Definition Factory.h:80

TMVA::Factory::TrainAllMethods
void TrainAllMethods()
Iterates through all booked methods and calls training.
Definition Factory.cxx:1108

TMVA::Factory::TestAllMethods
void TestAllMethods()
Evaluates all booked methods on the testing data and adds the output to the Results in the corresponi...
Definition Factory.cxx:1265

TMVA::Factory::EvaluateAllMethods
void EvaluateAllMethods(void)
Iterates over all MVAs that have been booked, and calls their evaluation methods.
Definition Factory.cxx:1370

TMVA::Factory::BookMethod
MethodBase * BookMethod(DataLoader *loader, MethodName theMethodName, TString methodTitle, TString theOption="")
Books an MVA classifier or regression method.
Definition Factory.cxx:357

TMVA::Factory::GetROCCurve
TGraph * GetROCCurve(DataLoader *loader, TString theMethodName, Bool_t setTitles=kTRUE, UInt_t iClass=0, Types::ETreeType type=Types::kTesting)
Argument iClass specifies the class to generate the ROC curve in a multiclass setting.
Definition Factory.cxx:906

TMVA::PyMethodBase::PyInitialize
static void PyInitialize()
Initialize Python interpreter.
Definition PyMethodBase.cxx:152

TMVA::Types::kPyKeras
@ kPyKeras
Definition Types.h:103

TMVA::Types::kBDT
@ kBDT
Definition Types.h:86

TMVA::Types::kDL
@ kDL
Definition Types.h:99

TMacro
Class supporting a collection of lines with C++ code.
Definition TMacro.h:31

TString
Basic string class.
Definition TString.h:138

TString::Data
const char * Data() const
Definition TString.h:384

TString::Format
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and return a TString.
Definition TString.cxx:2385

TTree
A TTree represents a columnar dataset.
Definition TTree.h:89

ROOT::VecOps::cos
RVec< PromoteType< T > > cos(const RVec< T > &v)
Definition RVec.hxx:1848

ROOT::VecOps::sin
RVec< PromoteType< T > > sin(const RVec< T > &v)
Definition RVec.hxx:1847

c1
return c1
Definition legend1.C:41

n
const Int_t n
Definition legend1.C:16

h1
TH1F * h1
Definition legend1.C:5

f1
TF1 * f1
Definition legend1.C:11

ROOT::EnableImplicitMT
void EnableImplicitMT(UInt_t numthreads=0)
Enable ROOT's implicit multi-threading for all objects and methods that provide an internal paralleli...
Definition TROOT.cxx:613

ROOT::GetThreadPoolSize
UInt_t GetThreadPoolSize()
Returns the size of ROOT's thread pool.
Definition TROOT.cxx:676

TMVA_RNN_Classification
Definition TMVA_RNN_Classification.py:1

TMath::Pi
constexpr Double_t Pi()
Definition TMath.h:40

v2
@ v2
Definition rootcling_impl.cxx:3557

v
@ v
Definition rootcling_impl.cxx:3554

v1
@ v1
Definition rootcling_impl.cxx:3556

m
TMarker m
Definition textangle.C:8

Config.h

Factory.h