7#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
8#include <numpy/arrayobject.h>
86 "Specify as 0.2 or 20% to use a fifth of the data set as validation set."
87 "Specify as 100 to use exactly 100 events. (Default: 20%)");
88 DeclareOptionRef(
fUserCodeName =
"",
"UserCode",
"Necessary python code provided by the user to be executed before loading and training the PyTorch Model");
103 Int_t nValidationSamples = 0;
117 <<
"\". Expected string like \"20%\" or \"20.0%\"." <<
Endl;
122 if (valSizeAsDouble < 1.0) {
127 nValidationSamples = valSizeAsDouble;
130 Log() << kFATAL <<
"Cannot parse number \"" <<
fNumValidationString <<
"\". Expected string like \"0.2\" or \"100\"."
136 if (nValidationSamples < 0) {
140 if (nValidationSamples == 0) {
144 if (nValidationSamples >= (
Int_t)trainingSetSize) {
146 <<
"\" is larger than or equal in size to training set (size=\"" << trainingSetSize <<
"\")." <<
Endl;
149 return nValidationSamples;
163 Log() << kINFO <<
"Using PyTorch - setting special configuration options " <<
Endl;
164 PyRunString(
"import torch",
"Error importing pytorch");
170 PyRunString(
"torch_major_version = int(torch.__version__.split('.')[0])");
171 PyObject *pyTorchVersion = PyDict_GetItemString(
fLocalNS,
"torch_major_version");
172 int torchVersion = PyLong_AsLong(pyTorchVersion);
173 Log() << kINFO <<
"Using PyTorch version " << torchVersion <<
Endl;
177 if (num_threads > 0) {
178 Log() << kINFO <<
"Setting the CPU number of threads = " << num_threads <<
Endl;
196 Log() << kINFO <<
" Setup PyTorch Model for training" <<
Endl;
212 PyRunString(
"print('custom objects for loading model : ',load_model_custom_objects)");
215 PyRunString(
"fit = load_model_custom_objects[\"train_func\"]",
216 "Failed to load train function from file. Please use key: 'train_func' and pass training loop function as the value.");
217 Log() << kINFO <<
"Loaded pytorch train function: " <<
Endl;
221 PyRunString(
"if 'optimizer' in load_model_custom_objects:\n"
222 " optimizer = load_model_custom_objects['optimizer']\n"
224 " optimizer = torch.optim.SGD\n",
225 "Please use key: 'optimizer' and pass a pytorch optimizer as the value for a custom optimizer.");
226 Log() << kINFO <<
"Loaded pytorch optimizer: " <<
Endl;
230 PyRunString(
"criterion = load_model_custom_objects[\"criterion\"]",
231 "Failed to load loss function from file. Using MSE Loss as default. Please use key: 'criterion' and pass a pytorch loss function as the value.");
232 Log() << kINFO <<
"Loaded pytorch loss function: " <<
Endl;
236 PyRunString(
"predict = load_model_custom_objects[\"predict_func\"]",
237 "Can't find user predict function object from file. Please use key: 'predict' and pass a predict function for evaluating the model as the value.");
238 Log() << kINFO <<
"Loaded pytorch predict function: " <<
Endl;
243 if (loadTrainedModel) {
249 PyRunString(
"model = torch.jit.load('"+filenameLoadModel+
"')",
250 "Failed to load PyTorch model from file: "+filenameLoadModel);
251 Log() << kINFO <<
"Loaded model from file: " << filenameLoadModel <<
Endl;
260 Log() << kFATAL <<
"Selected analysis type is not implemented" <<
Endl;
263 Log() << kERROR <<
"Model does not have a number of inputs or output. Setup failed" <<
Endl;
275 size_t inputSize =
fNVars*nEvents;
280 if (inputSize > 0 && (
fVals.size() != inputSize ||
fPyVals ==
nullptr)) {
281 fVals.resize(inputSize);
282 npy_intp dimsVals[2] = {(npy_intp)nEvents, (npy_intp)
fNVars};
284 fPyVals = PyArray_SimpleNewFromData(2, dimsVals, NPY_FLOAT, (
void*)
fVals.data());
286 Log() << kFATAL <<
"Failed to load data to Python array" <<
Endl;
290 if (outputSize > 0 && (
fOutput.size() != outputSize ||
fPyOutput ==
nullptr)) {
294 npy_intp dimsOutput[2] = {(npy_intp)1, (npy_intp)
fNOutputs};
296 fPyOutput = PyArray_SimpleNewFromData(2, dimsOutput, NPY_FLOAT, (
void*)
fOutput.data());
298 Log() << kFATAL <<
"Failed to create output data Python array" <<
Endl;
310 Log() << kFATAL <<
"Python is not initialized" <<
Endl;
315 PyRunString(
"import sys; sys.argv = ['']",
"Set sys.argv failed");
316 PyRunString(
"import torch",
"import PyTorch failed");
320 Log() << kFATAL <<
"import torch in global namespace failed!" <<
Endl;
337 UInt_t nTrainingEvents = nAllEvents - nValEvents;
339 Log() << kINFO <<
"Split TMVA training data in " << nTrainingEvents <<
" training events and "
340 << nValEvents <<
" validation events" <<
Endl;
342 float* trainDataX =
new float[nTrainingEvents*
fNVars];
343 float* trainDataY =
new float[nTrainingEvents*
fNOutputs];
344 float* trainDataWeights =
new float[nTrainingEvents];
345 for (UInt_t i=0; i<nTrainingEvents; i++) {
348 for (UInt_t j=0; j<
fNVars; j++) {
349 trainDataX[j + i*
fNVars] =
e->GetValue(j);
362 trainDataY[j + i*
fNOutputs] =
e->GetTarget(j);
365 else Log() << kFATAL <<
"Can not fill target vector because analysis type is not known" <<
Endl;
368 trainDataWeights[i] =
e->GetWeight();
371 npy_intp dimsTrainX[2] = {(npy_intp)nTrainingEvents, (npy_intp)
fNVars};
372 npy_intp dimsTrainY[2] = {(npy_intp)nTrainingEvents, (npy_intp)
fNOutputs};
373 npy_intp dimsTrainWeights[1] = {(npy_intp)nTrainingEvents};
374 PyArrayObject* pTrainDataX = (PyArrayObject*)PyArray_SimpleNewFromData(2, dimsTrainX, NPY_FLOAT, (
void*)trainDataX);
375 PyArrayObject* pTrainDataY = (PyArrayObject*)PyArray_SimpleNewFromData(2, dimsTrainY, NPY_FLOAT, (
void*)trainDataY);
376 PyArrayObject* pTrainDataWeights = (PyArrayObject*)PyArray_SimpleNewFromData(1, dimsTrainWeights, NPY_FLOAT, (
void*)trainDataWeights);
379 PyDict_SetItemString(
fLocalNS,
"trainWeights", (
PyObject*)pTrainDataWeights);
389 float* valDataX =
new float[nValEvents*
fNVars];
390 float* valDataY =
new float[nValEvents*
fNOutputs];
391 float* valDataWeights =
new float[nValEvents];
393 for (UInt_t i=0; i< nValEvents ; i++) {
394 UInt_t ievt = nTrainingEvents + i;
397 for (UInt_t j=0; j<
fNVars; j++) {
398 valDataX[j + i*
fNVars] =
e->GetValue(j);
412 else Log() << kFATAL <<
"Can not fill target vector because analysis type is not known" <<
Endl;
414 valDataWeights[i] =
e->GetWeight();
417 npy_intp dimsValX[2] = {(npy_intp)nValEvents, (npy_intp)
fNVars};
418 npy_intp dimsValY[2] = {(npy_intp)nValEvents, (npy_intp)
fNOutputs};
419 npy_intp dimsValWeights[1] = {(npy_intp)nValEvents};
420 PyArrayObject* pValDataX = (PyArrayObject*)PyArray_SimpleNewFromData(2, dimsValX, NPY_FLOAT, (
void*)valDataX);
421 PyArrayObject* pValDataY = (PyArrayObject*)PyArray_SimpleNewFromData(2, dimsValY, NPY_FLOAT, (
void*)valDataY);
422 PyArrayObject* pValDataWeights = (PyArrayObject*)PyArray_SimpleNewFromData(1, dimsValWeights, NPY_FLOAT, (
void*)valDataWeights);
430 Log() << kINFO <<
"Print Training Model Architecture" <<
Endl;
437 PyDict_SetItemString(
fLocalNS,
"batchSize", pBatchSize);
438 PyDict_SetItemString(
fLocalNS,
"numEpochs", pNumEpochs);
441 PyRunString(
"train_dataset = torch.utils.data.TensorDataset(torch.Tensor(trainX), torch.Tensor(trainY))",
442 "Failed to create pytorch train Dataset.");
444 PyRunString(
"train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batchSize, shuffle=False)",
445 "Failed to create pytorch train Dataloader.");
449 PyRunString(
"val_dataset = torch.utils.data.TensorDataset(torch.Tensor(valX), torch.Tensor(valY))",
450 "Failed to create pytorch validation Dataset.");
452 PyRunString(
"val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batchSize, shuffle=False)",
453 "Failed to create pytorch validation Dataloader.");
460 "schedulerSteps = {}\n"
461 "for c in strScheduleSteps.split(';'):\n"
462 " x = c.split(',')\n"
463 " schedulerSteps[int(x[0])] = float(x[1])\n",
467 PyRunString(
"def schedule(optimizer, epoch, schedulerSteps=schedulerSteps):\n"
468 " if epoch in schedulerSteps:\n"
469 " for param_group in optimizer.param_groups:\n"
470 " param_group['lr'] = float(schedulerSteps[epoch])\n",
477 PyRunString(
"schedule = None; schedulerSteps = None",
"Failed to set scheduler to None.");
484 " if curr_val<=best_val:\n"
485 " best_val = curr_val\n"
486 " best_model_jitted = torch.jit.script(model)\n"
487 " torch.jit.save(best_model_jitted, save_path)\n"
489 "Failed to setup training with option: SaveBestOnly");
490 Log() << kINFO <<
"Option SaveBestOnly: Only model weights with smallest validation loss will be stored" <<
Endl;
493 PyRunString(
"save_best = None",
"Failed to set save_best to None.");
500 PyRunString(
"trained_model = fit(model, train_loader, val_loader, num_epochs=numEpochs, batch_size=batchSize,"
501 "optimizer=optimizer, criterion=criterion, save_best=save_best, scheduler=(schedule, schedulerSteps))",
502 "Failed to train model");
513 PyRunString(
"trained_model_jitted = torch.jit.script(trained_model)",
514 "Model not scriptable. Failed to convert to torch script.");
526 delete[] trainDataWeights;
529 delete[] valDataWeights;
551 for (UInt_t i=0; i<
fNVars; i++)
fVals[i] =
e->GetValue(i);
552 PyRunString(
"for i,p in enumerate(predict(model, vals)): output[i]=p\n",
553 "Failed to get predictions");
570 if (firstEvt > lastEvt || lastEvt > nEvents) lastEvt = nEvents;
571 if (firstEvt < 0) firstEvt = 0;
572 nEvents = lastEvt-firstEvt;
577 for (UInt_t i=0; i<nEvents; i++) {
580 for (UInt_t j=0; j<
fNVars; j++) {
588 if (pModel==0)
Log() << kFATAL <<
"Failed to get model Python object" <<
Endl;
591 if (pPredict==0)
Log() << kFATAL <<
"Failed to get Python predict function" <<
Endl;
595 PyArrayObject* pPredictions = (PyArrayObject*) PyObject_CallFunctionObjArgs(pPredict, pModel,
fPyVals, NULL);
596 if (pPredictions==0)
Log() << kFATAL <<
"Failed to get predictions" <<
Endl;
600 std::vector<double> mvaValues(nEvents);
601 float* predictionsData = (
float*) PyArray_DATA(pPredictions);
602 for (UInt_t i=0; i<nEvents; i++) {
606 Py_DECREF(pPredictions);
622 for (UInt_t i=0; i<
fNVars; i++)
fVals[i] =
e->GetValue(i);
624 PyRunString(
"for i,p in enumerate(predict(model, vals)): output[i]=p\n",
625 "Failed to get predictions");
655 for (UInt_t i=0; i<nEvents; i++) {
658 for (UInt_t j=0; j<
fNVars; j++) {
665 if (pModel==0)
Log() << kFATAL <<
"Failed to get model Python object" <<
Endl;
668 if (pPredict==0)
Log() << kFATAL <<
"Failed to get Python predict function" <<
Endl;
670 std::cout <<
" calling predict functon for regression \n";
672 PyArrayObject* pPredictions = (PyArrayObject*) PyObject_CallFunctionObjArgs(pPredict, pModel,
fPyVals, NULL);
673 if (pPredictions==0)
Log() << kFATAL <<
"Failed to get predictions" <<
Endl;
676 float* predictionsData = (
float*) PyArray_DATA(pPredictions);
680 for (UInt_t ievt = 0; ievt < nEvents; ievt++) {
692 Py_DECREF(pPredictions);
708 for (UInt_t i=0; i<
fNVars; i++)
fVals[i] =
e->GetValue(i);
709 PyRunString(
"for i,p in enumerate(predict(model, vals)): output[i]=p\n",
710 "Failed to get predictions");
726 for (UInt_t i=0; i<nEvents; i++) {
729 for (UInt_t j=0; j<
fNVars; j++) {
736 if (pModel==0)
Log() << kFATAL <<
"Failed to get model Python object" <<
Endl;
739 if (pPredict==0)
Log() << kFATAL <<
"Failed to get Python predict function" <<
Endl;
743 PyArrayObject* pPredictions = (PyArrayObject*) PyObject_CallFunctionObjArgs(pPredict, pModel,
fPyVals, NULL);
744 if (pPredictions==0)
Log() << kFATAL <<
"Failed to get predictions" <<
Endl;
747 float* predictionsData = (
float*) PyArray_DATA(pPredictions);
749 std::copy(predictionsData, predictionsData+nEvents*
fNOutputs,
fOutput.begin());
751 Py_DECREF(pPredictions);
763 Log() <<
"PyTorch is a scientific computing package supporting" <<
Endl;
764 Log() <<
"automatic differentiation. This method wraps the training" <<
Endl;
765 Log() <<
"and predictions steps of the PyTorch Python package for" <<
Endl;
766 Log() <<
"TMVA, so that dataloading, preprocessing and evaluation" <<
Endl;
767 Log() <<
"can be done within the TMVA system. To use this PyTorch" <<
Endl;
768 Log() <<
"interface, you need to generatea model with PyTorch first." <<
Endl;
769 Log() <<
"Then, this model can be loaded and trained in TMVA." <<
Endl;
#define REGISTER_METHOD(CLASS)
for example
int Int_t
Signed integer 4 bytes (int).
bool Bool_t
Boolean (0=false, 1=true) (bool).
double Double_t
Double 8 bytes.
OptionBase * DeclareOptionRef(T &ref, const TString &name, const TString &desc="")
Class that contains all the data information.
UInt_t GetNClasses() const
UInt_t GetNTargets() const
Long64_t GetNEvents(Types::ETreeType type=Types::kMaxTreeType) const
Long64_t GetNTrainingEvents() const
void SetCurrentEvent(Long64_t ievt) const
void SetTarget(UInt_t itgt, Float_t value)
set the target value (dimension itgt) to value
Float_t GetTarget(UInt_t itgt) const
PyGILState_STATE m_GILState
const char * GetName() const override
Types::EAnalysisType GetAnalysisType() const
const std::vector< TMVA::Event * > & GetEventCollection(Types::ETreeType type)
returns the event collection (i.e.
const TString & GetWeightFileDir() const
const Event * GetEvent() const
DataSetInfo & DataInfo() const
virtual void TestClassification()
initialization
UInt_t GetNVariables() const
TransformationHandler & GetTransformationHandler(Bool_t takeReroutedIfAvailable=true)
void NoErrorCalc(Double_t *const err, Double_t *const errUpper)
const Event * GetTrainingEvent(Long64_t ievt) const
void InitEvaluation(size_t nEvents)
void GetHelpMessage() const override
Double_t GetMvaValue(Double_t *errLower, Double_t *errUpper) override
std::vector< Float_t > & GetRegressionValues() override
void ProcessOptions() override
Bool_t HasAnalysisType(Types::EAnalysisType type, UInt_t numberClasses, UInt_t) override
std::vector< float > fOutput
void ReadModelFromFile() override
MethodPyTorch(const TString &jobName, const TString &methodTitle, DataSetInfo &dsi, const TString &theOption="")
std::vector< Float_t > GetAllMulticlassValues() override
Get all multi-class values.
std::vector< Double_t > GetMvaValues(Long64_t firstEvt, Long64_t lastEvt, Bool_t logProgress) override
get all the MVA values for the events of the current Data type
std::vector< float > fVals
std::vector< Float_t > & GetMulticlassValues() override
std::vector< Float_t > GetAllRegressionValues() override
Get all regression values in one call.
void TestClassification() override
initialization
TString fNumValidationString
UInt_t GetNumValidationSamples()
Validation of the ValidationSize option.
TString fLearningRateSchedule
TString fFilenameTrainedModel
void SetupPyTorchModel(Bool_t loadTrainedModel)
void DeclareOptions() override
static int PyIsInitialized()
Check Python interpreter initialization status.
static PyObject * fGlobalNS
PyMethodBase(const TString &jobName, Types::EMVA methodType, const TString &methodTitle, DataSetInfo &dsi, const TString &theOption="")
void PyRunString(TString code, TString errorMessage="Failed to run python code", int start=256)
Execute Python code from string.
Singleton class for Global types used by TMVA.
@ kSignal
Never change this number - it is elsewhere assumed to be zero !
Bool_t IsFloat() const
Returns kTRUE if string contains a floating point or integer number.
static TString Format(const char *fmt,...)
Static method which formats a string using a printf style format descriptor and returns a TString.
create variable transformations
MsgLogger & Endl(MsgLogger &ml)