45 TMVA::RuleFitAPI::RuleFitAPI( const MethodRuleFit *rfbase,
48 fMethodRuleFit(rfbase),
51 fLogger("RuleFitAPI",minType)
55 SetRFWorkDir(rfbase->GetRFWorkDir());
57 SetRFWorkDir(
"./rulefit");
77 <<
"---------------------------------------------------------------------------\n"
78 <<
"- You are running the interface to Jerome Friedmans RuleFit(tm) code. -\n"
79 <<
"- For a full manual see the following web page: -\n"
81 <<
"- http://www-stat.stanford.edu/~jhf/R-RuleFit.html -\n"
83 <<
"---------------------------------------------------------------------------"
93 <<
"------------------------ RULEFIT-JF INTERFACE SETUP -----------------------\n"
95 <<
"1. Create a rulefit directory in your current work directory:\n"
96 <<
" mkdir " << fRFWorkDir <<
"\n\n"
97 <<
" the directory may be set using the option RuleFitDir\n"
99 <<
"2. Copy (or make a link) the file rf_go.exe into this directory\n"
101 <<
"The file can be obtained from Jerome Friedmans homepage (linux):\n"
102 <<
" wget http://www-stat.stanford.edu/~jhf/r-rulefit/linux/rf_go.exe\n"
104 <<
"Don't forget to do:\n"
105 <<
" chmod +x rf_go.exe\n"
107 <<
"For Windows download:\n"
108 <<
" http://www-stat.stanford.edu/~jhf/r-rulefit/windows/rf_go.exe\n"
110 <<
"NOTE: other platforms are not supported (see Friedmans homepage)\n"
112 <<
"---------------------------------------------------------------------------\n"
131 fRFIntParms.p = fMethodRuleFit->DataInfo().GetNVariables();
132 fRFIntParms.max_rules = fMethodRuleFit->GetRFNrules();
133 fRFIntParms.tree_size = fMethodRuleFit->GetRFNendnodes();
134 fRFIntParms.path_steps = fMethodRuleFit->GetGDNPathSteps();
136 fRFRealParms.path_inc = fMethodRuleFit->GetGDPathStep();
137 fRFRealParms.samp_fract = fMethodRuleFit->GetTreeEveFrac();
138 fRFRealParms.trim_qntl = fMethodRuleFit->GetLinQuantile();
139 fRFRealParms.conv_fac = fMethodRuleFit->GetGDErrScale();
141 if (fRuleFit->GetRuleEnsemblePtr()->DoOnlyLinear() )
142 fRFIntParms.lmode = kRfLinear;
143 else if (fRuleFit->GetRuleEnsemblePtr()->DoOnlyRules() )
144 fRFIntParms.lmode = kRfRules;
146 fRFIntParms.lmode = kRfBoth;
167 fLogger <<
kWARNING <<
"Must create a rulefit directory named : " << fRFWorkDir <<
Endl;
169 fLogger <<
kFATAL <<
"Setup failed - aborting!" <<
Endl;
172 FILE *
f = fopen(
"rf_go.exe",
"r");
174 fLogger <<
kWARNING <<
"No rf_go.exe file in directory : " << fRFWorkDir <<
Endl;
176 fLogger <<
kFATAL <<
"Setup failed - aborting!" <<
Endl;
189 Int_t n = fMethodRuleFit->Data()->GetNTrainingEvents();
192 fRFProgram = kRfTrain;
201 Int_t n = fMethodRuleFit->Data()->GetNTestEvents();
204 fRFProgram = kRfPredict;
212 fRFRealParms.xmiss = 9.0e30;
213 fRFRealParms.trim_qntl = 0.025;
214 fRFRealParms.huber = 0.8;
215 fRFRealParms.inter_supp = 3.0;
216 fRFRealParms.memory_par = 0.01;
217 fRFRealParms.samp_fract = 0.5;
218 fRFRealParms.path_inc = 0.01;
219 fRFRealParms.conv_fac = 1.1;
227 fRFIntParms.mode = (int)kRfClass;
228 fRFIntParms.lmode = (int)kRfBoth;
231 fRFIntParms.max_rules = 2000;
232 fRFIntParms.tree_size = 4;
233 fRFIntParms.path_speed = 2;
234 fRFIntParms.path_xval = 3;
235 fRFIntParms.path_steps = 50000;
236 fRFIntParms.path_testfreq = 100;
237 fRFIntParms.tree_store = 10000000;
238 fRFIntParms.cat_store = 1000000;
252 if (fRFProgram==kRfTrain) WriteTrain();
253 if (fRFProgram==kRfPredict) WriteTest();
254 if (fRFProgram==kRfVarimp) WriteRealVarImp();
264 if (!OpenRFile(
"intparms",f))
return kFALSE;
265 WriteInt(f,&fRFIntParms.mode,
sizeof(fRFIntParms)/
sizeof(
Int_t));
275 if (!OpenRFile(
"realparms",f))
return kFALSE;
276 WriteFloat(f,&fRFRealParms.xmiss,
sizeof(fRFRealParms)/
sizeof(
Float_t));
291 fRFLx.resize(fMethodRuleFit->DataInfo().GetNVariables(),1);
294 if (!OpenRFile(
"lx",f))
return kFALSE;
295 WriteInt(f,&fRFLx[0],fRFLx.size());
305 if (!OpenRFile(
"program",f))
return kFALSE;
307 switch (fRFProgram) {
312 program =
"rulefit_pred";
319 fRFProgram = kRfTrain;
333 if (!OpenRFile(
"realvarimp",f))
return kFALSE;
337 WriteFloat(f,&rvp[0],2);
346 fLogger <<
kWARNING <<
"WriteRfOut is not yet implemented" <<
Endl;
355 fLogger <<
kWARNING <<
"WriteRfStatus is not yet implemented" <<
Endl;
364 fLogger <<
kWARNING <<
"WriteRuleFitMod is not yet implemented" <<
Endl;
373 fLogger <<
kWARNING <<
"WriteRuleFitSum is not yet implemented" <<
Endl;
386 if (!OpenRFile(
"train.x",fx))
return kFALSE;
387 if (!OpenRFile(
"train.y",fy))
return kFALSE;
388 if (!OpenRFile(
"train.w",fw))
return kFALSE;
395 for (
UInt_t ivar=0; ivar<fMethodRuleFit->DataInfo().GetNVariables(); ivar++) {
396 for (
Int_t ievt=0;ievt<fMethodRuleFit->Data()->GetNTrainingEvents(); ievt++) {
397 const Event * ev = fMethodRuleFit->GetTrainingEvent(ievt);
402 y = fMethodRuleFit->DataInfo().IsSignal(ev)? 1.0 : -1.0;
408 fLogger <<
kINFO <<
"Number of training data written: " << fMethodRuleFit->Data()->GetNTrainingEvents() <<
Endl;
421 if (!OpenRFile(
"test.x",f))
return kFALSE;
426 neve =
static_cast<Float_t>(fMethodRuleFit->Data()->GetNEvents());
427 WriteFloat(f,&neve,1);
433 for (
UInt_t ivar=0; ivar<fMethodRuleFit->DataInfo().GetNVariables(); ivar++) {
434 for (
Int_t ievt=0;ievt<fMethodRuleFit->Data()->GetNEvents(); ievt++) {
435 vf = fMethodRuleFit->GetEvent(ievt)->GetValue(ivar);
439 fLogger <<
kINFO <<
"Number of test data written: " << fMethodRuleFit->Data()->GetNEvents() <<
Endl;
450 if (!OpenRFile(
"varnames",f))
return kFALSE;
451 for (
UInt_t ivar=0; ivar<fMethodRuleFit->DataInfo().GetNVariables(); ivar++) {
452 f << fMethodRuleFit->DataInfo().GetVariableInfo(ivar).GetExpression() <<
'\n';
463 fLogger <<
kWARNING <<
"WriteVarImp is not yet implemented" <<
Endl;
472 fLogger <<
kWARNING <<
"WriteYhat is not yet implemented" <<
Endl;
484 if (!OpenRFile(
"yhat",f))
return kFALSE;
487 ReadFloat(f,&xval,1);
488 neve =
static_cast<Int_t>(xval);
489 if (neve!=fMethodRuleFit->Data()->GetNTestEvents()) {
490 fLogger <<
kWARNING <<
"Inconsistent size of yhat file and test tree!" <<
Endl;
491 fLogger <<
kWARNING <<
"neve = " << neve <<
" , tree = " << fMethodRuleFit->Data()->GetNTestEvents() <<
Endl;
494 for (
Int_t ievt=0; ievt<fMethodRuleFit->Data()->GetNTestEvents(); ievt++) {
495 ReadFloat(f,&xval,1);
496 fRFYhat.push_back(xval);
509 if (!OpenRFile(
"varimp",f))
return kFALSE;
513 nvars=fMethodRuleFit->DataInfo().GetNVariables();
517 for (
UInt_t ivar=0; ivar<nvars; ivar++) {
518 ReadFloat(f,&xval,1);
522 if (xval>xmax) xmax=xval;
524 fRFVarImp.push_back(xval);
530 for (
UInt_t ivar=0; ivar<nvars; ivar++) {
531 fRFVarImp[ivar] = fRFVarImp[ivar]/
xmax;
532 ReadFloat(f,&xval,1);
533 fRFVarImpInd.push_back(
Int_t(xval)-1);
545 fLogger <<
kVERBOSE <<
"Reading RuleFit summary file" <<
Endl;
547 if (!OpenRFile(
"rulefit.sum",f))
return kFALSE;
558 fRuleFit->GetRuleEnsemblePtr()->SetAverageRuleSigma(0.4);
586 lines += ReadInt(f,&nrules);
587 norules = (nrules==1);
588 lines += ReadInt(f,&dumI);
589 norules = norules && (dumI==1);
590 lines += ReadInt(f,&dumI);
591 norules = norules && (dumI==1);
592 lines += ReadInt(f,&dumI);
593 norules = norules && (dumI==0);
594 if (nrules==0) norules=
kTRUE;
595 if (norules) nrules = 0;
597 lines += ReadInt(f,&nvars);
598 lines += ReadInt(f,&nvarsOpt);
599 lines += ReadFloat(f,&dumF);
600 lines += ReadFloat(f,&offset);
601 fLogger <<
kDEBUG <<
"N(rules) = " << nrules <<
Endl;
602 fLogger <<
kDEBUG <<
"N(vars) = " << nvars <<
Endl;
603 fLogger <<
kDEBUG <<
"N(varsO) = " << nvarsOpt <<
Endl;
604 fLogger <<
kDEBUG <<
"xmiss = " << dumF <<
Endl;
605 fLogger <<
kDEBUG <<
"offset = " << offset <<
Endl;
606 if (nvars!=nvarsOpt) {
607 fLogger <<
kWARNING <<
"Format of rulefit.sum is ... weird?? Continuing but who knows how it will end...?" <<
Endl;
609 std::vector<Double_t> rfSupp;
610 std::vector<Double_t> rfCoef;
611 std::vector<Int_t> rfNcut;
612 std::vector<Rule *> rfRules;
616 for (
Int_t t=0; t<8; t++) {
617 lines += ReadFloat(f,&dumF);
632 lines += ReadFloat(f,&dumF);
633 lines += ReadFloat(f,&dumF);
634 rfSupp.push_back(dumF);
635 lines += ReadFloat(f,&dumF);
636 rfCoef.push_back(dumF);
637 lines += ReadFloat(f,&dumF);
638 rfNcut.push_back(static_cast<int>(dumF+0.5));
639 lines += ReadFloat(f,&dumF);
656 Rule *rule =
new Rule(fRuleFit->GetRuleEnsemblePtr());
657 rfRules.push_back( rule );
675 if (imp>impref) impref = imp;
677 fLogger <<
kDEBUG <<
"Rule #" << r <<
" : " << nvars <<
Endl;
678 fLogger <<
kDEBUG <<
" support = " << rfSupp[
r] <<
Endl;
680 fLogger <<
kDEBUG <<
" coeff = " << rfCoef[
r] <<
Endl;
681 fLogger <<
kDEBUG <<
" N(cut) = " << rfNcut[
r] <<
Endl;
684 lines += ReadFloat(f,&dumF);
685 varind =
static_cast<Int_t>(dumF+0.5)-1;
686 lines += ReadFloat(f,&dumF);
688 lines += ReadFloat(f,&dumF);
701 fRuleFit->GetRuleEnsemblePtr()->SetRules( rfRules );
702 fRuleFit->GetRuleEnsemblePtr()->SetOffset( offset );
715 std::vector<Int_t> varind;
716 std::vector<Double_t>
xmin;
717 std::vector<Double_t>
xmax;
718 std::vector<Double_t> average;
719 std::vector<Double_t> stdev;
720 std::vector<Double_t>
norm;
721 std::vector<Double_t> coeff;
724 lines += ReadFloat(f,&dumF);
725 varind.push_back(static_cast<Int_t>(dumF+0.5)-1);
726 lines += ReadFloat(f,&dumF);
727 xmin.push_back(static_cast<Double_t>(dumF));
728 lines += ReadFloat(f,&dumF);
729 xmax.push_back(static_cast<Double_t>(dumF));
730 lines += ReadFloat(f,&dumF);
731 average.push_back(static_cast<Double_t>(dumF));
732 lines += ReadFloat(f,&dumF);
733 stdev.push_back(static_cast<Double_t>(dumF));
734 Double_t nv = fRuleFit->GetRuleEnsemblePtr()->CalcLinNorm(stdev.back());
736 lines += ReadFloat(f,&dumF);
737 coeff.push_back(dumF/nv);
740 fLogger <<
kDEBUG <<
" varind = " << varind.back() <<
Endl;
741 fLogger <<
kDEBUG <<
" xmin = " << xmin.back() <<
Endl;
742 fLogger <<
kDEBUG <<
" xmax = " << xmax.back() <<
Endl;
743 fLogger <<
kDEBUG <<
" average = " << average.back() <<
Endl;
744 fLogger <<
kDEBUG <<
" stdev = " << stdev.back() <<
Endl;
745 fLogger <<
kDEBUG <<
" coeff = " << coeff.back() <<
Endl;
748 fRuleFit->GetRuleEnsemblePtr()->SetLinCoefficients(coeff);
749 fRuleFit->GetRuleEnsemblePtr()->SetLinDM(xmin);
750 fRuleFit->GetRuleEnsemblePtr()->SetLinDP(xmax);
751 fRuleFit->GetRuleEnsemblePtr()->SetLinNorm(norm);
754 imp = fRuleFit->GetRuleEnsemblePtr()->CalcLinImportance();
755 if (imp>impref) impref=imp;
756 fRuleFit->GetRuleEnsemblePtr()->SetImportanceRef(impref);
757 fRuleFit->GetRuleEnsemblePtr()->CleanupLinear();
759 fRuleFit->GetRuleEnsemblePtr()->CalcVarImportance();
762 fLogger <<
kDEBUG <<
"Reading model done" <<
Endl;
Bool_t WriteLx()
Save input variable mask.
Bool_t ReadVarImp()
read variable importance
void WelcomeMessage()
welcome message
void SetCoefficient(Double_t v)
void HowtoSetupRF()
howto message
MsgLogger & Endl(MsgLogger &ml)
void SetSSBNeve(Double_t v)
ClassImp(TMVA::RuleFitAPI) TMVA
void SetRuleCut(RuleCut *rc)
void SetCutMax(Int_t i, Double_t v)
virtual ~RuleFitAPI()
destructor
Bool_t WriteRfStatus()
written by rf_go.exe; write rulefit status
void SetRFWorkDir(const char *wdir)
set the directory containing rf_go.exe.
Bool_t WriteAll()
write all files read by rf_go.exe
void FillIntParmsDef()
set default int params
Bool_t cd(const char *path)
Bool_t WriteTrain()
write training data, columnwise
Bool_t WriteRfOut()
written by rf_go.exe; write rulefit output (rfout)
Bool_t WriteIntParms()
write int params file
Bool_t WriteRealVarImp()
write the minimum importance to be considered
void ImportSetup()
import setup from MethodRuleFit
Double_t GetWeight() const
return the event weight - depending on whether the flag IgnoreNegWeightsInTraining is set or not...
void CheckRFWorkDir()
check if the rulefit work dir is properly setup.
Float_t GetValue(UInt_t ivar) const
return value of i'th variable
const char * Data() const
void SetCutMin(Int_t i, Double_t v)
Bool_t WriteProgram()
write command to rf_go.exe
void FillRealParmsDef()
set default real params
void SetCutDoMin(Int_t i, Bool_t v)
if(pyself &&pyself!=Py_None)
Bool_t ReadModelSum()
read model from rulefit.sum
void SetSelector(Int_t i, UInt_t s)
Bool_t WriteRuleFitMod()
written by rf_go.exe (NOTE: format unknown!)
void SetTrainParms()
set the training parameters
R__EXTERN TSystem * gSystem
Double_t GetImportance() const
Double_t GetSigma() const
virtual Int_t Exec(const char *shellcmd)
Execute a command.
void SetTestParms()
set the test params
void SetImportanceRef(Double_t v)
void SetSupport(Double_t v)
void SetNorm(Double_t norm)
void SetCutDoMax(Int_t i, Bool_t v)
Bool_t WriteVarNames()
write variable names, ascii
Bool_t WriteTest()
Write test data.
Abstract ClassifierFactory template that handles arbitrary types.
Bool_t WriteYhat()
written by rf_go.exe
Bool_t WriteRuleFitSum()
written by rf_go.exe (NOTE: format unknown!)
Bool_t ReadYhat()
read the score
Int_t RunRuleFit()
execute rf_go.exe
Bool_t WriteRealParms()
write real params file
double norm(double *x, double *p)
void InitRuleFit()
default initialisation; sets the work directory by calling SetRFWorkDir("./rulefit")