#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>
#include <vector>

template<typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t   fBatchSize;        ///< Batch size to use for the training.
   size_t   fStepCount;        ///< Number of steps performed in the current training session.
   size_t   fConvergenceSteps; ///< Number of training epochs without considerable
                               ///< decrease in the test error for convergence.
   size_t   fConvergenceCount; ///< Current number of training epochs without
                               ///< considerable decrease in the test error.
   size_t   fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;    ///< Holds the most recently computed training loss.
   Scalar_t fTestError;        ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;     ///< Learning rate \f$\alpha\f$.
   Scalar_t fMinimumError;     ///< The minimum loss achieved on the training set.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate,
                    size_t convergenceSteps,
                    size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError     = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount        = 0;
   }

   /** Train the given net using the given training input data (events), training
    *  output data (labels), test input data (events), and test output data (labels). */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                  const Data_t & TestDataIn, size_t nTestSamples,
                  Net_t & net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t & TrainingDataIn, size_t nTrainingSamples,
                          const Data_t & TestDataIn, size_t nTestSamples,
                          Net_t & net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on a given batch. */
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Same as Step(...) but also evaluate the loss on the given training data. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Perform a single optimization step for each of the given batches, using one
    *  worker net per batch, and accumulate the result into the master net. */
   template <typename Net_t>
   void Step(Net_t &master,
             std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master,
                     std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches,
                     Scalar_t momentum);

   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
                                   const Matrix_t &output, const Matrix_t &weights);

   /** Increases the minimization step counter by the test error evaluation
    *  period and uses the current internal value of the test error to
    *  determine if the minimization has converged. */
   bool HasConverged();

   /** Same as HasConverged() but uses the given test error instead of the
    *  internally stored one. */
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const { return fConvergenceCount; }
   size_t   GetConvergenceSteps() const { return fConvergenceSteps; }
   Scalar_t GetTrainingError() const    { return fTrainingError; }
   Scalar_t GetTestError() const        { return fTestError; }
   size_t   GetTestInterval() const     { return fTestInterval; }

   void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
   void SetTestInterval(size_t interval)  { fTestInterval = interval; }
   void SetLearningRate(Scalar_t rate)    { fLearningRate = rate; }
   void SetBatchSize(Scalar_t rate)       { fBatchSize = rate; }
};
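// Illustrative usage sketch (not part of the original header). The concrete
// net type and the TReference<double> backend are assumptions, and
// MakeTrainingData()/MakeTestData() are hypothetical helpers standing in for
// whatever produces the Data_t containers:
//
//    TGradientDescent<TReference<double>> minimizer(
//        0.001,  // learning rate alpha
//        10,     // convergence steps: test intervals without improvement
//        5);     // test interval: evaluate the test error every 5 epochs
//
//    auto trainData = MakeTrainingData();  // hypothetical helper
//    auto testData  = MakeTestData();      // hypothetical helper
//    double minError = minimizer.Train(trainData, nTrainingSamples,
//                                      testData, nTestSamples,
//                                      net, /*nThreads=*/4);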
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0),
     fTestInterval(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate,
                                                   size_t convergenceSteps,
                                                   size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps),
     fConvergenceCount(0), fTestInterval(testInterval), fLearningRate(learningRate),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t & trainingData,
                                             size_t nTrainingSamples,
                                             const Data_t & testData,
                                             size_t nTestSamples,
                                             Net_t & net,
                                             size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare the training and test data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // Create one worker net per thread, initialized with the master weights.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      // Evaluate the loss on the test set.
      auto b            = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
   } while (!HasConverged());

   return fMinimumError;
}
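// Shape of the training loop above: each parallel step consumes nThreads
// batches, one per worker net; after every fTestInterval epochs the loss of
// the cloned test net is recomputed into fTestError, and HasConverged()
// decides whether the test error has stagnated for long enough to stop.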
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t & trainingData,
                                                     size_t nTrainingSamples,
                                                     const Data_t & testData,
                                                     size_t nTestSamples,
                                                     Net_t & net,
                                                     Scalar_t momentum,
                                                     size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare the training and test data.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(),
                                                   net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();

   // Create one worker net per thread, initialized with the master weights.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++)
      {
         auto &masterLayer = net.GetLayer(j);
         auto &layer       = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      // Evaluate the test loss batch by batch and average it.
      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b            = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= static_cast<Scalar_t>(batchesInEpoch);
   } while (!HasConverged());

   return fMinimumError;
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & net,
                                                   Matrix_t &input,
                                                   const Matrix_t &output,
                                                   const Matrix_t &weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
}
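// Step thus performs one plain stochastic gradient descent update: with
// learning rate alpha = fLearningRate, the ScaleAdd calls above compute
//
//    W <- W - alpha * dL/dW   and   b <- b - alpha * dL/db
//
// for the weights W and biases b of every layer.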
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t & net,
                                                       Matrix_t &input,
                                                       const Matrix_t &output,
                                                       const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(),
                               layer.GetBiasGradients(),
                               -fLearningRate);
   }
   return loss;
}
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t & master,
                                                   std::vector<Net_t> & nets,
                                                   std::vector<TBatch<Architecture_t>> & batches)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over the worker nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradient of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation, interleaved over the worker nets.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Apply the worker gradients to the master net and copy the updated
   // weights back into the workers.
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++)
      {
         auto &masterLayer = master.GetLayer(i);
         auto &layer       = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate / nets.size());
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate / nets.size());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
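// Net effect of the multi-batch Step: each worker j contributes its batch
// gradient g_j with weight -fLearningRate / nets.size(), so the master
// performs
//
//    W <- W - (alpha / n) * sum_j g_j,      n = nets.size(),
//
// i.e. one descent step along the gradient averaged over the n concurrently
// processed batches.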
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(Net_t & master,
                                                           std::vector<Net_t> & nets,
                                                           std::vector<TBatch<Architecture_t>> & batches,
                                                           Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over the worker nets.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradient of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation; accumulate the worker gradients into the master
   // gradient buffers, which hold the momentum velocity.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  -fLearningRate / momentum / nets.size());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  -fLearningRate / momentum / nets.size());
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               -fLearningRate / momentum / nets.size());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               -fLearningRate / momentum / nets.size());
   }
   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   // Apply the velocity to the master weights and copy them back into the
   // workers.
   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
      }
   }
}
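// In StepMomentum the master gradient buffers play the role of the momentum
// velocity v. Accumulating each worker gradient with factor
// -fLearningRate / momentum / nets.size() and then rescaling the buffer by
// (momentum - 1.0), i.e. v <- momentum * v, yields the classical update
//
//    v <- momentum * v - alpha * g_avg,    W <- W + v,
//
// where g_avg is the gradient averaged over the worker batches.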
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(Net_t & master,
                                                           std::vector<Net_t> & nets,
                                                           std::vector<TBatch<Architecture_t>> & batches,
                                                           Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0,0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over the worker nets. The workers hold
   // the look-ahead weights set at the end of the previous step.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i-1).GetOutput(), true);
      }
   }

   // Gradient of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(),
                                        nets[j].GetLossFunction(),
                                        batches[j].GetOutput(),
                                        nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation, interleaved over the worker nets.
   for (size_t i = depth - 1; i > 0; i--)
   {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i-1).GetActivationGradients(),
                                      nets[j].GetLayer(i-1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t i = 0; i < depth; i++)
   {
      auto &masterLayer = master.GetLayer(i);
      // Stage the workers at the look-ahead point for the next gradient
      // evaluation.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(),
                              masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(),
                              masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(),
                                  masterLayer.GetWeightGradients(),
                                  momentum);
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  masterLayer.GetBiasGradients(),
                                  momentum);
      }
      // Update the velocity from the worker gradients.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                                  layer.GetWeightGradients(),
                                  -fLearningRate / momentum / nets.size());
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate / momentum / nets.size());
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(),
                               masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(),
                               masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      // Apply the velocity to the master weights.
      Architecture_t::ScaleAdd(masterLayer.GetWeights(),
                               masterLayer.GetWeightGradients(),
                               1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(),
                               masterLayer.GetBiasGradients(),
                               1.0);
   }
}
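// The numeric scale factors in StepNesterov above are a hedged reconstruction
// following the standard Nesterov scheme, in which the only difference from
// classical momentum is where the gradient is evaluated: the workers are
// staged at a look-ahead point offset by momentum * v, and the master again
// applies
//
//    v <- momentum * v - alpha * g_avg,    W <- W + v.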
//______________________________________________________________________________
template<typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(Net_t & net,
                                                                 Matrix_t &input,
                                                                 const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      // Only the bias terms of the first layer are updated ("reduced weights").
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t & net,
                                                                     Matrix_t &input,
                                                                     const Matrix_t &output,
                                                                     const Matrix_t &weights)
   -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++)
   {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(),
                               layer.GetWeightGradients(),
                               -fLearningRate);
      // Only the bias terms of the first layer are updated ("reduced weights").
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(),
                                  layer.GetBiasGradients(),
                                  -fLearningRate);
      }
   }
   return loss;
}
//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   // ...
}

//______________________________________________________________________________
template<typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   // ...
}
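// The bodies of the two HasConverged overloads are given above only as stubs;
// what follows is a minimal sketch, assuming the criterion described by the
// member documentation (an improvement of the test error resets the counter,
// stagnation for fConvergenceSteps test intervals signals convergence):
//
//    if (fTestError < fMinimumError * 0.999) {
//       fConvergenceCount = 0;               // test error improved
//       fMinimumError     = fTestError;
//    } else {
//       fConvergenceCount += fTestInterval;  // no considerable decrease
//    }
//    return (fConvergenceCount >= fConvergenceSteps);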