#ifndef TMVA_DNN_MINIMIZERS
#define TMVA_DNN_MINIMIZERS

#include "DataLoader.h"
#include "Functions.h"

#include <limits>
#include <vector>

namespace TMVA {
namespace DNN {
/** \class TGradientDescent
 *  Generic, architecture-independent implementation of the gradient descent
 *  minimization algorithm. The Architecture_t template parameter provides the
 *  scalar and matrix types as well as the numerical kernels (ScaleAdd, Copy).
 */
template <typename Architecture_t>
class TGradientDescent
{
public:
   using Scalar_t = typename Architecture_t::Scalar_t;
   using Matrix_t = typename Architecture_t::Matrix_t;

private:
   size_t fBatchSize;        ///< Batch size to use for the training.
   size_t fStepCount;        ///< Number of steps performed in the current training session.
   size_t fConvergenceSteps; ///< Number of training epochs without considerable
                             ///< decrease in the test error for convergence.
   size_t fConvergenceCount; ///< Current number of training epochs without
                             ///< considerable decrease in the test error.
   size_t fTestInterval;     ///< Interval for the computation of the test error.
   Scalar_t fTrainingError;  ///< Holds the most recently computed training loss.
   Scalar_t fTestError;      ///< Holds the most recently computed test loss.
   Scalar_t fLearningRate;   ///< Learning rate \f$\alpha\f$.
   Scalar_t fMinimumError;   ///< The minimum loss achieved on the training set.

public:
   TGradientDescent();
   TGradientDescent(Scalar_t learningRate, size_t convergenceSteps, size_t testInterval);

   /** Reset minimizer object to default state. */
   void Reset()
   {
      fMinimumError = std::numeric_limits<Scalar_t>::infinity();
      fConvergenceCount = 0;
      fStepCount = 0;
   }

   /** Train the given net using the given training input data (events), training
    *  output data (labels), test input data and test output data. Returns the
    *  minimum test error achieved. */
   template <typename Data_t, typename Net_t>
   Scalar_t Train(const Data_t &TrainingDataIn, size_t nTrainingSamples,
                  const Data_t &TestDataIn, size_t nTestSamples,
                  Net_t &net, size_t nThreads = 1);

   /** Same as Train(...) but uses the given momentum. */
   template <typename Data_t, typename Net_t>
   Scalar_t TrainMomentum(const Data_t &TrainingDataIn, size_t nTrainingSamples,
                          const Data_t &TestDataIn, size_t nTestSamples,
                          Net_t &net, Scalar_t momentum, size_t nThreads = 1);

   /** Perform a single optimization step on the given batch. */
   template <typename Net_t>
   void Step(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Same as Step(...) but also evaluate the loss on the given training data. */
   template <typename Net_t>
   Scalar_t StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output, const Matrix_t &weights);

   /** Perform multiple optimization steps simultaneously, one per network/batch
    *  pair, interleaving the propagation steps to expose batch-level parallelism. */
   template <typename Net_t>
   void Step(Net_t &master, std::vector<Net_t> &nets,
             std::vector<TBatch<Architecture_t>> &batches);

   /** Same as the Step(...) method for multiple batches but uses momentum. */
   template <typename Net_t>
   void StepMomentum(Net_t &master, std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches, Scalar_t momentum);

   /** Same as the Step(...) method for multiple batches but uses Nesterov momentum. */
   template <typename Net_t>
   void StepNesterov(Net_t &master, std::vector<Net_t> &nets,
                     std::vector<TBatch<Architecture_t>> &batches, Scalar_t momentum);

   /** Does not evaluate the loss and therefore does not trigger a possible
    *  synchronization with the device. */
   template <typename Net_t>
   void StepReducedWeights(Net_t &net, Matrix_t &input, const Matrix_t &output);

   /** Similar to StepReducedWeights(...) but also evaluates the loss. */
   template <typename Net_t>
   Scalar_t StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
                                   const Matrix_t &output, const Matrix_t &weights);

   /** Increases the minimization step counter by the test error evaluation period
    *  and uses the current internal value of the test error to determine whether
    *  the minimization has converged. */
   bool HasConverged();
   /** Same as above but uses the given test error value. */
   bool HasConverged(Scalar_t testError);

   size_t   GetConvergenceCount() const { return fConvergenceCount; }
   size_t   GetConvergenceSteps() const { return fConvergenceSteps; }
   Scalar_t GetTrainingError() const    { return fTrainingError; }
   Scalar_t GetTestError() const        { return fTestError; }
   size_t   GetTestInterval() const     { return fTestInterval; }

   void SetConvergenceSteps(size_t steps) { fConvergenceSteps = steps; }
   void SetTestInterval(size_t interval)  { fTestInterval = interval; }
   void SetLearningRate(Scalar_t rate)    { fLearningRate = rate; }
   void SetBatchSize(Scalar_t rate)       { fBatchSize = rate; }
};
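// Usage sketch (illustrative only, not part of this header): assuming a
// backend such as TReference<double> and a compatible net object, the
// minimizer could be driven as follows. The trainingData, testData and net
// objects here are hypothetical placeholders.
//
//     TGradientDescent<TReference<double>> minimizer(0.001, // learning rate
//                                                    10,    // convergence steps
//                                                    5);    // test interval
//     minimizer.Train(trainingData, nTrainingSamples,
//                     testData, nTestSamples, net, /*nThreads=*/4);
//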
//
// Implementation
//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent()
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(0), fConvergenceCount(0), fTestInterval(0), fLearningRate(0),
     fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}

//______________________________________________________________________________
template <typename Architecture_t>
TGradientDescent<Architecture_t>::TGradientDescent(Scalar_t learningRate, size_t convergenceSteps,
                                                   size_t testInterval)
   : fBatchSize(0), fStepCount(0), fConvergenceSteps(convergenceSteps), fConvergenceCount(0),
     fTestInterval(testInterval), fLearningRate(learningRate), fMinimumError(std::numeric_limits<Scalar_t>::infinity())
{
   // Nothing to do here.
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::Train(const Data_t &trainingData, size_t nTrainingSamples,
                                             const Data_t &testData, size_t nTestSamples,
                                             Net_t &net, size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training and test data in batch-wise loaders.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(), net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(nTestSamples);
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   // One clone of the network per thread, with weights and biases copied
   // from the master net.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++) {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs, then evaluate the test error.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            Step(net, nets, batches);
         }
      }

      // Evaluate the test error.
      auto b = *testLoader.begin();
      auto inputMatrix  = b.GetInput();
      auto outputMatrix = b.GetOutput();
      auto weightMatrix = b.GetWeights();
      fTestError = testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
   } while (!HasConverged());

   return fMinimumError;
}
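// Note that the test network is a clone of the master net with batch size
// nTestSamples, so the test loss above is computed in a single forward pass
// over the full test set.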
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Data_t, typename Net_t>
auto TGradientDescent<Architecture_t>::TrainMomentum(const Data_t &trainingData, size_t nTrainingSamples,
                                                     const Data_t &testData, size_t nTestSamples,
                                                     Net_t &net, Scalar_t momentum, size_t nThreads)
   -> Scalar_t
{
   Reset();

   // Prepare training and test data in batch-wise loaders.
   TDataLoader<Data_t, Architecture_t> trainLoader(trainingData, nTrainingSamples,
                                                   net.GetBatchSize(), net.GetInputWidth(),
                                                   net.GetOutputWidth(), nThreads);
   auto testNet = net.CreateClone(net.GetBatchSize());
   TDataLoader<Data_t, Architecture_t> testLoader(testData, nTestSamples,
                                                  testNet.GetBatchSize(),
                                                  testNet.GetInputWidth(),
                                                  net.GetOutputWidth());

   net.InitializeGradients();

   // One clone of the network per thread, with weights and biases copied
   // from the master net.
   std::vector<Net_t> nets{};
   nets.reserve(nThreads);
   for (size_t i = 0; i < nThreads; i++) {
      nets.push_back(net);
      for (size_t j = 0; j < net.GetDepth(); j++) {
         auto &masterLayer = net.GetLayer(j);
         auto &layer = nets.back().GetLayer(j);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }

   size_t batchesInEpoch = nTrainingSamples / net.GetBatchSize();
   std::vector<TBatch<Architecture_t>> batches{};
   batches.reserve(nThreads);

   do {
      // Train for fTestInterval epochs, then evaluate the test error.
      for (fStepCount = 0; fStepCount < fTestInterval; fStepCount++) {
         trainLoader.Shuffle();
         for (size_t i = 0; i < batchesInEpoch; i += nThreads) {
            batches.clear();
            for (size_t j = 0; j < nThreads; j++) batches.push_back(trainLoader.GetBatch());
            if (momentum != 0.0) {
               StepMomentum(net, nets, batches, momentum);
            } else {
               Step(net, nets, batches);
            }
         }
      }

      // Evaluate the test error batch by batch and average.
      fTestError = 0.0;
      for (size_t i = 0; i < batchesInEpoch; i++) {
         auto b = testLoader.GetBatch();
         auto inputMatrix  = b.GetInput();
         auto outputMatrix = b.GetOutput();
         auto weightMatrix = b.GetWeights();
         fTestError += testNet.Loss(inputMatrix, outputMatrix, weightMatrix);
      }
      fTestError /= (Double_t)batchesInEpoch;
   } while (!HasConverged());

   return fMinimumError;
}
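// With momentum == 0.0, TrainMomentum falls back to the plain Step(...)
// update, so it behaves like Train(...) except that the test error is
// averaged over batch-sized chunks instead of being computed in one pass.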
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                   const Matrix_t &weights)
{
   net.Forward(input, true);
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(), layer.GetBiasGradients(), -fLearningRate);
   }
}
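// The ScaleAdd calls above implement plain stochastic gradient descent:
// ScaleAdd(A, B, beta) computes A += beta * B, so each layer is updated
// in place as
//
//     W <- W - alpha * dL/dW,    b <- b - alpha * dL/db,
//
// with alpha = fLearningRate.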
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepLoss(Net_t &net, Matrix_t &input, const Matrix_t &output,
                                                       const Matrix_t &weights) -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   net.Backward(input, output);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
      Architecture_t::ScaleAdd(layer.GetBiases(), layer.GetBiasGradients(), -fLearningRate);
   }
   return loss;
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::Step(Net_t &master, std::vector<Net_t> &nets,
                                                   std::vector<TBatch<Architecture_t>> &batches)
{
   typename Architecture_t::Matrix_t dummy(0, 0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over the networks to expose
   // batch-level parallelism to asynchronous device backends.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i - 1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation.
   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i - 1).GetActivationGradients(),
                                      nets[j].GetLayer(i - 1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   // Apply the gradients to the master net and copy the updated weights
   // back into the workers.
   for (size_t j = 0; j < nets.size(); j++) {
      for (size_t i = 0; i < depth; i++) {
         auto &masterLayer = master.GetLayer(i);
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::ScaleAdd(masterLayer.GetBiases(), layer.GetBiasGradients(), -fLearningRate);
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }
}
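// Each worker's gradients are applied to the master one after the other, and
// the updated master weights are copied back into the worker before the next
// one is processed. A call with n batches therefore performs n sequential
// gradient descent updates rather than one update with an averaged gradient.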
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepMomentum(Net_t &master, std::vector<Net_t> &nets,
                                                           std::vector<TBatch<Architecture_t>> &batches,
                                                           Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0, 0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over the networks.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i - 1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation; the master gradient matrices hold the momentum
   // (velocity) terms and accumulate the worker gradients.
   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i - 1).GetActivationGradients(),
                                      nets[j].GetLayer(i - 1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
         Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                                  nets[j].GetLayer(i).GetWeightGradients(),
                                  -fLearningRate / momentum);
         Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                                  nets[j].GetLayer(i).GetBiasGradients(),
                                  -fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(master.GetLayer(i).GetWeightGradients(),
                               master.GetLayer(i).GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(master.GetLayer(i).GetBiasGradients(),
                               master.GetLayer(i).GetBiasGradients(),
                               momentum - 1.0);
   }

   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
      Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                               nets[j].GetLayer(0).GetWeightGradients(),
                               -fLearningRate / momentum);
      Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                               nets[j].GetLayer(0).GetBiasGradients(),
                               -fLearningRate / momentum);
   }
   Architecture_t::ScaleAdd(master.GetLayer(0).GetWeightGradients(),
                            master.GetLayer(0).GetWeightGradients(),
                            momentum - 1.0);
   Architecture_t::ScaleAdd(master.GetLayer(0).GetBiasGradients(),
                            master.GetLayer(0).GetBiasGradients(),
                            momentum - 1.0);

   // Apply the velocity to the master weights and synchronize the workers.
   for (size_t i = 0; i < depth; i++) {
      auto &masterLayer = master.GetLayer(i);
      Architecture_t::ScaleAdd(masterLayer.GetWeights(), masterLayer.GetWeightGradients(), 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(), masterLayer.GetBiasGradients(), 1.0);
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
      }
   }
}
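// The two ScaleAdd passes per layer implement the classical momentum update.
// With the velocity v stored in the master gradient matrices:
//
//     v <- momentum * (v - (alpha/momentum) * sum_j g_j)
//        = momentum * v - alpha * sum_j g_j
//     W <- W + v
//
// where g_j are the worker gradients and alpha = fLearningRate.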
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepNesterov(Net_t &master, std::vector<Net_t> &nets,
                                                           std::vector<TBatch<Architecture_t>> &batches,
                                                           Scalar_t momentum)
{
   typename Architecture_t::Matrix_t dummy(0, 0);
   size_t depth = master.GetDepth();

   // Forward propagation, interleaved over the networks.
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Forward(batches[j].GetInput(), true);
   }
   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Forward(nets[j].GetLayer(i - 1).GetOutput(), true);
      }
   }

   // Gradients of the loss with respect to the output activations.
   for (size_t j = 0; j < nets.size(); j++) {
      evaluateGradients<Architecture_t>(nets[j].GetLayer(depth - 1).GetActivationGradients(), nets[j].GetLossFunction(),
                                        batches[j].GetOutput(), nets[j].GetLayer(depth - 1).GetOutput(),
                                        batches[j].GetWeights());
   }

   // Backward propagation.
   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayer(i).Backward(nets[j].GetLayer(i - 1).GetActivationGradients(),
                                      nets[j].GetLayer(i - 1).GetOutput(),
                                      nets[j].GetRegularization(),
                                      nets[j].GetWeightDecay());
      }
   }
   for (size_t j = 0; j < nets.size(); j++) {
      nets[j].GetLayer(0).Backward(dummy,
                                   batches[j].GetInput(),
                                   nets[j].GetRegularization(),
                                   nets[j].GetWeightDecay());
   }

   for (size_t i = 0; i < depth; i++) {
      auto &masterLayer = master.GetLayer(i);
      // Move each worker to the lookahead point: the master weights plus the
      // current velocity.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::Copy(layer.GetWeights(), masterLayer.GetWeights());
         Architecture_t::Copy(layer.GetBiases(), masterLayer.GetBiases());
         Architecture_t::ScaleAdd(layer.GetWeights(), masterLayer.GetWeightGradients(), 1.0);
         Architecture_t::ScaleAdd(layer.GetBiases(), masterLayer.GetBiasGradients(), 1.0);
      }
      // Update the velocity from the worker gradients.
      for (size_t j = 0; j < nets.size(); j++) {
         auto &layer = nets[j].GetLayer(i);
         Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(), layer.GetWeightGradients(),
                                  -fLearningRate / momentum);
         Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(), layer.GetBiasGradients(),
                                  -fLearningRate / momentum);
      }
      Architecture_t::ScaleAdd(masterLayer.GetWeightGradients(), masterLayer.GetWeightGradients(),
                               momentum - 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiasGradients(), masterLayer.GetBiasGradients(),
                               momentum - 1.0);
      // Apply the updated velocity to the master weights.
      Architecture_t::ScaleAdd(masterLayer.GetWeights(), masterLayer.GetWeightGradients(), 1.0);
      Architecture_t::ScaleAdd(masterLayer.GetBiases(), masterLayer.GetBiasGradients(), 1.0);
   }
}
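// Nesterov variant: the workers are placed at the lookahead point W + v at
// the end of each call, so the gradients entering the next velocity update
// are evaluated after the current velocity has been applied. The velocity
// update itself is the same recurrence as in StepMomentum.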
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
void inline TGradientDescent<Architecture_t>::StepReducedWeights(Net_t &net, Matrix_t &input,
                                                                 const Matrix_t &output)
{
   net.Forward(input, true);
   net.Backward(input, output);

   // Train the weights of each layer, but only the bias terms of the
   // first layer.
   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(), layer.GetBiasGradients(), -fLearningRate);
      }
   }
}
//______________________________________________________________________________
template <typename Architecture_t>
template <typename Net_t>
auto inline TGradientDescent<Architecture_t>::StepReducedWeightsLoss(Net_t &net, Matrix_t &input,
                                                                     const Matrix_t &output,
                                                                     const Matrix_t &weights) -> Scalar_t
{
   Scalar_t loss = net.Loss(input, output, weights);
   fTrainingError = loss;
   net.Backward(input, output, weights);

   for (size_t i = 0; i < net.GetDepth(); i++) {
      auto &layer = net.GetLayer(i);
      Architecture_t::ScaleAdd(layer.GetWeights(), layer.GetWeightGradients(), -fLearningRate);
      if (i == 0) {
         Architecture_t::ScaleAdd(layer.GetBiases(), layer.GetBiasGradients(), -fLearningRate);
      }
   }
   return loss;
}
//______________________________________________________________________________
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged()
{
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount++;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}
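// Convergence bookkeeping, with hypothetical settings for illustration: an
// improvement counts only if the test error undercuts the best value seen so
// far by at least 0.1%. With fConvergenceSteps = 10, training stops after 10
// consecutive stagnant calls of the overload above, or after 2 stagnant calls
// of the overload below when fTestInterval = 5, since that overload advances
// fConvergenceCount by fTestInterval per call.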
//______________________________________________________________________________
template <typename Architecture_t>
bool inline TGradientDescent<Architecture_t>::HasConverged(Scalar_t testError)
{
   fTestError = testError;
   if (fTestError < fMinimumError * 0.999) {
      fConvergenceCount = 0;
      fMinimumError = fTestError;
   } else {
      fConvergenceCount += fTestInterval;
   }
   return (fConvergenceCount >= fConvergenceSteps);
}

} // namespace DNN
} // namespace TMVA

#endif