1 // @(#)root/tmva: $Id$
2 // Author: Simon Pfreundschuh 20/06/16
4 /*************************************************************************
5  * Copyright (C) 2016, Simon Pfreundschuh *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
12 #ifndef TMVA_DNN_NET
13 #define TMVA_DNN_NET
15 #include <vector>
16 #include <iostream>
18 #include "Layer.h"
20 namespace TMVA {
21 namespace DNN {
23 /** \class TNet
25  Generic neural network class.
27  This generic neural network class represents a concrete neural
28  network through a vector of layers and coordinates the forward
29  and backward propagation through the net.
31  The net takes as input a batch from the training data given in
32  matrix form, with each row corresponding to a certain training
33  event.
35  On construction, the neural network allocates all the memory
36  required for the training of the neural net and keeps it until
37  its destruction.
39  The Architecture type argument simply holds the
40  architecture-specific data types, which are just the matrix type
41  Matrix_t and the used scalar type Scalar_t.
43  \tparam Architecture_t The Architecture type that holds the
44  datatypes for a given architecture.
45  \tparam Layer_t The type used for the layers. Can be either
46  TLayer<Architecture_t> or TSharedLayer<Architecture_t>.
47 */
48 template<typename Architecture_t, typename Layer_t = TLayer<Architecture_t>>
49  class TNet {
51 public:
52  using Matrix_t = typename Architecture_t::Matrix_t; ///< Matrix type provided by the architecture.
53  using Scalar_t = typename Architecture_t::Scalar_t; ///< Scalar type provided by the architecture.
54  using LayerIterator_t = typename std::vector<Layer_t>::iterator; ///< Iterator over the layer vector.
56 private:
57  size_t fBatchSize; ///< Batch size for training and evaluation of the Network.
58  size_t fInputWidth; ///< Number of features in a single input event.
60  std::vector<Layer_t> fLayers; ///< Layers in the network.
62  Matrix_t fDummy; ///< Empty matrix for last step in back propagation.
63  ELossFunction fJ; ///< The loss function of the network.
64  ERegularization fR; ///< The regularization used for the network.
65  Scalar_t fWeightDecay; ///< The weight decay factor.
67 public:
 /*! Default constructor. */
68  TNet();
 /*! Copy constructor. */
69  TNet(const TNet & other);
 /*! Construct a net with the given batch size from a net that uses
  * another architecture type. */
70  template<typename OtherArchitecture_t>
71  TNet(size_t batchSize, const TNet<OtherArchitecture_t> &);
72  /*! Construct a neural net for a given batch size and input width with
73  * the given loss function and regularization. */
74  TNet(size_t batchSize,
75  size_t inputWidth,
76  ELossFunction fJ,
78  Scalar_t fWeightDecay = 0.0);
79  /*! Create a clone that uses the same weight and biases matrices but
80  * potentially a different batch size. */
 // NOTE(review): the declaration documented by the comment above is not
 // visible in this view.
83  /*! Add a layer of the given size to the neural net. */
84  void AddLayer(size_t width, EActivationFunction f,
85  Scalar_t dropoutProbability = 1.0);
87  /*! Remove all layers from the network.*/
88  void Clear();
90  /*! Add a layer which shares its weights with another TNet instance. */
91  template <typename SharedLayer>
92  void AddLayer(SharedLayer & layer);
94  /*! Iterator to the first layer of the net. */
97  /*! Iterator to the last layer of the net. */
 // NOTE(review): the declarations documented by the two iterator comments
 // above are not visible in this view.
100  /*! Initialize the weights in the net with the
101  * given initialization method. */
102  inline void Initialize(EInitialization m);
104  /*! Initialize the gradients in the net to zero. Required if net is
105  * used to store velocities of momentum-based minimization techniques. */
106  inline void InitializeGradients();
108  /*! Forward a given input through the neural net. Computes
109  * all layer activations up to the output layer. */
110  inline void Forward(Matrix_t& X, bool applyDropout = false);
112  /*! Compute the weight gradients in the net from the given training
113  * samples X and training labels Y. */
114  inline void Backward(const Matrix_t &X, const Matrix_t &Y);
116  /*! Evaluate the loss function of the net using the activations
117  * that are currently stored in the output layer. */
118  inline Scalar_t Loss(const Matrix_t &Y, bool includeRegularization = true) const;
120  /*! Propagate the input batch X through the net and evaluate the
121  * error function for the resulting activations of the output
122  * layer. */
123  inline Scalar_t Loss(Matrix_t &X, const Matrix_t &Y, bool applyDropout = false);
125  /*! Compute the neural network prediction obtained from forwarding the
126  * batch X through the neural network and applying the output function
127  * f to the activation of the last layer in the network. */
128  inline void Prediction(Matrix_t &Y_hat, Matrix_t &X, EOutputFunction f);
130  /*! Compute the neural network prediction obtained from applying the output
131  * function f to the activation of the last layer in the network. */
132  inline void Prediction(Matrix_t &Y_hat, EOutputFunction f) const;
 // Getters. GetOutput() and GetOutputWidth() call fLayers.back() and
 // therefore need at least one layer in the net; GetLayer(i) indexes
 // fLayers without a bounds check.
136  size_t GetDepth() const {return fLayers.size();}
137  size_t GetBatchSize() const {return fBatchSize;}
138  Layer_t & GetLayer(size_t i) {return fLayers[i];}
139  const Layer_t & GetLayer(size_t i) const {return fLayers[i];}
140  ELossFunction GetLossFunction() const {return fJ;}
141  Matrix_t & GetOutput() {return fLayers.back().GetOutput();}
142  size_t GetInputWidth() const {return fInputWidth;}
143  size_t GetOutputWidth() const {return fLayers.back().GetWidth();}
 // Setters. These only overwrite the stored member; layers that were
 // already added are not touched by them.
147  void SetBatchSize(size_t batchSize) {fBatchSize = batchSize;}
148  void SetInputWidth(size_t inputWidth) {fInputWidth = inputWidth;}
150  void SetLossFunction(ELossFunction J) {fJ = J;}
 /*! Set the dropout probability of each layer from the given vector. */
152  void SetDropoutProbabilities(const std::vector<Double_t> & probabilities);
 /*! Print a summary of the net and of each of its layers to std::cout. */
154  void Print();
155 };
157 //______________________________________________________________________________
/*! Constructor initializer list: batch size and input width start at 0,
 *  the layer vector is empty, fDummy is constructed with the arguments
 *  (0,0) and the weight decay is 0.0.
 *  NOTE(review): the constructor header is not visible in this view, and
 *  fJ / fR are not initialized in the visible lines; presumably this is the
 *  default constructor TNet() -- confirm against the full file. */
158 template<typename Architecture_t, typename Layer_t>
160  : fBatchSize(0), fInputWidth(0), fLayers(), fDummy(0,0),
162  fWeightDecay(0.0)
163 {
164  // Nothing to do here.
165 }
167 //______________________________________________________________________________
/*! Member-wise initialization from \p other: batch size, input width,
 *  layers, loss function, regularization and weight decay are copied.
 *  fDummy is not copied; it is constructed afresh with the arguments (0,0).
 *  NOTE(review): the constructor header is not visible in this view; the
 *  initializers match the declared TNet(const TNet & other) -- confirm. */
168 template<typename Architecture_t, typename Layer_t>
170  : fBatchSize(other.fBatchSize), fInputWidth(other.fInputWidth),
171  fLayers(other.fLayers), fDummy(0,0), fJ(other.fJ), fR(other.fR),
172  fWeightDecay(other.fWeightDecay)
173 {
174  // Nothing to do here.
175 }
177 //______________________________________________________________________________
/*! Build a net with batch size \p batchSize from a net \p other that is
 *  instantiated for a different architecture type.
 *
 *  Input width, loss function, regularization and weight decay are read
 *  from \p other through its getters. Every layer of \p other is then
 *  re-created with AddLayer() using the same width, activation function and
 *  dropout probability, and its weights and biases are assigned from the
 *  corresponding matrices of \p other after a cast to TMatrixT<Double_t>.
 *  NOTE(review): the first line of the definition header is not visible in
 *  this view. */
178 template<typename Architecture_t, typename Layer_t>
179 template<typename OtherArchitecture_t>
181  const TNet<OtherArchitecture_t> & other)
182  : fBatchSize(batchSize), fInputWidth(other.GetInputWidth()), fLayers(),
183  fDummy(0,0), fJ(other.GetLossFunction()), fR(other.GetRegularization()),
184  fWeightDecay(other.GetWeightDecay())
185 {
 // Reserve room for all layers before the AddLayer() calls below.
186  fLayers.reserve(other.GetDepth());
187  for (size_t i = 0; i < other.GetDepth(); i++) {
188  AddLayer(other.GetLayer(i).GetWidth(),
189  other.GetLayer(i).GetActivationFunction(),
190  other.GetLayer(i).GetDropoutProbability());
 // Layer i was just appended, so fLayers[i] is the new layer.
191  fLayers[i].GetWeights() = (TMatrixT<Double_t>) other.GetLayer(i).GetWeights();
192  fLayers[i].GetBiases() = (TMatrixT<Double_t>) other.GetLayer(i).GetBiases();
193  }
194 }
196 //______________________________________________________________________________
/*! Construct a net without layers that stores the given batch size, input
 *  width, loss function \p J, regularization \p R and weight decay.
 *  fDummy is constructed with the arguments (0,0).
 *  NOTE(review): several lines of the definition header (including the
 *  parameters named R and weightDecay used below) are not visible in this
 *  view. */
197 template<typename Architecture_t, typename Layer_t>
199  size_t inputWidth,
200  ELossFunction J,
203  : fBatchSize(batchSize), fInputWidth(inputWidth), fLayers(), fDummy(0,0),
204  fJ(J), fR(R), fWeightDecay(weightDecay)
205 {
206  // Nothing to do here.
207 }
209 //______________________________________________________________________________
/*! Create a TNet over TSharedLayer<Architecture_t> with batch size
 *  BatchSize and this net's input width, loss function and regularization,
 *  add every layer of this net to it through AddLayer(l), and return the
 *  new net by value.
 *  NOTE(review): fWeightDecay is not passed to the constructor of the
 *  clone, so the clone is built with that constructor's default weight
 *  decay -- confirm that this is intended.
 *  NOTE(review): the definition header is not visible in this view;
 *  presumably this is the clone factory described in the class -- confirm. */
210 template<typename Architecture_t, typename Layer_t>
213 {
214  TNet<Architecture_t, TSharedLayer<Architecture_t>> other(BatchSize, fInputWidth,
215  fJ, fR);
216  for (auto &l : fLayers) {
217  other.AddLayer(l);
218  }
219  return other;
220 }
222 //______________________________________________________________________________
/*! Append a layer of the given width with activation function f and the
 *  given dropout probability. The layer is constructed in place with the
 *  net's batch size; its input width is fInputWidth when it is the first
 *  layer and the width of the current last layer otherwise.
 *  NOTE(review): the first line of the definition header is not visible in
 *  this view. */
223 template<typename Architecture_t, typename Layer_t>
226  Scalar_t dropoutProbability)
227 {
228  if (fLayers.size() == 0) {
 // First layer: its input is the network input.
229  fLayers.emplace_back(fBatchSize, fInputWidth, width, f, dropoutProbability);
230  } else {
 // Later layers take the output of the previous layer as input.
231  size_t prevWidth = fLayers.back().GetWidth();
232  fLayers.emplace_back(fBatchSize, prevWidth, width, f, dropoutProbability);
233  }
234 }
236 //______________________________________________________________________________
/*! Remove all layers by clearing fLayers. Batch size, input width, loss
 *  function, regularization and weight decay are left unchanged.
 *  NOTE(review): the definition header is not visible in this view; the
 *  body matches the declared Clear() -- confirm. */
237 template<typename Architecture_t, typename Layer_t>
239 {
240  fLayers.clear();
241 }
243 //______________________________________________________________________________
244 template<typename Architecture_t, typename Layer_t>
245  template<typename SharedLayer_t>
246  inline void TNet<Architecture_t, Layer_t>::AddLayer(SharedLayer_t & layer)
247 {
248  fLayers.emplace_back(fBatchSize, layer);
249 }
251 //______________________________________________________________________________
/*! Call Initialize(m) on every layer of the net, in layer order.
 *  NOTE(review): the definition header is not visible in this view; the
 *  body matches the declared Initialize(EInitialization m) -- confirm. */
252 template<typename Architecture_t, typename Layer_t>
254 {
255  for (auto &l : fLayers) {
256  l.Initialize(m);
257  }
258 }
260 //______________________________________________________________________________
/*! For every layer, pass its weight gradients and its bias gradients to
 *  initialize<Architecture_t>() with EInitialization::kZero.
 *  NOTE(review): the definition header is not visible in this view; the
 *  body matches the declared InitializeGradients() -- confirm. */
261 template<typename Architecture_t, typename Layer_t>
263 {
264  for (auto &l : fLayers) {
265  initialize<Architecture_t>(l.GetWeightGradients(), EInitialization::kZero);
266  initialize<Architecture_t>(l.GetBiasGradients(), EInitialization::kZero);
267  }
268 }
270 //______________________________________________________________________________
/*! Forward pass: the first layer is fed with \p input, every following
 *  layer is fed with the output of its predecessor. \p applyDropout is
 *  handed unchanged to each layer.
 *  The net must contain at least one layer: fLayers.front() is called
 *  without an emptiness check.
 *  NOTE(review): the first line of the definition header is not visible in
 *  this view. */
271 template<typename Architecture_t, typename Layer_t>
273  bool applyDropout)
274 {
275  fLayers.front().Forward(input, applyDropout);
277  for (size_t i = 1; i < fLayers.size(); i++) {
278  fLayers[i].Forward(fLayers[i-1].GetOutput(), applyDropout);
279  }
280 }
282 //______________________________________________________________________________
/*! Backward pass over all layers.
 *
 *  First evaluateGradients<Architecture_t>() is called with the activation
 *  gradients of the last layer, the loss function fJ, the labels \p Y and
 *  the output of the last layer. Then the layers are visited from the last
 *  one down to the second one; each gets the activation gradients and the
 *  output of the layer before it. Finally the first layer is called with
 *  fDummy in place of the previous activation gradients and with the input
 *  batch X as previous activations. fR and fWeightDecay are passed to
 *  every layer.
 *  The net must contain at least one layer: fLayers.back() is called and
 *  fLayers.size()-1 is computed without an emptiness check.
 *  NOTE(review): the first line of the definition header is not visible in
 *  this view. */
283 template<typename Architecture_t, typename Layer_t>
285  const Matrix_t &Y)
286 {
288  evaluateGradients<Architecture_t>(fLayers.back().GetActivationGradients(),
289  fJ, Y, fLayers.back().GetOutput());
 // Walk from the last layer down to layer 1; layer 0 is handled below
 // because it has no preceding layer.
291  for (size_t i = fLayers.size()-1; i > 0; i--) {
292  auto & activation_gradient_backward
293  = fLayers[i-1].GetActivationGradients();
294  auto & activations_backward
295  = fLayers[i-1].GetOutput();
296  fLayers[i].Backward(activation_gradient_backward,
297  activations_backward, fR, fWeightDecay);
298  }
299  fLayers[0].Backward(fDummy, X, fR, fWeightDecay);
301 }
303 //______________________________________________________________________________
/*! Evaluate the loss fJ for the labels \p Y and the output currently
 *  stored in the last layer; no forward pass is performed here.
 *  If \p includeRegularization is true and fR is not
 *  ERegularization::kNone, fWeightDecay times the regularization term of
 *  the weights of every layer is added to the result.
 *  Calls fLayers.back(), so the net must contain at least one layer.
 *  NOTE(review): the first line of the definition header is not visible in
 *  this view. */
304 template<typename Architecture_t, typename Layer_t>
306  bool includeRegularization) const
307  -> Scalar_t
308 {
309  auto loss = evaluate<Architecture_t>(fJ, Y, fLayers.back().GetOutput());
 // Regularization is skipped entirely when none is configured.
310  includeRegularization &= (fR != ERegularization::kNone);
311  if (includeRegularization) {
312  for (auto &l : fLayers) {
313  loss += fWeightDecay * regularization<Architecture_t>(l.GetWeights(), fR);
314  }
315  }
316  return loss;
317 }
319 //______________________________________________________________________________
/*! Run Forward(X, applyDropout) and return Loss(Y) for the resulting
 *  activations. Loss(Y) is called without a second argument, so its
 *  default for includeRegularization applies.
 *  NOTE(review): the first line of the definition header is not visible in
 *  this view. */
320 template<typename Architecture_t, typename Layer_t>
322  const Matrix_t &Y,
323  bool applyDropout)
324  -> Scalar_t
325 {
326  Forward(X, applyDropout);
327  return Loss(Y);
328 }
330 //______________________________________________________________________________
/*! Run a forward pass on X with dropout disabled, then call
 *  evaluate<Architecture_t>() with Yhat, the output function f and the
 *  output of the last layer.
 *  NOTE(review): parts of the definition header (the lines declaring Yhat
 *  and f) are not visible in this view. */
331 template<typename Architecture_t, typename Layer_t>
333  Matrix_t &X,
335 {
336  Forward(X, false);
337  evaluate<Architecture_t>(Yhat, f, fLayers.back().GetOutput());
338 }
340 //______________________________________________________________________________
/*! Call evaluate<Architecture_t>() with Y_hat, the output function f and
 *  the output currently stored in the last layer. No forward pass is
 *  performed here.
 *  Calls fLayers.back(), so the net must contain at least one layer.
 *  NOTE(review): the first line of the definition header is not visible in
 *  this view. */
341 template<typename Architecture_t, typename Layer_t>
343  EOutputFunction f) const
344 {
345  evaluate<Architecture_t>(Y_hat, f, fLayers.back().GetOutput());
346 }
348 //______________________________________________________________________________
/*! Accumulate an operation-count estimate over all layers and return it
 *  as Scalar_t. For each layer the terms below are built from the batch
 *  size nb, the width nl of the layer and the width nlp of its input
 *  (fInputWidth for the first layer, the previous layer's width
 *  afterwards). The "previous layer gradients" term is skipped for the
 *  first layer.
 *  NOTE(review): the definition header, including the function name, is
 *  not visible in this view; presumably this is the FLOP-count accessor
 *  of TNet -- confirm against the full file. */
349 template<typename Architecture_t, typename Layer_t>
351  -> Scalar_t
352 {
353  Scalar_t flops = 0;
355  Scalar_t nb = (Scalar_t) fBatchSize;
356  Scalar_t nlp = (Scalar_t) fInputWidth;
358  for(size_t i = 0; i < fLayers.size(); i++) {
359  Layer_t & layer = fLayers[i];
360  Scalar_t nl = (Scalar_t) layer.GetWidth();
362  // Forward propagation.
363  flops += nb * nl * (2.0 * nlp - 1); // Matrix mult.
364  flops += nb * nl; // Add bias values.
365  flops += 2 * nb * nl; // Apply activation function and compute
366  // derivative.
367  // Backward propagation.
368  flops += nb * nl; // Hadamard
369  flops += nlp * nl * (2.0 * nb - 1.0); // Weight gradients
370  flops += nl * (nb - 1); // Bias gradients
371  if (i > 0) {
372  flops += nlp * nb * (2.0 * nl - 1.0); // Previous layer gradients.
373  }
 // The width of this layer is the input width of the next one.
374  nlp = nl;
375  }
376  return flops;
377 }
379 //______________________________________________________________________________
/*! Set the dropout probability of every layer. Layer i receives
 *  probabilities[i] when the vector has an entry for it; layers beyond the
 *  end of the vector receive 1.0. Entries beyond the number of layers are
 *  ignored.
 *  NOTE(review): the first line of the definition header is not visible in
 *  this view. */
380 template<typename Architecture_t, typename Layer_t>
382  const std::vector<Double_t> & probabilities)
383 {
384  for (size_t i = 0; i < fLayers.size(); i++) {
385  if (i < probabilities.size()) {
386  fLayers[i].SetDropoutProbability(probabilities[i]);
387  } else {
388  fLayers[i].SetDropoutProbability(1.0);
389  }
390  }
391 }
393 //______________________________________________________________________________
/*! Write a summary of the net to std::cout: a header line with the loss
 *  function (printed as the char value of fJ) and the number of layers,
 *  followed for each layer by a "DNN Layer <n>:" line, numbered from 1,
 *  and the output of that layer's own Print().
 *  NOTE(review): the definition header is not visible in this view; the
 *  body matches the declared Print() -- confirm. */
394 template<typename Architecture_t, typename Layer_t>
396 {
397  std::cout << "DEEP NEURAL NETWORK:";
398  std::cout << " Loss function = " << static_cast<char>(fJ);
399  std::cout << ", Depth = " << fLayers.size() << std::endl;
 // Layers are numbered starting from 1 in the printout.
401  size_t i = 1;
402  for (auto & l : fLayers) {
403  std::cout << "DNN Layer " << i << ":" << std::endl;
404  l.Print();
405  i++;
406  }
408 }
410 } // namespace DNN
411 } // namespace TMVA
413 #endif
