Logo ROOT  
Reference Guide
 
Loading...
Searching...
No Matches
RSampler.cxx
Go to the documentation of this file.
2
3#include <algorithm>
4#include <cmath>
5#include <random>
6#include <stdexcept>
7
9
11
/// Defaulted destructor, defined out of line in this translation unit.
/// NOTE(review): presumably kept out of the header so that the std::unique_ptr<RFlat2DMatrixOperators>
/// member (fTensorOperators) is destroyed where its pointee type is complete -- confirm against RSampler.hxx.
12RSampler::~RSampler() = default;
13
14RSampler::RSampler(std::vector<RFlat2DMatrix> &datasets, const std::string &sampleType, float sampleRatio,
15 bool replacement, bool shuffle, std::size_t setSeed)
16 : fDatasets(datasets),
17 fSampleType(sampleType),
18 fSampleRatio(sampleRatio),
19 fReplacement(replacement),
20 fShuffle(shuffle),
21 fSetSeed(setSeed)
22{
23 fTensorOperators = std::make_unique<RFlat2DMatrixOperators>(fShuffle, fSetSeed);
24
25 // setup the sampler for the datasets
27}
28
29//////////////////////////////////////////////////////////////////////////
30/// \brief Calculate fNumEntries and major/minor variables
32{
33 if (fSampleType == "undersampling") {
35 } else if (fSampleType == "oversampling") {
37 }
38}
39
40//////////////////////////////////////////////////////////////////////////
41/// \brief Collection of sampling types
42/// \param[in] SampledTensor Tensor with all the sampled entries
44{
45 if (fSampleType == "undersampling") {
47 } else if (fSampleType == "oversampling") {
49 }
50}
51
52//////////////////////////////////////////////////////////////////////////
53/// \brief Calculate fNumEntries and major/minor variables for the random undersampler
55{
56 if (fDatasets[0].GetRows() > fDatasets[1].GetRows()) {
57 fMajor = 0;
58 fMinor = 1;
59 } else {
60 fMajor = 1;
61 fMinor = 0;
62 }
63
64 fNumMajor = fDatasets[fMajor].GetRows();
65 fNumMinor = fDatasets[fMinor].GetRows();
66 fNumResampledMajor = static_cast<std::size_t>(fNumMinor / fSampleRatio);
68 auto minRatio = std::to_string(std::round(double(fNumMinor) / double(fNumMajor) * 100.0) / 100.0);
69 minRatio.erase(minRatio.find('.') + 3);
70 throw std::invalid_argument(
71 "The sampling_ratio is too low: not enough entries in the majority class to sample from.\n"
72 "Choose sampling_ratio > " +
73 minRatio + " or set replacement to True.");
74 }
76}
77
78//////////////////////////////////////////////////////////////////////////
79/// \brief Calculate fNumEntries and major/minor variables for the random oversampler
81{
82 if (fDatasets[0].GetRows() > fDatasets[1].GetRows()) {
83 fMajor = 0;
84 fMinor = 1;
85 } else {
86 fMajor = 1;
87 fMinor = 0;
88 }
89
90 fNumMajor = fDatasets[fMajor].GetRows();
91 fNumMinor = fDatasets[fMinor].GetRows();
92 fNumResampledMinor = static_cast<std::size_t>(fSampleRatio * fNumMajor);
94}
95
96//////////////////////////////////////////////////////////////////////////
97/// \brief Undersample entries randomly from the majority dataset
98/// \param[in] SampledTensor Tensor with all the sampled entries
100{
101 if (fReplacement) {
103 }
104
105 else {
107 }
108
109 std::size_t cols = fDatasets[0].GetCols();
112
113 std::size_t index = 0;
114 for (std::size_t i = 0; i < fNumResampledMajor; i++) {
115 std::copy(fDatasets[fMajor].GetData() + fSamples[i] * cols,
116 fDatasets[fMajor].GetData() + (fSamples[i] + 1) * cols,
117 UndersampledMajorTensor.GetData() + index * cols);
118 index++;
119 }
120
123}
124
125//////////////////////////////////////////////////////////////////////////
126/// \brief Oversample entries randomly from the minority dataset
127/// \param[in] SampledTensor Tensor with all the sampled entries
129{
131
132 std::size_t cols = fDatasets[0].GetCols();
135
136 std::size_t index = 0;
137 for (std::size_t i = 0; i < fNumResampledMinor; i++) {
138 std::copy(fDatasets[fMinor].GetData() + fSamples[i] * cols,
139 fDatasets[fMinor].GetData() + (fSamples[i] + 1) * cols,
140 OversampledMinorTensor.GetData() + index * cols);
141 index++;
142 }
143
146}
147
148//////////////////////////////////////////////////////////////////////////
149/// \brief Add indices with replacement to fSamples
150/// \param[in] n_samples Number of indices to sample
151/// \param[in] max Max index of the sample distribution
152void RSampler::SampleWithReplacement(std::size_t n_samples, std::size_t max)
153{
154 std::uniform_int_distribution<> dist(0, max - 1);
155 fSamples.clear();
156 fSamples.reserve(n_samples);
157 for (std::size_t i = 0; i < n_samples; ++i) {
158 std::size_t sample;
159 if (fShuffle) {
160 std::random_device rd;
161 std::mt19937 g;
162
163 if (fSetSeed == 0) {
164 g.seed(rd());
165 } else {
166 g.seed(fSetSeed);
167 }
168
169 sample = dist(g);
170 }
171
172 else {
173 sample = i % max;
174 }
175 fSamples.push_back(sample);
176 }
177}
178
179//////////////////////////////////////////////////////////////////////////
180/// \brief Add indices without replacement to fSamples
181/// \param[in] n_samples Number of indices to sample
182/// \param[in] max Max index of the sample distribution
183void RSampler::SampleWithoutReplacement(std::size_t n_samples, std::size_t max)
184{
185 std::vector<std::size_t> UniqueSamples;
186 UniqueSamples.reserve(max);
187 fSamples.clear();
188 fSamples.reserve(n_samples);
189
190 for (std::size_t i = 0; i < max; ++i)
191 UniqueSamples.push_back(i);
192
193 if (fShuffle) {
194 std::random_device rd;
195 std::mt19937 g;
196
197 if (fSetSeed == 0) {
198 g.seed(rd());
199 } else {
200 g.seed(fSetSeed);
201 }
202 std::shuffle(UniqueSamples.begin(), UniqueSamples.end(), g);
203 }
204
205 for (std::size_t i = 0; i < n_samples; ++i) {
206 fSamples.push_back(UniqueSamples[i]);
207 }
208}
209
210} // namespace ROOT::Experimental::Internal::ML
#define g(i)
Definition RSha256.hxx:105
ROOT::Detail::TRangeCast< T, true > TRangeDynCast
TRangeDynCast is an adapter class that allows the typed iteration through a TCollection.
Option_t Option_t TPoint TPoint const char GetTextMagnitude GetFillStyle GetLineColor GetLineWidth GetMarkerStyle GetTextAlign GetTextColor GetTextSize void char Point_t Rectangle_t WindowAttributes_t index
void SampleWithoutReplacement(std::size_t n_samples, std::size_t max)
Add indices without replacement to fSamples.
Definition RSampler.cxx:183
void SetupRandomUndersampler()
Calculate fNumEntries and major/minor variables for the random undersampler.
Definition RSampler.cxx:54
void RandomOversampler(RFlat2DMatrix &ShuffledTensor)
Oversample entries randomly from the minority dataset.
Definition RSampler.cxx:128
void SampleWithReplacement(std::size_t n_samples, std::size_t max)
Add indices with replacement to fSamples.
Definition RSampler.cxx:152
void SetupRandomOversampler()
Calculate fNumEntries and major/minor variables for the random oversampler.
Definition RSampler.cxx:80
void SetupSampler()
Calculate fNumEntries and major/minor variables.
Definition RSampler.cxx:31
std::vector< std::size_t > fSamples
Definition RSampler.hxx:49
std::unique_ptr< RFlat2DMatrixOperators > fTensorOperators
Definition RSampler.hxx:51
RSampler(std::vector< RFlat2DMatrix > &datasets, const std::string &sampleType, float sampleRatio, bool replacement=false, bool shuffle=true, std::size_t setSeed=0)
Definition RSampler.cxx:14
void RandomUndersampler(RFlat2DMatrix &ShuffledTensor)
Undersample entries randomly from the majority dataset.
Definition RSampler.cxx:99
std::vector< RFlat2DMatrix > & fDatasets
Definition RSampler.hxx:34
void Sampler(RFlat2DMatrix &SampledTensor)
Collection of sampling types.
Definition RSampler.cxx:43
const_iterator begin() const
const_iterator end() const
Wrapper around ROOT::RVec<float> representing a 2D matrix.