[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]

sampling.hxx
1 /************************************************************************/
2 /* */
3 /* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */
4 /* */
5 /* This file is part of the VIGRA computer vision library. */
6 /* The VIGRA Website is */
7 /* http://hci.iwr.uni-heidelberg.de/vigra/ */
8 /* Please direct questions, bug reports, and contributions to */
9 /* ullrich.koethe@iwr.uni-heidelberg.de or */
10 /* vigra@informatik.uni-hamburg.de */
11 /* */
12 /* Permission is hereby granted, free of charge, to any person */
13 /* obtaining a copy of this software and associated documentation */
14 /* files (the "Software"), to deal in the Software without */
15 /* restriction, including without limitation the rights to use, */
16 /* copy, modify, merge, publish, distribute, sublicense, and/or */
17 /* sell copies of the Software, and to permit persons to whom the */
18 /* Software is furnished to do so, subject to the following */
19 /* conditions: */
20 /* */
21 /* The above copyright notice and this permission notice shall be */
22 /* included in all copies or substantial portions of the */
23 /* Software. */
24 /* */
25 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */
26 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */
27 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
28 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */
29 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */
30 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */
31 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */
32 /* OTHER DEALINGS IN THE SOFTWARE. */
33 /* */
34 /************************************************************************/
35 
36 #ifndef VIGRA_SAMPLING_HXX
37 #define VIGRA_SAMPLING_HXX
38 
39 #include "array_vector.hxx"
40 #include "random.hxx"
41 #include <map>
42 #include <memory>
43 #include <cmath>
44 
45 namespace vigra
46 {
47 
48 /** \addtogroup MachineLearning Machine Learning
49 **/
50 //@{
51 
52 
53 /**\brief Options object for the Sampler class.
54 
55  <b>usage:</b>
56 
57  \code
58  SamplerOptions opt = SamplerOptions()
59  .withReplacement()
60  .sampleProportion(0.5);
61  \endcode
62 
63  Note that the return value of all methods is <tt>*this</tt> which makes
64  concatenating of options as above possible.
65 */
67 {
68  public:
69 
70  double sample_proportion;
71  unsigned int sample_size;
72  bool sample_with_replacement;
73  bool stratified_sampling;
74 
76  : sample_proportion(1.0),
77  sample_size(0),
78  sample_with_replacement(true),
79  stratified_sampling(false)
80  {}
81 
82  /**\brief Sample from training population with replacement.
83  *
84  * <br> Default: true
85  */
87  {
88  sample_with_replacement = in;
89  return *this;
90  }
91 
92  /**\brief Sample from training population without replacement.
93  *
94  * <br> Default (if you don't call this function): false
95  */
97  {
98  sample_with_replacement = !in;
99  return *this;
100  }
101 
102  /**\brief Draw the given number of samples.
103  * If stratifiedSampling is true, the \a size is equally distributed
104  * accross all strata (e.g. <tt>size / strataCount</tt> samples are taken
105  * from each stratum, subject to rounding).
106  *
107  * <br> Default: 0 (i.e. determine the count by means of sampleProportion())
108  */
109  SamplerOptions& sampleSize(unsigned int size)
110  {
111  sample_size = size;
112  return *this;
113  }
114 
115 
116  /**\brief Determine the number of samples to draw as a proportion of the total
117  * number. That is, we draw <tt>count = totalCount * proportion</tt> samples.
118  * This option is overridden when an absolute count is specified by sampleSize().
119  *
120  * If stratifiedSampling is true, the count is equally distributed
121  * accross all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken
122  * from each stratum).
123  *
124  * <br> Default: 1.0
125  */
126  SamplerOptions& sampleProportion(double proportion)
127  {
128  vigra_precondition(proportion >= 0.0,
129  "SamplerOptions::sampleProportion(): argument must not be negative.");
130  sample_proportion = proportion;
131  return *this;
132  }
133 
134  /**\brief Draw equally many samples from each "stratum".
135  * A stratum is a group of like entities, e.g. pixels belonging
136  * to the same object class. This is useful to create balanced samples
137  * when the class probabilities are very unbalanced (e.g.
138  * when there are many background and few foreground pixels).
139  * Stratified sampling thus avoids that a trained classifier is biased
140  * towards the majority class.
141  *
142  * <br> Default (if you don't call this function): false
143  */
144  SamplerOptions& stratified(bool in = true)
145  {
146  stratified_sampling = in;
147  return *this;
148  }
149 };
150 
151 /************************************************************/
152 /* */
153 /* Sampler */
154 /* */
155 /************************************************************/
156 
157 /** \brief Create random samples from a sequence of indices.
158 
159  Selecting data items at random is a basic task of machine learning,
160  for example in boostrapping, RandomForest training, and cross validation.
161  This class implements various ways to select random samples via their indices.
162  Indices are assumed to be consecutive in
163  the range <tt>0 &lt;= index &lt; total_sample_count</tt>.
164 
165  The class always contains a current sample which can be accessed by
166  the index operator or by the function sampledIndices(). The indices
167  that are not in the current sample (out-of-bag indices) can be accessed
168  via the function oobIndices().
169 
170  The sampling method (with/without replacement, stratified or not) and the
171  number of samples to draw are determined by the option object
172  SamplerOptions.
173 
174  <b>Usage:</b>
175 
176  <b>\#include</b> <<a href="sampling_8hxx-source.html">vigra/sampling.hxx</a>><br>
177  Namespace: vigra
178 
179  Create a Sampler with default options, i.e. sample as many indices as there
180  are data elements, with replacement. On average, the sample will contain
181  <tt>0.63*totalCount</tt> distinct indices.
182 
183  \code
184  int totalCount = 10000; // total number of data elements
185  int numberOfSamples = 20; // repeat experiment 20 times
186  Sampler<> sampler(totalCount);
187  for(int k=0; k<numberOfSamples; ++k)
188  {
189  // process current sample
190  for(int i=0; i<sampler.sampleSize(); ++i)
191  {
192  int currentIndex = sampler[i];
193  processData(data[currentIndex]);
194  }
195  // create next sample
196  sampler.sample();
197  }
198  \endcode
199 
200  Create a Sampler for stratified sampling, without replacement.
201 
202  \code
203  // prepare the strata (i.e. specify which stratum each element belongs to)
204  int stratumSize1 = 2000, stratumSize2 = 8000,
205  totalCount = stratumSize1 + stratumSize2;
206  ArrayVerctor<int> strata(totalCount);
207  for(int i=0; i<stratumSize1; ++i)
208  strata[i] = 1;
209  for(int i=stratumSize1; i<stratumSize2; ++i)
210  strata[i] = 2;
211 
212  int sampleSize = 200; // i.e. sample 100 elements from each of the two strata
213  int numberOfSamples = 20; // repeat experiment 20 times
214  Sampler<> stratifiedSampler(strata.begin(), strata.end(),
215  SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize));
216 
217  for(int k=0; k<numberOfSamples; ++k)
218  {
219  // process current sample
220  for(int i=0; i<sampler.sampleSize(); ++i)
221  {
222  int currentIndex = sampler[i];
223  processData(data[currentIndex]);
224  }
225  // create next sample
226  sampler.sample();
227  }
228  \endcode
229 */
230 template<class Random = MersenneTwister >
231 class Sampler
232 {
233  public:
234  /** Internal type of the indices.
235  Currently, 64-bit indices are not supported because this
236  requires extension of the random number generator classes.
237  */
238  typedef Int32 IndexType;
239 
241 
242  /** Type of the array view object that is returned by
243  sampledIndices() and oobIndices().
244  */
246 
247  private:
248  typedef std::map<IndexType, IndexArrayType> StrataIndicesType;
249  typedef std::map<IndexType, int> StrataSizesType;
252 
253  static const int oobInvalid = -1;
254 
255  int total_count_, sample_size_;
256  mutable int current_oob_count_;
257  StrataIndicesType strata_indices_;
258  StrataSizesType strata_sample_size_;
259  IndexArrayType current_sample_;
260  mutable IndexArrayType current_oob_sample_;
261  IsUsedArrayType is_used_;
262  Random const & random_;
263  SamplerOptions options_;
264 
265  void initStrataCount()
266  {
267  // compute how many samples to take from each stratum
268  // (may be unequal if sample_size_ is not a multiple of strataCount())
269  int strata_sample_size = (int)std::ceil(double(sample_size_) / strataCount());
270  int strata_total_count = strata_sample_size * strataCount();
271 
272  for(StrataIndicesType::iterator i = strata_indices_.begin();
273  i != strata_indices_.end(); ++i)
274  {
275  if(strata_total_count > sample_size_)
276  {
277  strata_sample_size_[i->first] = strata_sample_size - 1;
278  --strata_total_count;
279  }
280  else
281  {
282  strata_sample_size_[i->first] = strata_sample_size;
283  }
284  }
285  }
286 
287  public:
288 
289  /** Create a sampler for \a totalCount data objects.
290 
291  In each invocation of <tt>sample()</tt> below, it will sample
292  indices according to the options passed. If no options are given,
293  <tt>totalCount</tt> indices will be drawn with replacement.
294  */
296  Random const & rnd = Random(RandomSeed))
297  : total_count_(totalCount),
298  sample_size_(opt.sample_size == 0
299  ? (int)(std::ceil(total_count_ * opt.sample_proportion))
300  : opt.sample_size),
301  current_oob_count_(oobInvalid),
302  current_sample_(sample_size_),
303  current_oob_sample_(total_count_),
304  is_used_(total_count_),
305  random_(rnd),
306  options_(opt)
307  {
308  vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
309  "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
310 
311  vigra_precondition(!opt.stratified_sampling,
312  "Sampler(): Stratified sampling requested, but no strata given.");
313 
314  // initialize a single stratum containing all data
315  strata_indices_[0].resize(total_count_);
316  for(int i=0; i<total_count_; ++i)
317  strata_indices_[0][i] = i;
318 
319  initStrataCount();
320  //this is screwing up the random forest tests.
321  //sample();
322  }
323 
324  /** Create a sampler for stratified sampling.
325 
326  <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence
327  which specifies for each sample the stratum it belongs to. The
328  total number of data objects will be set to <tt>strataEnd - strataBegin</tt>.
329  Equally many samples (subject to rounding) will be drawn from each stratum,
330  unless the option object explicitly requests unstratified sampling,
331  in which case the strata are ignored.
332  */
333  template <class Iterator>
334  Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(),
335  Random const & rnd = Random(RandomSeed))
336  : total_count_(strataEnd - strataBegin),
337  sample_size_(opt.sample_size == 0
338  ? (int)(std::ceil(total_count_ * opt.sample_proportion))
339  : opt.sample_size),
340  current_oob_count_(oobInvalid),
341  current_sample_(sample_size_),
342  current_oob_sample_(total_count_),
343  is_used_(total_count_),
344  random_(rnd),
345  options_(opt)
346  {
347  vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
348  "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
349 
350  // copy the strata indices
351  if(opt.stratified_sampling)
352  {
353  for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin)
354  {
355  strata_indices_[*strataBegin].push_back(i);
356  }
357  }
358  else
359  {
360  strata_indices_[0].resize(total_count_);
361  for(int i=0; i<total_count_; ++i)
362  strata_indices_[0][i] = i;
363  }
364 
365  vigra_precondition(sample_size_ >= (int)strata_indices_.size(),
366  "Sampler(): Requested sample count must be at least as large as the number of strata.");
367 
368  initStrataCount();
369  //this is screwing up the random forest tests.
370  //sample();
371  }
372 
373  /** Return the k-th index in the current sample.
374  */
375  IndexType operator[](int k) const
376  {
377  return current_sample_[k];
378  }
379 
380  /** Create a new sample.
381  */
382  void sample();
383 
384  /** The total number of data elements.
385  */
386  int totalCount() const
387  {
388  return total_count_;
389  }
390 
391  /** The number of data elements that have been sampled.
392  */
393  int sampleSize() const
394  {
395  return sample_size_;
396  }
397 
398  /** Same as sampleSize().
399  */
400  int size() const
401  {
402  return sample_size_;
403  }
404 
405  /** The number of strata to be used.
406  Will be 1 if no strata are given. Will be ognored when
407  stratifiedSampling() is false.
408  */
409  int strataCount() const
410  {
411  return strata_indices_.size();
412  }
413 
414  /** Whether to use stratified sampling.
415  (If this is false, strata will be ignored even if present.)
416  */
417  bool stratifiedSampling() const
418  {
419  return options_.stratified_sampling;
420  }
421 
422  /** Whether sampling should be performed with replacement.
423  */
424  bool withReplacement() const
425  {
426  return options_.sample_with_replacement;
427  }
428 
429  /** Return an array view containing the indices in the current sample.
430  */
432  {
433  return current_sample_;
434  }
435 
436  /** Return an array view containing the out-of-bag indices.
437  (i.e. the indices that are not in the current sample)
438  */
440  {
441  if(current_oob_count_ == oobInvalid)
442  {
443  current_oob_count_ = 0;
444  for(int i = 0; i<total_count_; ++i)
445  {
446  if(!is_used_[i])
447  {
448  current_oob_sample_[current_oob_count_] = i;
449  ++current_oob_count_;
450  }
451  }
452  }
453  return current_oob_sample_.subarray(0, current_oob_count_);
454  }
455  IsUsedArrayType const & is_used() const
456  {
457  return is_used_;
458  }
459 };
460 
461 
462 template<class Random>
464 {
465  current_oob_count_ = oobInvalid;
466  is_used_.init(false);
467 
468  if(options_.sample_with_replacement)
469  {
470  //Go thru all strata
471  int j = 0;
472  StrataIndicesType::iterator iter;
473  for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
474  {
475  // do sampling with replacement in each strata and copy data.
476  int stratum_size = iter->second.size();
477  for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
478  {
479  current_sample_[j] = iter->second[random_.uniformInt(stratum_size)];
480  is_used_[current_sample_[j]] = true;
481  }
482  }
483  }
484  else
485  {
486  //Go thru all strata
487  int j = 0;
488  StrataIndicesType::iterator iter;
489  for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
490  {
491  // do sampling without replacement in each strata and copy data.
492  int stratum_size = iter->second.size();
493  for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
494  {
495  std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]);
496  current_sample_[j] = iter->second[i];
497  is_used_[current_sample_[j]] = true;
498  }
499  }
500  }
501 }
502 
503 template<class Random =RandomTT800 >
504 class PoissonSampler
505 {
506 public:
507  Random randfloat;
508  typedef Int32 IndexType;
509  typedef vigra::ArrayVector <IndexType> IndexArrayType;
510  IndexArrayType used_indices_;
511  double lambda;
512  int minIndex;
513  int maxIndex;
514 
515  PoissonSampler(double lambda,IndexType minIndex,IndexType maxIndex)
516  : lambda(lambda),
517  minIndex(minIndex),
518  maxIndex(maxIndex)
519  {}
520 
521  void sample( )
522  {
523  used_indices_.clear();
524  IndexType i;
525  for(i=minIndex;i<maxIndex;++i)
526  {
527  //from http://en.wikipedia.org/wiki/Poisson_distribution
528  int k=0;
529  double p=1;
530  double L=exp(-lambda);
531  do
532  {
533  ++k;
534  p*=randfloat.uniform53();
535 
536  }while(p>L);
537  --k;
538  //Insert i this many time
539  while(k>0)
540  {
541  used_indices_.push_back(i);
542  --k;
543  }
544  }
545  }
546 
547  IndexType const & operator[](int in) const
548  {
549  return used_indices_[in];
550  }
551 
552  int numOfSamples() const
553  {
554  return used_indices_.size();
555  }
556 };
557 
558 //@}
559 
560 } // namespace vigra
561 
562 #endif /*VIGRA_SAMPLING_HXX*/

© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de)
Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany

html generated using doxygen and Python
vigra 1.7.1 (Thu Jun 14 2012)