
rf_algorithm.hxx
/************************************************************************/
/*                                                                      */
/*        Copyright 2008-2009 by Rahul Nair                             */
/*                                                                      */
/*    This file is part of the VIGRA computer vision library.           */
/*    The VIGRA Website is                                              */
/*        http://hci.iwr.uni-heidelberg.de/vigra/                       */
/*    Please direct questions, bug reports, and contributions to        */
/*        ullrich.koethe@iwr.uni-heidelberg.de    or                    */
/*        vigra@informatik.uni-hamburg.de                               */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/
#ifndef VIGRA_RF_ALGORITHM_HXX
#define VIGRA_RF_ALGORITHM_HXX

#include <vector>
#include <queue>
#include <map>
#include <algorithm>
#include <fstream>
#include "splices.hxx"

namespace vigra
{

namespace rf
{
/** This namespace contains all algorithms developed for feature
 *  selection.
 */
namespace algorithms
{

namespace detail
{
    /** create a MultiArray containing only the columns supplied between
     *  iterators b and e
     */
    template<class OrigMultiArray,
             class Iter,
             class DestMultiArray>
    void choose(OrigMultiArray const & in,
                Iter const & b,
                Iter const & e,
                DestMultiArray & out)
    {
        int columnCount = std::distance(b, e);
        int rowCount    = in.shape(0);
        out.reshape(MultiArrayShape<2>::type(rowCount, columnCount));
        int ii = 0;
        for(Iter iter = b; iter != e; ++iter, ++ii)
        {
            columnVector(out, ii) = columnVector(in, *iter);
        }
    }
} // namespace detail

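/* A minimal usage sketch for detail::choose() (the matrix and the chosen
 * columns are hypothetical, for illustration only): copy columns 0 and 2
 * of a 2 x 3 matrix into a new 2 x 2 matrix.
 *
 * \code
 * MultiArray<2, double> in(MultiArrayShape<2>::type(2, 3));
 * std::vector<int> cols;
 * cols.push_back(0);
 * cols.push_back(2);
 * MultiArray<2, double> out;
 * detail::choose(in, cols.begin(), cols.end(), out);   // out is 2 x 2
 * \endcode
 */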
/** Standard random forest error rate callback functor
 *
 *  returns the random forest OOB error estimate when invoked.
 */
class RFErrorCallback
{
    RandomForestOptions options;

  public:
    /** Default constructor
     *
     * optionally supply options to the random forest classifier
     * \sa RandomForestOptions
     */
    RFErrorCallback(RandomForestOptions opt = RandomForestOptions())
    : options(opt)
    {}

    /** returns the RF OOB error estimate given features and
     *  labels
     */
    template<class Feature_t, class Response_t>
    double operator() (Feature_t const & features,
                       Response_t const & response)
    {
        RandomForest<> rf(options);
        visitors::OOB_Error oob;
        rf.learn(features,
                 response,
                 visitors::create_visitor(oob));
        return oob.oob_breiman;
    }
};
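/* A minimal usage sketch for RFErrorCallback, assuming the feature and
 * label factory functions from the usage examples further below:
 *
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * RFErrorCallback error_rate;                 // default RF options
 * double oob = error_rate(features, labels);  // OOB error estimate
 * \endcode
 */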

/** Structure to hold Variable Selection results
 */
class VariableSelectionResult
{
    bool initialized;

  public:
    VariableSelectionResult()
    : initialized(false)
    {}

    typedef std::vector<int>        FeatureList_t;
    typedef std::vector<double>     ErrorList_t;
    typedef FeatureList_t::iterator Pivot_t;

    Pivot_t pivot;

    /** list of features.
     */
    FeatureList_t selected;

    /** vector of size (number of features)
     *
     * the i-th entry encodes the error rate obtained
     * while using features [0 - i] (including i)
     *
     * if the i-th entry is -1, no error rate was obtained for it.
     * This may happen if more than one feature is added to the
     * selected list in one step of the algorithm.
     *
     * during initialisation the last entry (errors.back()) is always filled
     */
    ErrorList_t errors;

    /** error rate when no features are used
     */
    double no_features;

    template<class FeatureT,
             class ResponseT,
             class Iter,
             class ErrorRateCallBack>
    bool init(FeatureT const & all_features,
              ResponseT const & response,
              Iter b,
              Iter e,
              ErrorRateCallBack errorcallback)
    {
        bool ret_ = init(all_features, response, errorcallback);
        if(!ret_)
            return false;
        vigra_precondition(std::distance(b, e) == static_cast<std::ptrdiff_t>(selected.size()),
                           "Number of features in ranking != number of features matrix");
        std::copy(b, e, selected.begin());
        return true;
    }

    template<class FeatureT,
             class ResponseT,
             class Iter>
    bool init(FeatureT const & all_features,
              ResponseT const & response,
              Iter b,
              Iter e)
    {
        RFErrorCallback ecallback;
        return init(all_features, response, b, e, ecallback);
    }

    template<class FeatureT,
             class ResponseT>
    bool init(FeatureT const & all_features,
              ResponseT const & response)
    {
        return init(all_features, response, RFErrorCallback());
    }

    /** initialization routine. Will be called only once in the lifetime
     * of a VariableSelectionResult. Subsequent calls will not reinitialize
     * member variables.
     *
     * This is intended to allow continuing variable selection at a point
     * stopped in an earlier iteration.
     *
     * returns true if initialization was successful and false if
     * the object was already initialized before.
     */
    template<class FeatureT,
             class ResponseT,
             class ErrorRateCallBack>
    bool init(FeatureT const & all_features,
              ResponseT const & response,
              ErrorRateCallBack errorcallback)
    {
        if(initialized)
        {
            return false;
        }
        // calculate error with all features
        selected.resize(all_features.shape(1), 0);
        for(unsigned int ii = 0; ii < selected.size(); ++ii)
            selected[ii] = ii;
        errors.resize(all_features.shape(1), -1);
        errors.back() = errorcallback(all_features, response);

        // calculate the error rate if no features are chosen --
        // corresponds to max(prior probability) of the classes
        std::map<typename ResponseT::value_type, int> res_map;
        std::vector<int> cts;
        int counter = 0;
        for(int ii = 0; ii < response.shape(0); ++ii)
        {
            if(res_map.find(response(ii, 0)) == res_map.end())
            {
                res_map[response(ii, 0)] = counter;
                ++counter;
                cts.push_back(0);
            }
            cts[res_map[response(ii, 0)]] += 1;
        }
        no_features = double(*(std::max_element(cts.begin(),
                                                cts.end())))
                    / double(response.shape(0));

        initialized = true;
        pivot = selected.begin();
        return true;
    }
};
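/* A sketch of how a VariableSelectionResult can be inspected after a
 * selection run (assumes features/labels as in the examples below):
 * selected holds the feature indices in selection order, errors the
 * corresponding error rates, with -1 marking entries without an estimate.
 *
 * \code
 * VariableSelectionResult result;
 * forward_selection(features, labels, result);
 * for(unsigned int ii = 0; ii < result.errors.size(); ++ii)
 *     if(result.errors[ii] != -1)
 *         std::cout << "using features [0.." << ii << "]: "
 *                   << result.errors[ii] << "\n";
 * \endcode
 */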

/** Perform forward selection
 *
 * \param features IN:     n x p matrix containing n instances with p attributes/features
 *                         used in the variable selection algorithm
 * \param response IN:     n x 1 matrix containing the corresponding response
 * \param result   IN/OUT: VariableSelectionResult struct which will contain the results
 *                         of the algorithm.
 *                         Features between result.selected.begin() and result.pivot will
 *                         be left untouched.
 *                         \sa VariableSelectionResult
 * \param errorcallback
 *                 IN, OPTIONAL:
 *                         Functor that returns the error rate given a set of
 *                         features and labels. Default is the RandomForest OOB Error.
 *
 * Forward selection repeatedly adds the feature that decreases the error rate most.
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * VariableSelectionResult result;
 * forward_selection(features, labels, result);
 * \endcode
 * To use forward selection but ensure that a specific feature, e.g. feature 5, is always
 * included, one would do the following:
 *
 * \code
 * VariableSelectionResult result;
 * result.init(features, labels);
 * std::swap(result.selected[0], result.selected[5]);
 * result.pivot = result.selected.begin() + 1;
 * forward_selection(features, labels, result);
 * \endcode
 *
 * \sa VariableSelectionResult
 */
template<class FeatureT, class ResponseT, class ErrorRateCallBack>
void forward_selection(FeatureT const & features,
                       ResponseT const & response,
                       VariableSelectionResult & result,
                       ErrorRateCallBack errorcallback)
{
    VariableSelectionResult::FeatureList_t & selected = result.selected;
    VariableSelectionResult::ErrorList_t & errors     = result.errors;
    VariableSelectionResult::Pivot_t & pivot          = result.pivot;
    int featureCount = features.shape(1);
    // initialize result struct if in use for the first time
    if(!result.init(features, response, errorcallback))
    {
        // result is being reused -- just ensure that the number of features
        // is the same.
        vigra_precondition(static_cast<int>(selected.size()) == featureCount,
                           "forward_selection(): Number of features in Feature "
                           "matrix and number of features in previously used "
                           "result struct mismatch!");
    }

    int not_selected_size = std::distance(pivot, selected.end());
    while(not_selected_size > 1)
    {
        std::vector<double> current_errors;
        VariableSelectionResult::Pivot_t next = pivot;
        for(int ii = 0; ii < not_selected_size; ++ii, ++next)
        {
            std::swap(*pivot, *next);
            MultiArray<2, double> cur_feats;
            detail::choose(features,
                           selected.begin(),
                           pivot + 1,
                           cur_feats);
            double error = errorcallback(cur_feats, response);
            current_errors.push_back(error);
            std::swap(*pivot, *next);
        }
        int pos = std::distance(current_errors.begin(),
                                std::min_element(current_errors.begin(),
                                                 current_errors.end()));
        next = pivot;
        std::advance(next, pos);
        std::swap(*pivot, *next);
        errors[std::distance(selected.begin(), pivot)] = current_errors[pos];
        ++pivot;
        not_selected_size = std::distance(pivot, selected.end());
    }
}

template<class FeatureT, class ResponseT>
void forward_selection(FeatureT const & features,
                       ResponseT const & response,
                       VariableSelectionResult & result)
{
    forward_selection(features, response, result, RFErrorCallback());
}
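/* The error entries written by forward_selection() can be used to pick the
 * feature subset with the lowest estimated error -- a sketch (entries that
 * are -1 were not estimated and must be skipped):
 *
 * \code
 * forward_selection(features, labels, result);
 * int best = -1;
 * double best_err = 2.0;            // error rates lie in [0, 1]
 * for(unsigned int ii = 0; ii < result.errors.size(); ++ii)
 *     if(result.errors[ii] != -1 && result.errors[ii] < best_err)
 *     {
 *         best_err = result.errors[ii];
 *         best     = ii;
 *     }
 * // result.selected[0..best] is the subset with minimal estimated error
 * \endcode
 */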

/** Perform backward elimination
 *
 * \param features IN:     n x p matrix containing n instances with p attributes/features
 *                         used in the variable selection algorithm
 * \param response IN:     n x 1 matrix containing the corresponding response
 * \param result   IN/OUT: VariableSelectionResult struct which will contain the results
 *                         of the algorithm.
 *                         Features between result.pivot and result.selected.end() will
 *                         be left untouched.
 *                         \sa VariableSelectionResult
 * \param errorcallback
 *                 IN, OPTIONAL:
 *                         Functor that returns the error rate given a set of
 *                         features and labels. Default is the RandomForest OOB Error.
 *
 * Backward elimination repeatedly eliminates the feature that has the least influence
 * on the error rate.
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * VariableSelectionResult result;
 * backward_elimination(features, labels, result);
 * \endcode
 * To use backward elimination but ensure that a specific feature, e.g. feature 5, is always
 * excluded, one would do the following:
 *
 * \code
 * VariableSelectionResult result;
 * result.init(features, labels);
 * std::swap(result.selected[result.selected.size()-1], result.selected[5]);
 * result.pivot = result.selected.end() - 1;
 * backward_elimination(features, labels, result);
 * \endcode
 *
 * \sa VariableSelectionResult
 */
template<class FeatureT, class ResponseT, class ErrorRateCallBack>
void backward_elimination(FeatureT const & features,
                          ResponseT const & response,
                          VariableSelectionResult & result,
                          ErrorRateCallBack errorcallback)
{
    int featureCount = features.shape(1);
    VariableSelectionResult::FeatureList_t & selected = result.selected;
    VariableSelectionResult::ErrorList_t & errors     = result.errors;
    VariableSelectionResult::Pivot_t & pivot          = result.pivot;

    // initialize result struct if in use for the first time
    if(!result.init(features, response, errorcallback))
    {
        // result is being reused -- just ensure that the number of features
        // is the same.
        vigra_precondition(static_cast<int>(selected.size()) == featureCount,
                           "backward_elimination(): Number of features in Feature "
                           "matrix and number of features in previously used "
                           "result struct mismatch!");
    }
    pivot = selected.end() - 1;

    int selected_size = std::distance(selected.begin(), pivot);
    while(selected_size > 1)
    {
        VariableSelectionResult::Pivot_t next = selected.begin();
        std::vector<double> current_errors;
        for(int ii = 0; ii < selected_size; ++ii, ++next)
        {
            std::swap(*pivot, *next);
            MultiArray<2, double> cur_feats;
            detail::choose(features,
                           selected.begin(),
                           pivot,
                           cur_feats);
            double error = errorcallback(cur_feats, response);
            current_errors.push_back(error);
            std::swap(*pivot, *next);
        }
        int pos = std::distance(current_errors.begin(),
                                std::max_element(current_errors.begin(),
                                                 current_errors.end()));
        next = selected.begin();
        std::advance(next, pos);
        std::swap(*pivot, *next);
        errors[std::distance(selected.begin(), pivot)] = current_errors[pos];
        selected_size = std::distance(selected.begin(), pivot);
        --pivot;
    }
}

template<class FeatureT, class ResponseT>
void backward_elimination(FeatureT const & features,
                          ResponseT const & response,
                          VariableSelectionResult & result)
{
    backward_elimination(features, response, result, RFErrorCallback());
}

/** Perform rank selection using a predefined ranking
 *
 * \param features IN:     n x p matrix containing n instances with p attributes/features
 *                         used in the variable selection algorithm
 * \param response IN:     n x 1 matrix containing the corresponding response
 * \param result   IN/OUT: VariableSelectionResult struct which will contain the results
 *                         of the algorithm. The struct should be initialized with the
 *                         predefined ranking.
 *                         \sa VariableSelectionResult
 * \param errorcallback
 *                 IN, OPTIONAL:
 *                         Functor that returns the error rate given a set of
 *                         features and labels. Default is the RandomForest OOB Error.
 *
 * Often a variable importance or score measure is used to create the ordering in which
 * variables have to be selected. This method takes such a ranking and calculates the
 * corresponding error rates.
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * std::vector<int> ranking = createRanking(features);
 * VariableSelectionResult result;
 * result.init(features, labels, ranking.begin(), ranking.end());
 * rank_selection(features, labels, result);
 * \endcode
 *
 * \sa VariableSelectionResult
 */
template<class FeatureT, class ResponseT, class ErrorRateCallBack>
void rank_selection(FeatureT const & features,
                    ResponseT const & response,
                    VariableSelectionResult & result,
                    ErrorRateCallBack errorcallback)
{
    VariableSelectionResult::FeatureList_t & selected = result.selected;
    VariableSelectionResult::ErrorList_t & errors     = result.errors;
    VariableSelectionResult::Pivot_t & iter           = result.pivot;
    int featureCount = features.shape(1);
    // initialize result struct if in use for the first time
    if(!result.init(features, response, errorcallback))
    {
        // result is being reused -- just ensure that the number of features
        // is the same.
        vigra_precondition(static_cast<int>(selected.size()) == featureCount,
                           "rank_selection(): Number of features in Feature "
                           "matrix and number of features in previously used "
                           "result struct mismatch!");
    }

    for(; iter != selected.end(); ++iter)
    {
        MultiArray<2, double> cur_feats;
        detail::choose(features,
                       selected.begin(),
                       iter,
                       cur_feats);
        double error = errorcallback(cur_feats, response);
        errors[std::distance(selected.begin(), iter)] = error;
    }
}

template<class FeatureT, class ResponseT>
void rank_selection(FeatureT const & features,
                    ResponseT const & response,
                    VariableSelectionResult & result)
{
    rank_selection(features, response, result, RFErrorCallback());
}


enum ClusterLeafTypes { c_Leaf = 95, c_Node = 99 };

/* View of a node in the hierarchical clustering class.
 * For internal use only -
 * \sa NodeBase
 */
class ClusterNode
: public NodeBase
{
  public:

    typedef NodeBase BT;

    /** constructors **/
    ClusterNode()
    : NodeBase()
    {}

    ClusterNode(int nCol,
                BT::T_Container_type & topology,
                BT::P_Container_type & split_param)
    : BT(nCol + 5, 5, topology, split_param)
    {
        status() = 0;
        BT::column_data()[0] = nCol;
        if(nCol == 1)
            BT::typeID() = c_Leaf;
        else
            BT::typeID() = c_Node;
    }

    ClusterNode(BT::T_Container_type const & topology,
                BT::P_Container_type const & split_param,
                int n)
    : NodeBase(5, 5, topology, split_param, n)
    {
        //TODO: is there a more elegant way to do this?
        BT::topology_size_ += BT::column_data()[0];
    }

    ClusterNode(BT & node_)
    : BT(5, 5, node_)
    {
        //TODO: is there a more elegant way to do this?
        BT::topology_size_  += BT::column_data()[0];
        BT::parameter_size_ += 0;
    }

    int index()
    {
        return static_cast<int>(BT::parameters_begin()[1]);
    }
    void set_index(int in)
    {
        BT::parameters_begin()[1] = in;
    }
    double & mean()
    {
        return BT::parameters_begin()[2];
    }
    double & stdev()
    {
        return BT::parameters_begin()[3];
    }
    double & status()
    {
        return BT::parameters_begin()[4];
    }
};

/** Stack entry class for the HClustering class
 */
struct HC_Entry
{
    int parent;
    int level;
    int addr;
    bool infm;
    HC_Entry(int p, int l, int a, bool in)
    : parent(p), level(l), addr(a), infm(in)
    {}
};


/** Hierarchical clustering class.
 *  Performs single linkage clustering.
 * \code
 * HClustering linkage;
 * Matrix<double> distance = get_distance_matrix();
 * linkage.cluster(distance);
 * // draw the clustering tree
 * Draw<double, int> draw(features, labels, "linkagetree.graph");
 * linkage.breadth_first_traversal(draw);
 * \endcode
 * \sa ClusterImportanceVisitor
 *
 * Once the clustering has taken place, information queries can be made
 * using the breadth_first_traversal() and iterate() methods.
 */
class HClustering
{
  public:
    typedef MultiArrayShape<2>::type Shp;
    ArrayVector<int>    topology_;
    ArrayVector<double> parameters_;
    int begin_addr;

    /** distance between two merged clusters
     *  (single linkage: the minimum of the two distances)
     */
    double dist_func(double a, double b)
    {
        return std::min(a, b);
    }

    /** Visit each node with a Functor
     *  in creation order (should be depth first)
     */
    template<class Functor>
    void iterate(Functor & tester)
    {
        std::vector<int> stack;
        stack.push_back(begin_addr);
        while(!stack.empty())
        {
            ClusterNode node(topology_, parameters_, stack.back());
            stack.pop_back();
            if(!tester(node))
            {
                if(node.columns_size() != 1)
                {
                    stack.push_back(node.child(0));
                    stack.push_back(node.child(1));
                }
            }
        }
    }

    /** Perform breadth first traversal of the hierarchical cluster tree
     */
    template<class Functor>
    void breadth_first_traversal(Functor & tester)
    {
        std::queue<HC_Entry> queue;
        int level   = 0;
        int parent  = -1;
        int addr    = -1;
        bool infm   = false;
        queue.push(HC_Entry(parent, level, begin_addr, infm));
        while(!queue.empty())
        {
            level  = queue.front().level;
            parent = queue.front().parent;
            addr   = queue.front().addr;
            infm   = queue.front().infm;
            ClusterNode node(topology_, parameters_, queue.front().addr);
            ClusterNode parnt;
            if(parent != -1)
            {
                parnt = ClusterNode(topology_, parameters_, parent);
            }
            queue.pop();
            bool istrue = tester(node, level, parnt, infm);
            if(node.columns_size() != 1)
            {
                queue.push(HC_Entry(addr, level + 1, node.child(0), istrue));
                queue.push(HC_Entry(addr, level + 1, node.child(1), istrue));
            }
        }
    }
#ifdef HasHDF5
    /** save to HDF5 - defunct - has to be updated to the new HDF5 interface
     */
    void save(std::string file, std::string prefix)
    {
        vigra::writeHDF5(file.c_str(), (prefix + "topology").c_str(),
                         MultiArrayView<2, int>(
                             Shp(topology_.size(), 1),
                             topology_.data()));
        vigra::writeHDF5(file.c_str(), (prefix + "parameters").c_str(),
                         MultiArrayView<2, double>(
                             Shp(parameters_.size(), 1),
                             parameters_.data()));
        vigra::writeHDF5(file.c_str(), (prefix + "begin_addr").c_str(),
                         MultiArrayView<2, int>(Shp(1, 1), &begin_addr));
    }
#endif

    /** Perform single linkage clustering
     * \param distance distance matrix used. \sa CorrelationVisitor
     */
    template<class T, class C>
    void cluster(MultiArrayView<2, T, C> distance)
    {
        MultiArray<2, T> dist(distance);
        std::vector<std::pair<int, int> > addr;
        int index = 0;
        for(int ii = 0; ii < distance.shape(0); ++ii)
        {
            addr.push_back(std::make_pair(topology_.size(), ii));
            ClusterNode leaf(1, topology_, parameters_);
            leaf.set_index(index);
            ++index;
            leaf.columns_begin()[0] = ii;
        }

        while(addr.size() != 1)
        {
            // find the two nodes with the smallest distance
            int ii_min = 0;
            int jj_min = 1;
            double min_dist = dist((addr.begin() + ii_min)->second,
                                   (addr.begin() + jj_min)->second);
            for(unsigned int ii = 0; ii < addr.size(); ++ii)
            {
                for(unsigned int jj = ii + 1; jj < addr.size(); ++jj)
                {
                    if(min_dist > dist((addr.begin() + ii)->second,
                                       (addr.begin() + jj)->second))
                    {
                        min_dist = dist((addr.begin() + ii)->second,
                                        (addr.begin() + jj)->second);
                        ii_min = ii;
                        jj_min = jj;
                    }
                }
            }

            // merge the two nodes
            int col_size = 0;
            // Creating a new node invalidates the iterators stored in
            // firstChild and secondChild, therefore query the column
            // counts in a separate scope.
            {
                ClusterNode firstChild(topology_,
                                       parameters_,
                                       (addr.begin() + ii_min)->first);
                ClusterNode secondChild(topology_,
                                        parameters_,
                                        (addr.begin() + jj_min)->first);
                col_size = firstChild.columns_size() + secondChild.columns_size();
            }
            int cur_addr = topology_.size();
            begin_addr = cur_addr;
            ClusterNode parent(col_size,
                               topology_,
                               parameters_);
            ClusterNode firstChild(topology_,
                                   parameters_,
                                   (addr.begin() + ii_min)->first);
            ClusterNode secondChild(topology_,
                                    parameters_,
                                    (addr.begin() + jj_min)->first);
            parent.parameters_begin()[0] = min_dist;
            parent.set_index(index);
            ++index;
            std::merge(firstChild.columns_begin(), firstChild.columns_end(),
                       secondChild.columns_begin(), secondChild.columns_end(),
                       parent.columns_begin());
            // merge nodes in addr
            int to_keep;
            int to_desc;
            int ii_keep;
            if(*parent.columns_begin() == *firstChild.columns_begin())
            {
                parent.child(0) = (addr.begin() + ii_min)->first;
                parent.child(1) = (addr.begin() + jj_min)->first;
                (addr.begin() + ii_min)->first = cur_addr;
                ii_keep = ii_min;
                to_keep = (addr.begin() + ii_min)->second;
                to_desc = (addr.begin() + jj_min)->second;
                addr.erase(addr.begin() + jj_min);
            }
            else
            {
                parent.child(1) = (addr.begin() + ii_min)->first;
                parent.child(0) = (addr.begin() + jj_min)->first;
                (addr.begin() + jj_min)->first = cur_addr;
                ii_keep = jj_min;
                to_keep = (addr.begin() + jj_min)->second;
                to_desc = (addr.begin() + ii_min)->second;
                addr.erase(addr.begin() + ii_min);
            }
            // update distances
            for(unsigned int jj = 0; jj < addr.size(); ++jj)
            {
                if(static_cast<int>(jj) == ii_keep)
                    continue;
                double new_dist = dist_func(
                        dist(to_desc, (addr.begin() + jj)->second),
                        dist((addr.begin() + ii_keep)->second,
                             (addr.begin() + jj)->second));

                dist((addr.begin() + ii_keep)->second,
                     (addr.begin() + jj)->second) = new_dist;
                dist((addr.begin() + jj)->second,
                     (addr.begin() + ii_keep)->second) = new_dist;
            }
        }
    }
};
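/* A sketch of a custom traversal functor for HClustering::iterate() --
 * here a hypothetical leaf counter, not part of the library. Returning
 * false from operator() tells iterate() to descend into the children.
 *
 * \code
 * struct CountLeaves
 * {
 *     int count;
 *     CountLeaves() : count(0) {}
 *     bool operator()(ClusterNode & node)
 *     {
 *         if(node.columns_size() == 1)   // single-variable node = leaf
 *             ++count;
 *         return false;                  // keep descending
 *     }
 * };
 *
 * HClustering linkage;
 * linkage.cluster(distance);
 * CountLeaves counter;
 * linkage.iterate(counter);
 * \endcode
 */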

/** Normalize the status value in the HClustering tree (HClustering visitor)
 */
class NormalizeStatus
{
  public:
    double n;
    /** Constructor
     * \param m normalize status() by m
     */
    NormalizeStatus(double m)
    : n(m)
    {}

    template<class Node>
    bool operator()(Node & node)
    {
        node.status() /= n;
        return false;
    }
};

/** Perform permutation importance on HClustering clusters
 *  (See the visit_after_tree() method of visitors::VariableImportance for
 *  the basic idea; here the permutation is applied not only to single
 *  variables but to whole clusters.)
 */
template<class Iter, class DT>
class PermuteCluster
{
  public:
    Matrix<double> tmp_mem_;
    MultiArrayView<2, double> perm_imp;
    MultiArrayView<2, double> orig_imp;
    Matrix<double> feats_;
    Matrix<int>    labels_;
    const int nPerm;
    DT const & dt;
    int index;
    int oob_size;

    template<class Feat_T, class Label_T>
    PermuteCluster(Iter a,
                   Iter b,
                   Feat_T const & feats,
                   Label_T const & labls,
                   MultiArrayView<2, double> p_imp,
                   MultiArrayView<2, double> o_imp,
                   int np,
                   DT const & dt_)
    : tmp_mem_(_spl(a, b).size(), feats.shape(1)),
      perm_imp(p_imp),
      orig_imp(o_imp),
      feats_(_spl(a, b).size(), feats.shape(1)),
      labels_(_spl(a, b).size(), 1),
      nPerm(np),
      dt(dt_),
      index(0),
      oob_size(b - a)
    {
        copy_splice(_spl(a, b),
                    _spl(feats.shape(1)),
                    feats,
                    feats_);
        copy_splice(_spl(a, b),
                    _spl(labls.shape(1)),
                    labls,
                    labels_);
    }

    template<class Node>
    bool operator()(Node & node)
    {
        tmp_mem_ = feats_;
        RandomMT19937 random;
        int class_count = perm_imp.shape(1) - 1;
        // permute the columns of one cluster together
        for(int kk = 0; kk < nPerm; ++kk)
        {
            tmp_mem_ = feats_;
            for(int ii = 0; ii < rowCount(feats_); ++ii)
            {
                // row whose cluster columns are copied into row ii
                int rand_index = random.uniformInt(rowCount(feats_) - ii) + ii;
                for(int jj = 0; jj < node.columns_size(); ++jj)
                {
                    if(node.columns_begin()[jj] != feats_.shape(1))
                        tmp_mem_(ii, node.columns_begin()[jj])
                            = tmp_mem_(rand_index, node.columns_begin()[jj]);
                }
            }

            for(int ii = 0; ii < rowCount(tmp_mem_); ++ii)
            {
                if(dt.predictLabel(rowVector(tmp_mem_, ii))
                   == labels_(ii, 0))
                {
                    // per class
                    ++perm_imp(index, labels_(ii, 0));
                    // total
                    ++perm_imp(index, class_count);
                }
            }
        }
        double node_status = perm_imp(index, class_count);
        node_status /= nPerm;
        node_status -= orig_imp(0, class_count);
        node_status *= -1;
        node_status /= oob_size;
        node.status() += node_status;
        ++index;

        return false;
    }
};

/** Convert the clustering tree into a list (HClustering visitor)
 */
class GetClusterVariables
{
  public:
    /** NumberOfClusters x NumberOfVariables MultiArrayView containing
     *  in each row the variables belonging to a cluster
     */
    MultiArrayView<2, int> variables;
    int index;
    GetClusterVariables(MultiArrayView<2, int> vars)
    : variables(vars), index(0)
    {}

#ifdef HasHDF5
    void save(std::string file, std::string prefix)
    {
        vigra::writeHDF5(file.c_str(), (prefix + "_variables").c_str(),
                         variables);
    }
#endif

    template<class Node>
    bool operator()(Node & node)
    {
        for(int ii = 0; ii < node.columns_size(); ++ii)
            variables(index, ii) = node.columns_begin()[ii];
        ++index;
        return false;
    }
};
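/* A sketch of extracting the cluster membership table with
 * GetClusterVariables (assumes a clustered HClustering instance named
 * linkage over p features; a binary linkage tree has 2*p - 1 nodes):
 *
 * \code
 * MultiArray<2, int> variables(MultiArrayShape<2>::type(2*p - 1, p), -1);
 * GetClusterVariables gcv(variables);
 * linkage.iterate(gcv);
 * // row ii of variables now lists the variables of cluster ii, padded with -1
 * \endcode
 */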
/** corrects the status fields of a linkage clustering (HClustering visitor)
 *
 *  such that status(currentNode) = min(status(parent), status(currentNode))
 *  \sa cluster_permutation_importance()
 */
class CorrectStatus
{
  public:
    template<class Nde>
    bool operator()(Nde & cur, int level, Nde parent, bool infm)
    {
        if(parent.hasData_)
            cur.status() = std::min(parent.status(), cur.status());
        return true;
    }
};


/** draw the current linkage clustering (HClustering visitor)
 *
 * creates a graphviz .dot file
 * usage:
 * \code
 * HClustering linkage;
 * Matrix<double> distance = get_distance_matrix();
 * linkage.cluster(distance);
 * Draw<double, int> draw(features, labels, "linkagetree.graph");
 * linkage.breadth_first_traversal(draw);
 * \endcode
 */
template<class T1,
         class T2,
         class C1 = UnstridedArrayTag,
         class C2 = UnstridedArrayTag>
class Draw
{
  public:
    typedef MultiArrayShape<2>::type Shp;
    MultiArrayView<2, T1, C1> const & features_;
    MultiArrayView<2, T2, C2> const & labels_;
    std::ofstream graphviz;

    Draw(MultiArrayView<2, T1, C1> const & features,
         MultiArrayView<2, T2, C2> const & labels,
         std::string const gz)
    : features_(features), labels_(labels),
      graphviz(gz.c_str(), std::ios::out)
    {
        graphviz << "digraph G\n{\n node [shape=\"record\"]";
    }

    ~Draw()
    {
        graphviz << "\n}\n";
        graphviz.close();
    }

    template<class Nde>
    bool operator()(Nde & cur, int level, Nde parent, bool infm)
    {
        graphviz << "node" << cur.index() << " [style=\"filled\"][label = \" #Feats: " << cur.columns_size() << "\\n";
        graphviz << " status: " << cur.status() << "\\n";
        for(int kk = 0; kk < cur.columns_size(); ++kk)
        {
            graphviz << cur.columns_begin()[kk] << " ";
            if(kk % 15 == 14)
                graphviz << "\\n";
        }
        graphviz << "\"] [color = \"" << cur.status() << " 1.000 1.000\"];\n";
        if(parent.hasData_)
            graphviz << "\"node" << parent.index() << "\" -> \"node" << cur.index() << "\";\n";
        return true;
    }
};

/** calculate cluster-based permutation importance while learning (RandomForestVisitor)
 */
class ClusterImportanceVisitor : public visitors::VisitorBase
{
  public:

    /** List of variables as produced by GetClusterVariables
     */
    MultiArray<2, int> variables;
    /** Corresponding importance measures
     */
    MultiArray<2, double> cluster_importance_;
    /** Corresponding error
     */
    MultiArray<2, double> cluster_stdev_;
    int repetition_count_;
    bool in_place_;
    HClustering & clustering;

#ifdef HasHDF5
    void save(std::string filename, std::string prefix)
    {
        std::string prefix1 = "cluster_importance_" + prefix;
        writeHDF5(filename.c_str(),
                  prefix1.c_str(),
                  cluster_importance_);
        prefix1 = "vars_" + prefix;
        writeHDF5(filename.c_str(),
                  prefix1.c_str(),
                  variables);
    }
#endif

    ClusterImportanceVisitor(HClustering & clst, int rep_cnt = 10)
    : repetition_count_(rep_cnt), in_place_(false), clustering(clst)
    {}

    /** Allocate enough memory
     */
    template<class RF, class PR>
    void visit_at_beginning(RF const & rf, PR const & pr)
    {
        Int32 const class_count  = rf.ext_param_.class_count_;
        Int32 const column_count = rf.ext_param_.column_count_ + 1;
        cluster_importance_
            .reshape(MultiArrayShape<2>::type(2*column_count - 1,
                                              class_count + 1));
        cluster_stdev_
            .reshape(MultiArrayShape<2>::type(2*column_count - 1,
                                              class_count + 1));
        variables
            .reshape(MultiArrayShape<2>::type(2*column_count - 1,
                                              column_count), -1);
        GetClusterVariables gcv(variables);
        clustering.iterate(gcv);
    }

    /** compute permutation based variable importance.
     * (Only an array of size oob_sample_count x 1 is created
     * - as opposed to oob_sample_count x feature_count in the other method.)
     *
     * \sa FieldProxy
     */
    template<class RF, class PR, class SM, class ST>
    void after_tree_ip_impl(RF & rf, PR & pr, SM & sm, ST & st, int index)
    {
        typedef MultiArrayShape<2>::type Shp_t;
        Int32 column_count = rf.ext_param_.column_count_ + 1;
        Int32 class_count  = rf.ext_param_.class_count_;

        // remove the const cast on the features (yep, I know what I am
        // doing here.) data is not destroyed.
        typename PR::Feature_t & features
            = const_cast<typename PR::Feature_t &>(pr.features());

        // find the oob indices of the current tree.
        ArrayVector<Int32> oob_indices;
        ArrayVector<Int32>::iterator iter;

        if(rf.ext_param_.actual_msample_ < pr.features().shape(0) - 10000)
        {
            ArrayVector<int> cts(2, 0);
            ArrayVector<Int32> indices;
            indices.reserve(pr.features().shape(0));
            for(int ii = 0; ii < pr.features().shape(0); ++ii)
                indices.push_back(ii);
            std::random_shuffle(indices.begin(), indices.end());
            for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
            {
                if(!sm.is_used()[indices[ii]] && cts[pr.response()(indices[ii], 0)] < 3000)
                {
                    oob_indices.push_back(indices[ii]);
                    ++cts[pr.response()(indices[ii], 0)];
                }
            }
        }
        else
        {
            for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
                if(!sm.is_used()[ii])
                    oob_indices.push_back(ii);
        }

        // random number generation
        RandomMT19937 random(RandomSeed);
        UniformIntRandomFunctor<RandomMT19937> randint(random);

        // make some space for the results
        MultiArray<2, double> oob_right(Shp_t(1, class_count + 1));

        // get the oob success rate with the original samples
        for(iter = oob_indices.begin();
            iter != oob_indices.end();
            ++iter)
        {
            if(rf.tree(index)
                 .predictLabel(rowVector(features, *iter))
               == pr.response()(*iter, 0))
            {
                // per class
                ++oob_right[pr.response()(*iter, 0)];
                // total
                ++oob_right[class_count];
            }
        }

        MultiArray<2, double> perm_oob_right(Shp_t(2*column_count - 1, class_count + 1));

        PermuteCluster<ArrayVector<Int32>::iterator, typename RF::DecisionTree_t>
            pc(oob_indices.begin(), oob_indices.end(),
               pr.features(),
               pr.response(),
               perm_oob_right,
               oob_right,
               repetition_count_,
               rf.tree(index));
        clustering.iterate(pc);

        perm_oob_right /= repetition_count_;
        for(int ii = 0; ii < rowCount(perm_oob_right); ++ii)
            rowVector(perm_oob_right, ii) -= oob_right;

        perm_oob_right *= -1;
        perm_oob_right /= oob_indices.size();
        cluster_importance_ += perm_oob_right;
    }

    /** calculate permutation based impurity after every tree has been
     * learned. Default behaviour is that this happens out of place.
     * If you have very big data sets and want to avoid copying of data,
     * set the in_place_ flag to true.
     */
    template<class RF, class PR, class SM, class ST>
    void visit_after_tree(RF & rf, PR & pr, SM & sm, ST & st, int index)
    {
        after_tree_ip_impl(rf, pr, sm, st, index);
    }

    /** Normalise variable importance after the number of trees is known.
     */
    template<class RF, class PR>
    void visit_at_end(RF & rf, PR & pr)
    {
        NormalizeStatus nrm(rf.tree_count());
        clustering.iterate(nrm);
        cluster_importance_ /= rf.trees_.size();
    }
};

/** Perform hierarchical clustering of variables and assess the importance of clusters
 *
 * \param features IN:  n x p matrix containing n instances with p attributes/features
 *                      used in the variable selection algorithm
 * \param response IN:  n x 1 matrix containing the corresponding response
 * \param linkage  OUT: hierarchical grouping of variables.
 * \param distance OUT: distance matrix used for creating the linkage
 *
 * Performs hierarchical clustering of variables and calculates the permutation importance
 * measure of each of the clusters. Use the Draw functor to create human readable output.
 * The cluster permutation importance measure corresponds to the normal permutation importance
 * measure with all columns belonging to a cluster permuted together.
 * The importance measure for each cluster is stored in the status() field of each cluster node.
 * \sa HClustering
 *
 * usage:
 * \code
 * MultiArray<2, double> features = createSomeFeatures();
 * MultiArray<2, int>    labels   = createCorrespondingLabels();
 * HClustering linkage;
 * MultiArray<2, double> distance;
 * cluster_permutation_importance(features, labels, linkage, distance);
 *
 * // create graphviz output
 * Draw<double, int> draw(features, labels, "linkagetree.graph");
 * linkage.breadth_first_traversal(draw);
 * \endcode
 */
template<class FeatureT, class ResponseT>
void cluster_permutation_importance(FeatureT const & features,
                                    ResponseT const & response,
                                    HClustering & linkage,
                                    MultiArray<2, double> & distance)
{
    RandomForestOptions opt;
    opt.tree_count(100);
    if(features.shape(0) > 40000)
        opt.samples_per_tree(20000).use_stratification(RF_EQUAL);

    // learn a first forest to obtain the variable distance matrix
    vigra::RandomForest<int> RF(opt);
    visitors::RandomForestProgressVisitor progress;
    visitors::CorrelationVisitor missc;
    RF.learn(features, response,
             create_visitor(missc, progress));
    distance = missc.distance;

    // produce the linkage
    linkage.cluster(distance);

    // learn a second forest to assess the cluster importances
    vigra::RandomForest<int> RF2(opt);
    ClusterImportanceVisitor ci(linkage);
    RF2.learn(features,
              response,
              create_visitor(progress, ci));

    CorrectStatus cs;
    linkage.breadth_first_traversal(cs);
}

template<class FeatureT, class ResponseT>
void cluster_permutation_importance(FeatureT const & features,
                                    ResponseT const & response,
                                    HClustering & linkage)
{
    MultiArray<2, double> distance;
    cluster_permutation_importance(features, response, linkage, distance);
}

} //namespace algorithms
} //namespace rf
} //namespace vigra

#endif // VIGRA_RF_ALGORITHM_HXX
