SHOGUN
v1.1.0
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2006 Christian Gehl
 * Written (W) 2006-2009 Soeren Sonnenburg
 * Written (W) 2011 Sergey Lisitsyn
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */

#include <shogun/classifier/KNN.h>
#include <shogun/features/Labels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>
#include <shogun/base/Parameter.h>

using namespace shogun;

CKNN::CKNN()
: CDistanceMachine()
{
	init();
}

CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
: CDistanceMachine()
{
	init();

	m_k=k;

	ASSERT(d);
	ASSERT(trainlab);

	set_distance(d);
	set_labels(trainlab);
	train_labels.vlen=trainlab->get_num_labels();
}

void CKNN::init()
{
	/* do not store model features by default (CDistanceMachine::apply(...) is
	 * overwritten) */
	set_store_model_features(false);

	m_k=3;
	m_q=1.0;
	num_classes=0;

	m_parameters->add(&m_k, "k", "Parameter k");
	m_parameters->add(&m_q, "q", "Parameter q");
	m_parameters->add(&num_classes, "num_classes", "Number of classes");
}

CKNN::~CKNN()
{
	SG_FREE(train_labels.vector);
}

bool CKNN::train_machine(CFeatures* data)
{
	ASSERT(labels);
	ASSERT(distance);

	if (data)
	{
		if (labels->get_num_labels() != data->get_num_vectors())
			SG_ERROR("Number of training vectors does not match number of labels\n");
		distance->init(data, data);
	}

	SGVector<int32_t> lab=labels->get_int_labels();
	train_labels.vlen=lab.vlen;
	train_labels.vector=CMath::clone_vector(lab.vector, lab.vlen);
	lab.free_vector();
	ASSERT(train_labels.vlen>0);

	int32_t max_class=train_labels.vector[0];
	int32_t min_class=train_labels.vector[0];

	int32_t i;
	for (i=1; i<train_labels.vlen; i++)
	{
		max_class=CMath::max(max_class, train_labels.vector[i]);
		min_class=CMath::min(min_class, train_labels.vector[i]);
	}

	// shift labels to the range 0..num_classes-1
	for (i=0; i<train_labels.vlen; i++)
		train_labels.vector[i]-=min_class;

	min_label=min_class;
	num_classes=max_class-min_class+1;

	SG_INFO("num_classes: %d (%+d to %+d) num_train: %d\n", num_classes,
			min_class, max_class, train_labels.vlen);
	return true;
}

CLabels* CKNN::apply()
{
	ASSERT(num_classes>0);
	ASSERT(distance);
	ASSERT(distance->get_num_vec_rhs());

	int32_t num_lab=distance->get_num_vec_rhs();
	ASSERT(m_k<=num_lab);

	CLabels* output=new CLabels(num_lab);

	//distances to train data and working buffer of train_labels
	float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
	int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);

	SG_INFO("%d test examples\n", num_lab);
	CSignal::clear_cancel();

	//vote histogram over the classes
	float64_t* classes=SG_MALLOC(float64_t, num_classes);

	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i, 0, num_lab);

		// lhs idx 0..n-1 and rhs idx i
		distances_lhs(dists,0,train_labels.vlen-1,i);
		int32_t j;

		for (j=0; j<train_labels.vlen; j++)
			train_lab[j]=train_labels.vector[j];

		//sort the distance vector for test example i to all train examples;
		//train_lab[0..k-1] then holds the classes of the k nearest neighbours
		CMath::qsort_index(dists, train_lab, train_labels.vlen);

		//compute histogram of class outputs of the first k nearest neighbours
		for (j=0; j<num_classes; j++)
			classes[j]=0.0;

		//each vote is scaled by 'multiplier'; with the default q=1.0
		//all k neighbours are weighted equally
		float64_t multiplier = m_q;
		for (j=0; j<m_k; j++)
		{
			classes[train_lab[j]]+= multiplier;
			multiplier*= multiplier;
		}

		//choose the class that got 'outputted' most often
		int32_t out_idx=0;
		float64_t out_max=0;

		for (j=0; j<num_classes; j++)
		{
			if (out_max< classes[j])
			{
				out_idx= j;
				out_max= classes[j];
			}
		}
		output->set_label(i, out_idx+min_label);
	}

	SG_FREE(classes);
	SG_FREE(dists);
	SG_FREE(train_lab);

	return output;
}

CLabels* CKNN::apply(CFeatures* data)
{
	init_distance(data);

	// redirecting to fast (without sorting) classify if k==1
	if (m_k == 1)
		return classify_NN();

	return apply();
}

CLabels* CKNN::classify_NN()
{
	ASSERT(distance);
	ASSERT(num_classes>0);

	int32_t num_lab = distance->get_num_vec_rhs();
	ASSERT(num_lab);

	CLabels* output = new CLabels(num_lab);
	float64_t* distances = SG_MALLOC(float64_t, train_labels.vlen);

	SG_INFO("%d test examples\n", num_lab);
	CSignal::clear_cancel();

	// for each test example
	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i,0,num_lab);

		// get distances from i-th test example to 0..num_train_labels-1 train examples
		distances_lhs(distances,0,train_labels.vlen-1,i);
		int32_t j;

		// assuming 0th train example as nearest to i-th test example
		int32_t out_idx = 0;
		float64_t min_dist = distances[0];

		// searching for nearest neighbor by comparing distances
		for (j=0; j<train_labels.vlen; j++)
		{
			if (distances[j]<min_dist)
			{
				min_dist = distances[j];
				out_idx = j;
			}
		}

		// label i-th test example with label of nearest neighbor with out_idx index
		output->set_label(i,train_labels.vector[out_idx]+min_label);
	}

	SG_FREE(distances);
	return output;
}

SGMatrix<int32_t> CKNN::classify_for_multiple_k()
{
	ASSERT(num_classes>0);
	ASSERT(distance);
	ASSERT(distance->get_num_vec_rhs());

	int32_t num_lab=distance->get_num_vec_rhs();
	ASSERT(m_k<=num_lab);

	int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);

	//distances to train data and working buffer of train_labels
	float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
	int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);

	//vote histogram over the classes
	int32_t* classes=SG_MALLOC(int32_t, num_classes);

	SG_INFO("%d test examples\n", num_lab);
	CSignal::clear_cancel();

	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i, 0, num_lab);

		// lhs idx 0..n-1 and rhs idx i
		distances_lhs(dists,0,train_labels.vlen-1,i);
		for (int32_t j=0; j<train_labels.vlen; j++)
			train_lab[j]=train_labels.vector[j];

		//sort the distance vector for test example i to all train examples;
		//train_lab[0..k-1] then holds the classes of the k nearest neighbours
		CMath::qsort_index(dists, train_lab, train_labels.vlen);

		//compute histogram of class outputs of the first k nearest neighbours
		for (int32_t j=0; j<num_classes; j++)
			classes[j]=0;

		for (int32_t j=0; j<m_k; j++)
		{
			classes[train_lab[j]]++;

			//choose the class that got 'outputted' most often
			//among the first j+1 neighbours
			int32_t out_idx=0;
			int32_t out_max=0;

			for (int32_t c=0; c<num_classes; c++)
			{
				if (out_max< classes[c])
				{
					out_idx= c;
					out_max= classes[c];
				}
			}
			output[j*num_lab+i]=out_idx+min_label;
		}
	}

	SG_FREE(dists);
	SG_FREE(train_lab);
	SG_FREE(classes);

	return SGMatrix<int32_t>(output,num_lab,m_k);
}

void CKNN::init_distance(CFeatures* data)
{
	if (!distance)
		SG_ERROR("No distance assigned!\n");
	CFeatures* lhs=distance->get_lhs();
	if (!lhs || !lhs->get_num_vectors())
	{
		SG_UNREF(lhs);
		SG_ERROR("No vectors on left hand side\n");
	}
	distance->init(lhs, data);
	SG_UNREF(lhs);
}

bool CKNN::load(FILE* srcfile)
{
	SG_SET_LOCALE_C;
	SG_RESET_LOCALE;
	return false;
}

bool CKNN::save(FILE* dstfile)
{
	SG_SET_LOCALE_C;
	SG_RESET_LOCALE;
	return false;
}

void CKNN::store_model_features()
{
	CFeatures* d_lhs=distance->get_lhs();
	CFeatures* d_rhs=distance->get_rhs();

	/* copy lhs of underlying distance */
	distance->init(d_lhs->duplicate(), d_rhs);

	SG_UNREF(d_lhs);
	SG_UNREF(d_rhs);
}
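
The listing above only defines the classifier. What follows is a minimal usage sketch, not part of KNN.cpp, assuming the Shogun 1.x C++ API as shipped around this release (init_shogun()/exit_shogun(), CSimpleFeatures<float64_t>, CEuclidianDistance, CLabels and the SGMatrix constructor used below); header paths and constructor signatures may differ in other versions.

#include <cstring>
#include <shogun/base/init.h>
#include <shogun/features/SimpleFeatures.h>
#include <shogun/features/Labels.h>
#include <shogun/distance/EuclidianDistance.h>
#include <shogun/classifier/KNN.h>

using namespace shogun;

int main()
{
	init_shogun();

	// toy data: 2-dimensional points, columns are examples;
	// two clusters around (0,0) and (10,10)
	float64_t train_vals[]={0,0, 0,1, 10,10, 10,11};
	float64_t test_vals[]={0,0.5, 10,10.5};
	float64_t* train_mat=SG_MALLOC(float64_t, 2*4);
	float64_t* test_mat=SG_MALLOC(float64_t, 2*2);
	memcpy(train_mat, train_vals, sizeof(train_vals));
	memcpy(test_mat, test_vals, sizeof(test_vals));

	CSimpleFeatures<float64_t>* feat_train=
		new CSimpleFeatures<float64_t>(SGMatrix<float64_t>(train_mat, 2, 4));
	CSimpleFeatures<float64_t>* feat_test=
		new CSimpleFeatures<float64_t>(SGMatrix<float64_t>(test_mat, 2, 2));

	// class 0 for the first two points, class 1 for the last two
	CLabels* lab_train=new CLabels(4);
	lab_train->set_label(0, 0);
	lab_train->set_label(1, 0);
	lab_train->set_label(2, 1);
	lab_train->set_label(3, 1);

	// 3-NN with Euclidean distance; train() caches the training labels
	CEuclidianDistance* dist=new CEuclidianDistance(feat_train, feat_train);
	CKNN* knn=new CKNN(3, dist, lab_train);
	SG_REF(knn);
	knn->train();

	// apply(data) re-initialises the distance with the test features on the rhs
	CLabels* pred=knn->apply(feat_test);
	for (int32_t i=0; i<pred->get_num_labels(); i++)
		SG_SPRINT("test example %d -> class %g\n", i, pred->get_label(i));

	SG_UNREF(pred);
	SG_UNREF(knn);
	exit_shogun();
	return 0;
}

Because CKNN::apply(CFeatures*) only re-initialises the left-hand side of the distance via init_distance(), the same distance object is reused for training and prediction; with k==1 the call is routed to the sort-free classify_NN() path.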