SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Gunnar Raetsch 00008 * Written (W) 2007-2009 Soeren Sonnenburg 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/clustering/KMeans.h> 00013 #include <shogun/distance/Distance.h> 00014 #include <shogun/features/Labels.h> 00015 #include <shogun/features/SimpleFeatures.h> 00016 #include <shogun/mathematics/Math.h> 00017 #include <shogun/base/Parallel.h> 00018 00019 #ifdef HAVE_PTHREAD 00020 #include <pthread.h> 00021 #endif 00022 00023 #define MUSRECALC 00024 00025 #define PAR_THRESH 10 00026 00027 using namespace shogun; 00028 00029 CKMeans::CKMeans() 00030 : CDistanceMachine() 00031 { 00032 init(); 00033 } 00034 00035 CKMeans::CKMeans(int32_t k_, CDistance* d) 00036 : CDistanceMachine() 00037 { 00038 init(); 00039 k=k_; 00040 set_distance(d); 00041 } 00042 00043 CKMeans::~CKMeans() 00044 { 00045 R.destroy_vector(); 00046 } 00047 00048 bool CKMeans::train_machine(CFeatures* data) 00049 { 00050 ASSERT(distance); 00051 00052 if (data) 00053 distance->init(data, data); 00054 00055 ASSERT(distance->get_feature_type()==F_DREAL); 00056 00057 CSimpleFeatures<float64_t>* lhs= 00058 (CSimpleFeatures<float64_t>*)distance->get_lhs(); 00059 ASSERT(lhs); 00060 int32_t num=lhs->get_num_vectors(); 00061 SG_UNREF(lhs); 00062 00063 Weights=SGVector<float64_t>(num); 00064 for (int32_t i=0; i<num; i++) 00065 Weights.vector[i]=1.0; 00066 00067 clustknb(false, NULL); 00068 Weights.destroy_vector(); 00069 00070 return true; 00071 } 00072 00073 bool CKMeans::load(FILE* srcfile) 00074 { 00075 SG_SET_LOCALE_C; 00076 SG_RESET_LOCALE; 00077 return false; 00078 } 00079 00080 bool CKMeans::save(FILE* dstfile) 00081 { 00082 SG_SET_LOCALE_C; 00083 SG_RESET_LOCALE; 00084 return false; 00085 } 00086 00087 00088 void CKMeans::set_k(int32_t p_k) 00089 { 00090 ASSERT(p_k>0); 00091 this->k=p_k; 00092 } 00093 00094 int32_t CKMeans::get_k() 00095 { 00096 return k; 00097 } 00098 00099 void CKMeans::set_max_iter(int32_t iter) 00100 { 00101 ASSERT(iter>0); 00102 max_iter=iter; 00103 } 00104 00105 float64_t CKMeans::get_max_iter() 00106 { 00107 return max_iter; 00108 } 00109 00110 SGVector<float64_t> CKMeans::get_radiuses() 00111 { 00112 return R; 00113 } 00114 00115 SGMatrix<float64_t> CKMeans::get_cluster_centers() 00116 { 00117 if (!R.vector) 00118 return SGMatrix<float64_t>(); 00119 00120 CSimpleFeatures<float64_t>* lhs= 00121 (CSimpleFeatures<float64_t>*)distance->get_lhs(); 00122 SGMatrix<float64_t> centers=lhs->get_feature_matrix(); 00123 SG_UNREF(lhs); 00124 return centers; 00125 } 00126 00127 int32_t CKMeans::get_dimensions() 00128 { 00129 return dimensions; 00130 } 00131 00132 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00133 struct thread_data 00134 { 00135 float64_t* x; 00136 CSimpleFeatures<float64_t>* y; 00137 float64_t* z; 00138 int32_t n1, n2, m; 00139 int32_t js, je; /* defines the matrix stripe */ 00140 int32_t offs; 00141 }; 00142 #endif // DOXYGEN_SHOULD_SKIP_THIS 00143 00144 namespace shogun 00145 { 00146 void *sqdist_thread_func(void * P) 00147 { 00148 struct thread_data *TD=(struct thread_data*) P; 00149 float64_t* x=TD->x; 00150 CSimpleFeatures<float64_t>* y=TD->y; 00151 float64_t* z=TD->z; 00152 int32_t n1=TD->n1, 00153 m=TD->m, 00154 js=TD->js, 00155 je=TD->je, 00156 offs=TD->offs, 00157 j,i,k; 00158 00159 for (j=js; j<je; j++) 00160 { 00161 int32_t vlen=0; 00162 bool vfree=false; 00163 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree); 00164 00165 for (i=0; i<n1; i++) 00166 { 00167 float64_t sum=0; 00168 for (k=0; k<m; k++) 00169 sum = sum + CMath::sq(x[i*m + k] - vec[k]); 00170 z[j*n1 + i] = sum; 00171 } 00172 00173 y->free_feature_vector(vec, j, vfree); 00174 } 00175 return NULL; 00176 } 00177 } 00178 00179 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start) 00180 { 00181 ASSERT(distance && distance->get_feature_type()==F_DREAL); 00182 CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs(); 00183 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0); 00184 00185 int32_t XSize=lhs->get_num_vectors(); 00186 dimensions=lhs->get_num_features(); 00187 int32_t i, changed=1; 00188 const int32_t XDimk=dimensions*k; 00189 int32_t iter=0; 00190 00191 R.destroy_vector(); 00192 R=SGVector<float64_t>(k); 00193 00194 mus=SGMatrix<float64_t>(dimensions, k); 00195 00196 int32_t *ClList=SG_CALLOC(int32_t, XSize); 00197 float64_t *weights_set=SG_CALLOC(float64_t, k); 00198 float64_t *dists=SG_CALLOC(float64_t, k*XSize); 00199 00201 CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0); 00202 CFeatures* rhs_cache = distance->replace_rhs(rhs_mus); 00203 00204 int32_t vlen=0; 00205 bool vfree=false; 00206 float64_t* vec=NULL; 00207 00208 /* ClList=zeros(XSize,1) ; */ 00209 memset(ClList, 0, sizeof(int32_t)*XSize); 00210 /* weights_set=zeros(k,1) ; */ 00211 memset(weights_set, 0, sizeof(float64_t)*k); 00212 00213 /* cluster_centers=zeros(dimensions, k) ; */ 00214 memset(mus.matrix, 0, sizeof(float64_t)*XDimk); 00215 00216 if (!use_old_mus) 00217 { 00218 for (i=0; i<XSize; i++) 00219 { 00220 const int32_t Cl=CMath::random(0, k-1); 00221 int32_t j; 00222 float64_t weight=Weights.vector[i]; 00223 00224 weights_set[Cl]+=weight; 00225 ClList[i]=Cl; 00226 00227 vec=lhs->get_feature_vector(i, vlen, vfree); 00228 00229 for (j=0; j<dimensions; j++) 00230 mus.matrix[Cl*dimensions+j] += weight*vec[j]; 00231 00232 lhs->free_feature_vector(vec, i, vfree); 00233 } 00234 for (i=0; i<k; i++) 00235 { 00236 int32_t j; 00237 00238 if (weights_set[i]!=0.0) 00239 for (j=0; j<dimensions; j++) 00240 mus.matrix[i*dimensions+j] /= weights_set[i]; 00241 } 00242 } 00243 else 00244 { 00245 ASSERT(mus_start); 00246 00248 rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k)); 00249 float64_t* p_dists=dists; 00250 00251 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k) 00252 distances_rhs(p_dists,0,k,idx); 00253 p_dists=NULL; 00254 00255 for (i=0; i<XSize; i++) 00256 { 00257 float64_t mini=dists[i*k]; 00258 int32_t Cl = 0, j; 00259 00260 for (j=1; j<k; j++) 00261 { 00262 if (dists[i*k+j]<mini) 00263 { 00264 Cl=j; 00265 mini=dists[i*k+j]; 00266 } 00267 } 00268 ClList[i]=Cl; 00269 } 00270 00271 /* Compute the sum of all points belonging to a cluster 00272 * and count the points */ 00273 for (i=0; i<XSize; i++) 00274 { 00275 const int32_t Cl = ClList[i]; 00276 float64_t weight=Weights.vector[i]; 00277 weights_set[Cl]+=weight; 00278 #ifndef MUSRECALC 00279 vec=lhs->get_feature_vector(i, vlen, vfree); 00280 00281 for (j=0; j<dimensions; j++) 00282 mus.matrix[Cl*dimensions+j] += weight*vec[j]; 00283 00284 lhs->free_feature_vector(vec, i, vfree); 00285 #endif 00286 } 00287 #ifndef MUSRECALC 00288 /* normalization to get the mean */ 00289 for (i=0; i<k; i++) 00290 { 00291 if (weights_set[i]!=0.0) 00292 { 00293 int32_t j; 00294 for (j=0; j<dimensions; j++) 00295 mus.matrix[i*dimensions+j] /= weights_set[i]; 00296 } 00297 } 00298 #endif 00299 } 00300 00301 00302 00303 while (changed && (iter<max_iter)) 00304 { 00305 iter++; 00306 if (iter==max_iter-1) 00307 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1); 00308 00309 if (iter%1000 == 0) 00310 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed); 00311 changed=0; 00312 00313 #ifdef MUSRECALC 00314 /* mus=zeros(dimensions, k) ; */ 00315 memset(mus.matrix, 0, sizeof(float64_t)*XDimk); 00316 00317 for (i=0; i<XSize; i++) 00318 { 00319 int32_t j; 00320 int32_t Cl=ClList[i]; 00321 float64_t weight=Weights.vector[i]; 00322 00323 vec=lhs->get_feature_vector(i, vlen, vfree); 00324 00325 for (j=0; j<dimensions; j++) 00326 mus.matrix[Cl*dimensions+j] += weight*vec[j]; 00327 00328 lhs->free_feature_vector(vec, i, vfree); 00329 } 00330 for (i=0; i<k; i++) 00331 { 00332 int32_t j; 00333 00334 if (weights_set[i]!=0.0) 00335 for (j=0; j<dimensions; j++) 00336 mus.matrix[i*dimensions+j] /= weights_set[i]; 00337 } 00338 #endif 00339 00340 rhs_mus->copy_feature_matrix(mus); 00341 00342 for (i=0; i<XSize; i++) 00343 { 00344 /* ks=ceil(rand(1,XSize)*XSize) ; */ 00345 const int32_t Pat= CMath::random(0, XSize-1); 00346 const int32_t ClList_Pat=ClList[Pat]; 00347 int32_t imini, j; 00348 float64_t mini, weight; 00349 00350 weight=Weights.vector[Pat]; 00351 00352 /* compute the distance of this point to all centers */ 00353 for(int32_t idx_k=0;idx_k<k;idx_k++) 00354 dists[idx_k]=distance->distance(Pat,idx_k); 00355 00356 /* [mini,imini]=min(dists(:,i)) ; */ 00357 imini=0 ; mini=dists[0]; 00358 for (j=1; j<k; j++) 00359 if (dists[j]<mini) 00360 { 00361 mini=dists[j]; 00362 imini=j; 00363 } 00364 00365 if (imini!=ClList_Pat) 00366 { 00367 changed= changed + 1; 00368 00369 /* weights_set(imini) = weights_set(imini) + weight ; */ 00370 weights_set[imini]+= weight; 00371 /* weights_set(j) = weights_set(j) - weight ; */ 00372 weights_set[ClList_Pat]-= weight; 00373 00374 vec=lhs->get_feature_vector(Pat, vlen, vfree); 00375 00376 for (j=0; j<dimensions; j++) 00377 { 00378 mus.matrix[imini*dimensions+j]-=(vec[j] 00379 -mus.matrix[imini*dimensions+j]) 00380 *(weight/weights_set[imini]); 00381 } 00382 00383 lhs->free_feature_vector(vec, Pat, vfree); 00384 00385 /* mu_new = mu_old - (x - mu_old)/(n-1) */ 00386 /* if weights_set(j)~=0 */ 00387 if (weights_set[ClList_Pat]!=0.0) 00388 { 00389 vec=lhs->get_feature_vector(Pat, vlen, vfree); 00390 00391 for (j=0; j<dimensions; j++) 00392 { 00393 mus.matrix[ClList_Pat*dimensions+j]-= 00394 (vec[j] 00395 -mus.matrix[ClList_Pat 00396 *dimensions+j]) 00397 *(weight/weights_set[ClList_Pat]); 00398 } 00399 lhs->free_feature_vector(vec, Pat, vfree); 00400 } 00401 else 00402 /* mus(:,j)=zeros(dimensions,1) ; */ 00403 for (j=0; j<dimensions; j++) 00404 mus.matrix[ClList_Pat*dimensions+j]=0; 00405 00406 /* ClList(i)= imini ; */ 00407 ClList[Pat] = imini; 00408 } 00409 } 00410 } 00411 00412 /* compute the ,,variances'' of the clusters */ 00413 for (i=0; i<k; i++) 00414 { 00415 float64_t rmin1=0; 00416 float64_t rmin2=0; 00417 00418 bool first_round=true; 00419 00420 for (int32_t j=0; j<k; j++) 00421 { 00422 if (j!=i) 00423 { 00424 int32_t l; 00425 float64_t dist = 0; 00426 00427 for (l=0; l<dimensions; l++) 00428 { 00429 dist+=CMath::sq( 00430 mus.matrix[i*dimensions+l] 00431 -mus.matrix[j*dimensions+l]); 00432 } 00433 00434 if (first_round) 00435 { 00436 rmin1=dist; 00437 rmin2=dist; 00438 first_round=false; 00439 } 00440 else 00441 { 00442 if ((dist<rmin2) && (dist>=rmin1)) 00443 rmin2=dist; 00444 00445 if (dist<rmin1) 00446 { 00447 rmin2=rmin1; 00448 rmin1=dist; 00449 } 00450 } 00451 } 00452 } 00453 00454 R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2)); 00455 } 00456 distance->replace_rhs(rhs_cache); 00457 delete rhs_mus; 00458 SG_FREE(ClList); 00459 SG_FREE(weights_set); 00460 SG_FREE(dists); 00461 SG_UNREF(lhs); 00462 } 00463 00464 void CKMeans::store_model_features() 00465 { 00466 /* set lhs of underlying distance to cluster centers */ 00467 CSimpleFeatures<float64_t>* cluster_centers=new CSimpleFeatures<float64_t>( 00468 mus); 00469 00470 /* reset mus variable to avoid interference with above features */ 00471 mus.do_free=false; 00472 mus.free_matrix(); 00473 00474 /* store cluster centers in lhs of distance variable */ 00475 CFeatures* rhs=distance->get_rhs(); 00476 distance->init(cluster_centers, rhs); 00477 SG_UNREF(rhs); 00478 } 00479 00480 void CKMeans::init() 00481 { 00482 max_iter=10000; 00483 k=3; 00484 dimensions=0; 00485 00486 m_parameters->add(&max_iter, "max_iter", "Maximum number of iterations"); 00487 m_parameters->add(&k, "k", "Parameter k"); 00488 m_parameters->add(&dimensions, "dimensions", "Dimensions of data"); 00489 m_parameters->add(&R, "R", "Cluster radiuses"); 00490 } 00491