SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2011 Sergey Lisitsyn 00008 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society 00009 */ 00010 00011 #include <shogun/converter/MultidimensionalScaling.h> 00012 #ifdef HAVE_LAPACK 00013 #include <shogun/converter/EmbeddingConverter.h> 00014 #include <shogun/mathematics/lapack.h> 00015 #include <shogun/mathematics/arpack.h> 00016 #include <shogun/distance/CustomDistance.h> 00017 #include <shogun/lib/common.h> 00018 #include <shogun/mathematics/Math.h> 00019 #include <shogun/io/SGIO.h> 00020 #include <shogun/distance/EuclidianDistance.h> 00021 00022 #ifdef HAVE_PTHREAD 00023 #include <pthread.h> 00024 #endif 00025 00026 using namespace shogun; 00027 00028 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00029 struct TRIANGULATION_THREAD_PARAM 00030 { 00032 int32_t idx_start; 00034 int32_t idx_stop; 00036 int32_t idx_step; 00038 int32_t lmk_N; 00040 int32_t total_N; 00042 int32_t m_target_dim; 00044 float64_t* current_dist_to_lmks; 00046 float64_t* lmk_feature_matrix; 00048 float64_t* new_feature_matrix; 00050 const float64_t* distance_matrix; 00052 const float64_t* mean_sq_dist_vector; 00054 const int32_t* lmk_idxs; 00056 const bool* to_process; 00057 }; 00058 #endif 00059 00060 CMultidimensionalScaling::CMultidimensionalScaling() : CEmbeddingConverter() 00061 { 00062 m_eigenvalues = SGVector<float64_t>(NULL,0,false); 00063 m_landmark_number = 3; 00064 m_landmark = false; 00065 00066 init(); 00067 } 00068 00069 void CMultidimensionalScaling::init() 00070 { 00071 m_parameters->add(&m_eigenvalues, "eigenvalues", "eigenvalues of last embedding"); 00072 m_parameters->add(&m_landmark, "landmark", "indicates if landmark approximation should be used"); 00073 m_parameters->add(&m_landmark_number, "landmark number", "the number of landmarks for approximation"); 00074 } 00075 00076 CMultidimensionalScaling::~CMultidimensionalScaling() 00077 { 00078 m_eigenvalues.destroy_vector(); 00079 } 00080 00081 SGVector<float64_t> CMultidimensionalScaling::get_eigenvalues() const 00082 { 00083 return m_eigenvalues; 00084 } 00085 00086 void CMultidimensionalScaling::set_landmark_number(int32_t num) 00087 { 00088 if (num<3) 00089 SG_ERROR("Number of landmarks should be greater than 3 to make triangulation possible while %d given.", 00090 num); 00091 m_landmark_number = num; 00092 } 00093 00094 int32_t CMultidimensionalScaling::get_landmark_number() const 00095 { 00096 return m_landmark_number; 00097 } 00098 00099 void CMultidimensionalScaling::set_landmark(bool landmark) 00100 { 00101 m_landmark = landmark; 00102 } 00103 00104 bool CMultidimensionalScaling::get_landmark() const 00105 { 00106 return m_landmark; 00107 } 00108 00109 const char* CMultidimensionalScaling::get_name() const 00110 { 00111 return "MultidimensionalScaling"; 00112 }; 00113 00114 CSimpleFeatures<float64_t>* CMultidimensionalScaling::embed_distance(CDistance* distance) 00115 { 00116 ASSERT(distance); 00117 00118 // compute feature_matrix by landmark or classic embedding of distance matrix 00119 SGMatrix<float64_t> distance_matrix = process_distance_matrix(distance->get_distance_matrix()); 00120 SGMatrix<float64_t> feature_matrix; 00121 if (m_landmark) 00122 feature_matrix = landmark_embedding(distance_matrix); 00123 else 00124 feature_matrix = classic_embedding(distance_matrix); 00125 00126 distance_matrix.destroy_matrix(); 00127 00128 return new CSimpleFeatures<float64_t>(feature_matrix); 00129 } 00130 00131 SGMatrix<float64_t> CMultidimensionalScaling::process_distance_matrix(SGMatrix<float64_t> distance_matrix) 00132 { 00133 return distance_matrix; 00134 } 00135 00136 CFeatures* CMultidimensionalScaling::apply(CFeatures* features) 00137 { 00138 SG_REF(features); 00139 ASSERT(m_distance); 00140 00141 m_distance->init(features,features); 00142 CSimpleFeatures<float64_t>* embedding = embed_distance(m_distance); 00143 m_distance->remove_lhs_and_rhs(); 00144 00145 SG_UNREF(features); 00146 return (CFeatures*)embedding; 00147 } 00148 00149 SGMatrix<float64_t> CMultidimensionalScaling::classic_embedding(SGMatrix<float64_t> distance_matrix) 00150 { 00151 ASSERT(m_target_dim>0); 00152 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows); 00153 int32_t N = distance_matrix.num_cols; 00154 00155 // loop variables 00156 int32_t i,j; 00157 00158 // double center distance_matrix 00159 float64_t dsq; 00160 for (i=0; i<N; i++) 00161 { 00162 for (j=i; j<N; j++) 00163 { 00164 dsq = CMath::sq(distance_matrix[i*N+j]); 00165 distance_matrix[i*N+j] = dsq; 00166 distance_matrix[j*N+i] = dsq; 00167 } 00168 } 00169 CMath::center_matrix(distance_matrix.matrix,N,N); 00170 for (i=0; i<N; i++) 00171 { 00172 distance_matrix[i*N+i] *= -0.5; 00173 for (j=i+1; j<N; j++) 00174 { 00175 distance_matrix[i*N+j] *= -0.5; 00176 distance_matrix[j*N+i] *= -0.5; 00177 } 00178 } 00179 00180 // feature matrix representing given distance 00181 float64_t* replace_feature_matrix = SG_MALLOC(float64_t, N*m_target_dim); 00182 00183 // status of eigenproblem to be solved 00184 int eigenproblem_status = 0; 00185 #ifdef HAVE_ARPACK 00186 // using ARPACK 00187 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, m_target_dim); 00188 // solve eigenproblem with ARPACK (faster) 00189 arpack_dsxupd(distance_matrix.matrix,NULL,false,N,m_target_dim,"LM",false,1,false,0.0,0.0, 00190 eigenvalues_vector,replace_feature_matrix,eigenproblem_status); 00191 // check for failure 00192 ASSERT(eigenproblem_status == 0); 00193 // reverse eigenvectors order 00194 float64_t tmp; 00195 for (j=0; j<N; j++) 00196 { 00197 for (i=0; i<m_target_dim/2; i++) 00198 { 00199 tmp = replace_feature_matrix[j*m_target_dim+i]; 00200 replace_feature_matrix[j*m_target_dim+i] = 00201 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)]; 00202 replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)] = tmp; 00203 } 00204 } 00205 // reverse eigenvalues order 00206 for (i=0; i<m_target_dim/2; i++) 00207 { 00208 tmp = eigenvalues_vector[i]; 00209 eigenvalues_vector[i] = eigenvalues_vector[m_target_dim-i-1]; 00210 eigenvalues_vector[m_target_dim-i-1] = tmp; 00211 } 00212 00213 // finally construct embedding 00214 for (i=0; i<m_target_dim; i++) 00215 { 00216 for (j=0; j<N; j++) 00217 replace_feature_matrix[j*m_target_dim+i] *= 00218 CMath::sqrt(eigenvalues_vector[i]); 00219 } 00220 00221 // set eigenvalues vector 00222 m_eigenvalues.destroy_vector(); 00223 m_eigenvalues = SGVector<float64_t>(eigenvalues_vector,m_target_dim,true); 00224 #else /* not HAVE_ARPACK */ 00225 // using LAPACK 00226 float64_t* eigenvalues_vector = SG_MALLOC(float64_t, N); 00227 float64_t* eigenvectors = SG_MALLOC(float64_t, m_target_dim*N); 00228 // solve eigenproblem with LAPACK 00229 wrap_dsyevr('V','U',N,distance_matrix.matrix,N,N-m_target_dim+1,N,eigenvalues_vector,eigenvectors,&eigenproblem_status); 00230 // check for failure 00231 ASSERT(eigenproblem_status==0); 00232 00233 // set eigenvalues vector 00234 m_eigenvalues.destroy_vector(); 00235 m_eigenvalues = SGVector<float64_t>(m_target_dim); 00236 m_eigenvalues.do_free = false; 00237 00238 // fill eigenvalues vector in backwards order 00239 for (i=0; i<m_target_dim; i++) 00240 m_eigenvalues.vector[i] = eigenvalues_vector[m_target_dim-i-1]; 00241 00242 SG_FREE(eigenvalues_vector); 00243 00244 // construct embedding 00245 for (i=0; i<m_target_dim; i++) 00246 { 00247 for (j=0; j<N; j++) 00248 { 00249 replace_feature_matrix[j*m_target_dim+i] = 00250 eigenvectors[(m_target_dim-i-1)*N+j] * CMath::sqrt(m_eigenvalues.vector[i]); 00251 } 00252 } 00253 SG_FREE(eigenvectors); 00254 #endif /* HAVE_ARPACK else */ 00255 00256 // warn user if there are negative or zero eigenvalues 00257 for (i=0; i<m_eigenvalues.vlen; i++) 00258 { 00259 if (m_eigenvalues.vector[i]<=0.0) 00260 { 00261 SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong", 00262 i, m_eigenvalues.vlen-1); 00263 break; 00264 } 00265 } 00266 00267 return SGMatrix<float64_t>(replace_feature_matrix,m_target_dim,N); 00268 } 00269 00270 SGMatrix<float64_t> CMultidimensionalScaling::landmark_embedding(SGMatrix<float64_t> distance_matrix) 00271 { 00272 ASSERT(m_target_dim>0); 00273 ASSERT(distance_matrix.num_cols==distance_matrix.num_rows); 00274 int32_t lmk_N = m_landmark_number; 00275 int32_t i,j,t; 00276 int32_t total_N = distance_matrix.num_cols; 00277 if (lmk_N<3) 00278 { 00279 SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n", 00280 lmk_N); 00281 } 00282 if (lmk_N>total_N) 00283 { 00284 SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n", 00285 lmk_N, total_N); 00286 } 00287 00288 // get landmark indexes with random permutation 00289 SGVector<int32_t> lmk_idxs = shuffle(lmk_N,total_N); 00290 // compute distances between landmarks 00291 float64_t* lmk_dist_matrix = SG_MALLOC(float64_t, lmk_N*lmk_N); 00292 for (i=0; i<lmk_N; i++) 00293 { 00294 for (j=0; j<lmk_N; j++) 00295 lmk_dist_matrix[i*lmk_N+j] = 00296 distance_matrix[lmk_idxs.vector[i]*total_N+lmk_idxs.vector[j]]; 00297 } 00298 00299 // get landmarks embedding 00300 SGMatrix<float64_t> lmk_dist_sgmatrix(lmk_dist_matrix,lmk_N,lmk_N); 00301 // compute mean vector of squared distances 00302 float64_t* mean_sq_dist_vector = SG_CALLOC(float64_t, lmk_N); 00303 for (i=0; i<lmk_N; i++) 00304 { 00305 for (j=0; j<lmk_N; j++) 00306 mean_sq_dist_vector[i] += CMath::sq(lmk_dist_matrix[i*lmk_N+j]); 00307 00308 mean_sq_dist_vector[i] /= lmk_N; 00309 } 00310 SGMatrix<float64_t> lmk_feature_matrix = classic_embedding(lmk_dist_sgmatrix); 00311 00312 lmk_dist_sgmatrix.destroy_matrix(); 00313 00314 // construct new feature matrix 00315 float64_t* new_feature_matrix = SG_CALLOC(float64_t, m_target_dim*total_N); 00316 00317 // fill new feature matrix with embedded landmarks 00318 for (i=0; i<lmk_N; i++) 00319 { 00320 for (j=0; j<m_target_dim; j++) 00321 new_feature_matrix[lmk_idxs.vector[i]*m_target_dim+j] = 00322 lmk_feature_matrix[i*m_target_dim+j]; 00323 } 00324 00325 // get exactly defined pseudoinverse of landmarks feature matrix 00326 ASSERT(m_eigenvalues.vector && m_eigenvalues.vlen == m_target_dim); 00327 for (i=0; i<lmk_N; i++) 00328 { 00329 for (j=0; j<m_target_dim; j++) 00330 lmk_feature_matrix[i*m_target_dim+j] /= m_eigenvalues.vector[j]; 00331 } 00332 00333 00334 // set to_process els true if should be processed 00335 bool* to_process = SG_MALLOC(bool, total_N); 00336 for (j=0; j<total_N; j++) 00337 to_process[j] = true; 00338 for (j=0; j<lmk_N; j++) 00339 to_process[lmk_idxs.vector[j]] = false; 00340 00341 // get embedding for non-landmark vectors 00342 #ifdef HAVE_PTHREAD 00343 int32_t num_threads = parallel->get_num_threads(); 00344 ASSERT(num_threads>0); 00345 // allocate threads and it's parameters 00346 pthread_t* threads = SG_MALLOC(pthread_t, num_threads); 00347 TRIANGULATION_THREAD_PARAM* parameters = SG_MALLOC(TRIANGULATION_THREAD_PARAM, num_threads); 00348 pthread_attr_t attr; 00349 pthread_attr_init(&attr); 00350 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 00351 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N*num_threads); 00352 // run threads 00353 for (t=0; t<num_threads; t++) 00354 { 00355 parameters[t].idx_start = t; 00356 parameters[t].idx_stop = total_N; 00357 parameters[t].idx_step = num_threads; 00358 parameters[t].lmk_N = lmk_N; 00359 parameters[t].total_N = total_N; 00360 parameters[t].m_target_dim = m_target_dim; 00361 parameters[t].current_dist_to_lmks = current_dist_to_lmks+t*lmk_N; 00362 parameters[t].distance_matrix = distance_matrix.matrix; 00363 parameters[t].mean_sq_dist_vector = mean_sq_dist_vector; 00364 parameters[t].lmk_idxs = lmk_idxs.vector; 00365 parameters[t].lmk_feature_matrix = lmk_feature_matrix.matrix; 00366 parameters[t].new_feature_matrix = new_feature_matrix; 00367 parameters[t].to_process = to_process; 00368 pthread_create(&threads[t], &attr, run_triangulation_thread, (void*)¶meters[t]); 00369 } 00370 // join threads 00371 for (t=0; t<num_threads; t++) 00372 pthread_join(threads[t], NULL); 00373 pthread_attr_destroy(&attr); 00374 SG_FREE(parameters); 00375 SG_FREE(threads); 00376 #else 00377 // run single 'thread' 00378 float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N); 00379 TRIANGULATION_THREAD_PARAM single_thread_param; 00380 single_thread_param.idx_start = 0; 00381 single_thread_param.idx_stop = total_N; 00382 single_thread_param.idx_step = 1; 00383 single_thread_param.lmk_N = lmk_N; 00384 single_thread_param.total_N = total_N; 00385 single_thread_param.m_target_dim = m_target_dim; 00386 single_thread_param.current_dist_to_lmks = current_dist_to_lmks; 00387 single_thread_param.distance_matrix = distance_matrix.matrix; 00388 single_thread_param.mean_sq_dist_vector = mean_sq_dist_vector; 00389 single_thread_param.lmk_idxs = lmk_idxs.vector; 00390 single_thread_param.lmk_feature_matrix = lmk_feature_matrix.matrix; 00391 single_thread_param.new_feature_matrix = new_feature_matrix; 00392 single_thread_param.to_process = to_process; 00393 run_triangulation_thread((void*)&single_thread_param); 00394 #endif 00395 // cleanup 00396 lmk_feature_matrix.destroy_matrix(); 00397 SG_FREE(current_dist_to_lmks); 00398 lmk_idxs.destroy_vector(); 00399 SG_FREE(mean_sq_dist_vector); 00400 SG_FREE(to_process); 00401 lmk_idxs.destroy_vector(); 00402 00403 return SGMatrix<float64_t>(new_feature_matrix,m_target_dim,total_N); 00404 } 00405 00406 void* CMultidimensionalScaling::run_triangulation_thread(void* p) 00407 { 00408 TRIANGULATION_THREAD_PARAM* parameters = (TRIANGULATION_THREAD_PARAM*)p; 00409 int32_t idx_start = parameters->idx_start; 00410 int32_t idx_step = parameters->idx_step; 00411 int32_t idx_stop = parameters->idx_stop; 00412 const int32_t* lmk_idxs = parameters->lmk_idxs; 00413 const float64_t* distance_matrix = parameters->distance_matrix; 00414 const float64_t* mean_sq_dist_vector = parameters->mean_sq_dist_vector; 00415 float64_t* current_dist_to_lmks = parameters->current_dist_to_lmks; 00416 int32_t m_target_dim = parameters->m_target_dim; 00417 int32_t lmk_N = parameters->lmk_N; 00418 int32_t total_N = parameters->total_N; 00419 const bool* to_process = parameters->to_process; 00420 float64_t* lmk_feature_matrix = parameters->lmk_feature_matrix; 00421 float64_t* new_feature_matrix = parameters->new_feature_matrix; 00422 00423 int32_t i,k; 00424 for (i=idx_start; i<idx_stop; i+=idx_step) 00425 { 00426 // skip if landmark 00427 if (!to_process[i]) 00428 continue; 00429 00430 // compute difference from mean landmark distance vector 00431 for (k=0; k<lmk_N; k++) 00432 { 00433 current_dist_to_lmks[k] = 00434 CMath::sq(distance_matrix[i*total_N+lmk_idxs[k]]) - 00435 mean_sq_dist_vector[k]; 00436 } 00437 // compute embedding 00438 cblas_dgemv(CblasColMajor,CblasNoTrans, 00439 m_target_dim,lmk_N, 00440 -0.5,lmk_feature_matrix,m_target_dim, 00441 current_dist_to_lmks,1, 00442 0.0,(new_feature_matrix+i*m_target_dim),1); 00443 } 00444 return NULL; 00445 } 00446 00447 00448 SGVector<int32_t> CMultidimensionalScaling::shuffle(int32_t count, int32_t total_count) 00449 { 00450 int32_t* idxs = SG_MALLOC(int32_t, total_count); 00451 int32_t i,rnd; 00452 int32_t* permuted_idxs = SG_MALLOC(int32_t, count); 00453 00454 // reservoir sampling 00455 for (i=0; i<total_count; i++) 00456 idxs[i] = i; 00457 for (i=0; i<count; i++) 00458 permuted_idxs[i] = idxs[i]; 00459 for (i=count; i<total_count; i++) 00460 { 00461 rnd = CMath::random(1,i); 00462 if (rnd<count) 00463 permuted_idxs[rnd] = idxs[i]; 00464 } 00465 SG_FREE(idxs); 00466 00467 CMath::qsort(permuted_idxs,count); 00468 return SGVector<int32_t>(permuted_idxs, count); 00469 } 00470 00471 #endif /* HAVE_LAPACK */