SHOGUN
v1.1.0
|
00001 /* 00002 * this program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2006-2009 Christian Gehl 00008 * Written (W) 2006-2009 Soeren Sonnenburg 00009 * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/lib/config.h> 00013 #include <shogun/lib/common.h> 00014 #include <shogun/io/SGIO.h> 00015 #include <shogun/io/File.h> 00016 #include <shogun/lib/Time.h> 00017 #include <shogun/base/Parallel.h> 00018 #include <shogun/base/Parameter.h> 00019 00020 #include <shogun/distance/Distance.h> 00021 #include <shogun/features/Features.h> 00022 00023 #include <string.h> 00024 #include <unistd.h> 00025 00026 #ifdef HAVE_PTHREAD 00027 #include <pthread.h> 00028 #endif 00029 00030 using namespace shogun; 00031 00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00033 struct DISTANCE_THREAD_PARAM 00034 { 00035 // CDistance instance used by thread to compute distance 00036 CDistance* distance; 00037 // distance matrix to store computed distances 00038 float64_t* distance_matrix; 00039 // starting index of the main loop 00040 int32_t idx_start; 00041 // end index of the main loop 00042 int32_t idx_stop; 00043 // step of the main loop 00044 int32_t idx_step; 00045 // number of lhs vectors 00046 int32_t lhs_vectors_number; 00047 // number of rhs vectors 00048 int32_t rhs_vectors_number; 00049 // whether matrix distance is symmetric 00050 bool symmetric; 00051 // chunking method 00052 bool chunk_by_lhs; 00053 }; 00054 #endif /* DOXYGEN_SHOULD_SKIP_THIS */ 00055 00056 CDistance::CDistance() : CSGObject() 00057 { 00058 init(); 00059 } 00060 00061 00062 CDistance::CDistance(CFeatures* p_lhs, CFeatures* p_rhs) : CSGObject() 00063 { 00064 init(); 00065 init(p_lhs, p_rhs); 00066 } 00067 00068 CDistance::~CDistance() 00069 { 00070 SG_FREE(precomputed_matrix); 00071 precomputed_matrix=NULL; 00072 00073 remove_lhs_and_rhs(); 00074 } 00075 00076 bool CDistance::init(CFeatures* l, CFeatures* r) 00077 { 00078 //make sure features were indeed supplied 00079 ASSERT(l); 00080 ASSERT(r); 00081 00082 //make sure features are compatible 00083 ASSERT(l->get_feature_class()==r->get_feature_class()); 00084 ASSERT(l->get_feature_type()==r->get_feature_type()); 00085 00086 //remove references to previous features 00087 remove_lhs_and_rhs(); 00088 00089 //increase reference counts 00090 SG_REF(l); 00091 SG_REF(r); 00092 00093 lhs=l; 00094 rhs=r; 00095 00096 SG_FREE(precomputed_matrix); 00097 precomputed_matrix=NULL ; 00098 00099 return true; 00100 } 00101 00102 void CDistance::load(CFile* loader) 00103 { 00104 SG_SET_LOCALE_C; 00105 SG_RESET_LOCALE; 00106 } 00107 00108 void CDistance::save(CFile* writer) 00109 { 00110 SG_SET_LOCALE_C; 00111 SG_RESET_LOCALE; 00112 } 00113 00114 void CDistance::remove_lhs_and_rhs() 00115 { 00116 SG_UNREF(rhs); 00117 rhs = NULL; 00118 00119 SG_UNREF(lhs); 00120 lhs = NULL; 00121 } 00122 00123 void CDistance::remove_lhs() 00124 { 00125 SG_UNREF(lhs); 00126 lhs = NULL; 00127 } 00128 00130 void CDistance::remove_rhs() 00131 { 00132 SG_UNREF(rhs); 00133 rhs = NULL; 00134 } 00135 00136 CFeatures* CDistance::replace_rhs(CFeatures* r) 00137 { 00138 //make sure features were indeed supplied 00139 ASSERT(r); 00140 00141 //make sure features are compatible 00142 ASSERT(lhs->get_feature_class()==r->get_feature_class()); 00143 ASSERT(lhs->get_feature_type()==r->get_feature_type()); 00144 00145 //remove references to previous rhs features 00146 CFeatures* tmp=rhs; 00147 00148 rhs=r; 00149 00150 SG_FREE(precomputed_matrix); 00151 precomputed_matrix=NULL ; 00152 00153 // return old features including reference count 00154 return tmp; 00155 } 00156 00157 void CDistance::do_precompute_matrix() 00158 { 00159 int32_t num_left=lhs->get_num_vectors(); 00160 int32_t num_right=rhs->get_num_vectors(); 00161 SG_INFO( "precomputing distance matrix (%ix%i)\n", num_left, num_right) ; 00162 00163 ASSERT(num_left==num_right); 00164 ASSERT(lhs==rhs); 00165 int32_t num=num_left; 00166 00167 SG_FREE(precomputed_matrix); 00168 precomputed_matrix=SG_MALLOC(float32_t, num*(num+1)/2); 00169 00170 for (int32_t i=0; i<num; i++) 00171 { 00172 SG_PROGRESS(i*i,0,num*num); 00173 for (int32_t j=0; j<=i; j++) 00174 precomputed_matrix[i*(i+1)/2+j] = compute(i,j) ; 00175 } 00176 00177 SG_PROGRESS(num*num,0,num*num); 00178 SG_DONE(); 00179 } 00180 00181 SGMatrix<float64_t> CDistance::get_distance_matrix() 00182 { 00183 int32_t m,n; 00184 float64_t* data=get_distance_matrix_real(m,n,NULL); 00185 return SGMatrix<float64_t>(data, m,n); 00186 } 00187 00188 float32_t* CDistance::get_distance_matrix_shortreal( 00189 int32_t &num_vec1, int32_t &num_vec2, float32_t* target) 00190 { 00191 float32_t* result = NULL; 00192 CFeatures* f1 = lhs; 00193 CFeatures* f2 = rhs; 00194 00195 if (f1 && f2) 00196 { 00197 if (target && (num_vec1!=f1->get_num_vectors() || num_vec2!=f2->get_num_vectors())) 00198 SG_ERROR("distance matrix does not fit into target\n"); 00199 00200 num_vec1=f1->get_num_vectors(); 00201 num_vec2=f2->get_num_vectors(); 00202 int64_t total_num=num_vec1*num_vec2; 00203 int32_t num_done=0; 00204 00205 SG_DEBUG("returning distance matrix of size %dx%d\n", num_vec1, num_vec2); 00206 00207 if (target) 00208 result=target; 00209 else 00210 result=SG_MALLOC(float32_t, total_num); 00211 00212 if ( (f1 == f2) && (num_vec1 == num_vec2) ) 00213 { 00214 for (int32_t i=0; i<num_vec1; i++) 00215 { 00216 for (int32_t j=i; j<num_vec1; j++) 00217 { 00218 float64_t v=distance(i,j); 00219 00220 result[i+j*num_vec1]=v; 00221 result[j+i*num_vec1]=v; 00222 00223 if (num_done%100000) 00224 SG_PROGRESS(num_done, 0, total_num-1); 00225 00226 if (i!=j) 00227 num_done+=2; 00228 else 00229 num_done+=1; 00230 } 00231 } 00232 } 00233 else 00234 { 00235 for (int32_t i=0; i<num_vec1; i++) 00236 { 00237 for (int32_t j=0; j<num_vec2; j++) 00238 { 00239 result[i+j*num_vec1]=distance(i,j) ; 00240 00241 if (num_done%100000) 00242 SG_PROGRESS(num_done, 0, total_num-1); 00243 00244 num_done++; 00245 } 00246 } 00247 } 00248 00249 SG_DONE(); 00250 } 00251 else 00252 SG_ERROR("no features assigned to distance\n"); 00253 00254 return result; 00255 } 00256 00257 float64_t* CDistance::get_distance_matrix_real( 00258 int32_t &lhs_vectors_number, int32_t &rhs_vectors_number, float64_t* target) 00259 { 00260 float64_t* distance_matrix = NULL; 00261 CFeatures* lhs_features = lhs; 00262 CFeatures* rhs_features = rhs; 00263 00264 // check for errors 00265 if (!lhs_features || !rhs_features) 00266 SG_ERROR("No features assigned to the distance.\n"); 00267 00268 if (target && 00269 (lhs_vectors_number!=lhs_features->get_num_vectors() || 00270 rhs_vectors_number!=rhs_features->get_num_vectors())) 00271 SG_ERROR("Distance matrix does not fit into the given target.\n"); 00272 00273 // init numbers of vectors and total number of distances 00274 lhs_vectors_number = lhs_features->get_num_vectors(); 00275 rhs_vectors_number = rhs_features->get_num_vectors(); 00276 int64_t total_distances_number = lhs_vectors_number*rhs_vectors_number; 00277 00278 SG_DEBUG("Calculating distance matrix of size %dx%d.\n", lhs_vectors_number, rhs_vectors_number); 00279 00280 // redirect to target or allocate memory 00281 if (target) 00282 distance_matrix = target; 00283 else 00284 distance_matrix = SG_MALLOC(float64_t, total_distances_number); 00285 00286 // check if we're computing symmetric distance_matrix 00287 bool symmetric = (lhs_features==rhs_features) || (lhs_vectors_number==rhs_vectors_number); 00288 // select chunking method according to greatest dimension 00289 bool chunk_by_lhs = (lhs_vectors_number >= rhs_vectors_number); 00290 00291 #ifdef HAVE_PTHREAD 00292 // init parallel to work 00293 int32_t num_threads = parallel->get_num_threads(); 00294 ASSERT(num_threads>0); 00295 pthread_t* threads = SG_MALLOC(pthread_t, num_threads); 00296 DISTANCE_THREAD_PARAM* parameters = SG_MALLOC(DISTANCE_THREAD_PARAM,num_threads); 00297 pthread_attr_t attr; 00298 pthread_attr_init(&attr); 00299 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 00300 // run threads 00301 for (int32_t t=0; t<num_threads; t++) 00302 { 00303 parameters[t].idx_start = t; 00304 parameters[t].idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number; 00305 parameters[t].idx_step = num_threads; 00306 parameters[t].distance_matrix = distance_matrix; 00307 parameters[t].symmetric = symmetric; 00308 parameters[t].lhs_vectors_number = lhs_vectors_number; 00309 parameters[t].rhs_vectors_number = rhs_vectors_number; 00310 parameters[t].chunk_by_lhs = chunk_by_lhs; 00311 parameters[t].distance = this; 00312 pthread_create(&threads[t], &attr, run_distance_thread, (void*)¶meters[t]); 00313 } 00314 // join, i.e. wait threads for finish 00315 for (int32_t t=0; t<num_threads; t++) 00316 { 00317 pthread_join(threads[t], NULL); 00318 } 00319 // cleanup 00320 pthread_attr_destroy(&attr); 00321 SG_FREE(parameters); 00322 SG_FREE(threads); 00323 #else 00324 // init one-threaded parameters 00325 DISTANCE_THREAD_PARAM single_thread_param; 00326 single_thread_param.idx_start = 0; 00327 single_thread_param.idx_stop = chunk_by_lhs ? lhs_vectors_number : rhs_vectors_number; 00328 single_thread_param.idx_step = 1; 00329 single_thread_param.distance_matrix = distance_matrix; 00330 single_thread_param.symmetric = symmetric; 00331 single_thread_param.lhs_vectors_number = lhs_vectors_number; 00332 single_thread_param.rhs_vectors_number = rhs_vectors_number; 00333 single_thread_param.chunk_by_lhs = chunk_by_lhs; 00334 single_thread_param.distance = this; 00335 // run thread 00336 run_distance_thread((void*)&single_thread_param); 00337 #endif 00338 00339 return distance_matrix; 00340 } 00341 00342 void CDistance::init() 00343 { 00344 precomputed_matrix = NULL; 00345 precompute_matrix = false; 00346 lhs = NULL; 00347 rhs = NULL; 00348 00349 m_parameters->add((CSGObject**) &lhs, "lhs", 00350 "Feature vectors to occur on left hand side."); 00351 m_parameters->add((CSGObject**) &rhs, "rhs", 00352 "Feature vectors to occur on right hand side."); 00353 } 00354 00355 void* CDistance::run_distance_thread(void* p) 00356 { 00357 DISTANCE_THREAD_PARAM* parameters = (DISTANCE_THREAD_PARAM*)p; 00358 float64_t* distance_matrix = parameters->distance_matrix; 00359 CDistance* distance = parameters->distance; 00360 int32_t idx_start = parameters->idx_start; 00361 int32_t idx_stop = parameters->idx_stop; 00362 int32_t idx_step = parameters->idx_step; 00363 int32_t lhs_vectors_number = parameters->lhs_vectors_number; 00364 int32_t rhs_vectors_number = parameters->rhs_vectors_number; 00365 bool symmetric = parameters->symmetric; 00366 bool chunk_by_lhs = parameters->chunk_by_lhs; 00367 00368 if (symmetric) 00369 { 00370 for (int32_t i=idx_start; i<idx_stop; i+=idx_step) 00371 { 00372 for (int32_t j=i; j<rhs_vectors_number; j++) 00373 { 00374 float64_t ij_distance = distance->compute(i,j); 00375 distance_matrix[i*rhs_vectors_number+j] = ij_distance; 00376 distance_matrix[j*rhs_vectors_number+i] = ij_distance; 00377 } 00378 } 00379 } 00380 else 00381 { 00382 if (chunk_by_lhs) 00383 { 00384 for (int32_t i=idx_start; i<idx_stop; i+=idx_step) 00385 { 00386 for (int32_t j=0; j<rhs_vectors_number; j++) 00387 { 00388 distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j); 00389 } 00390 } 00391 } 00392 else 00393 { 00394 for (int32_t j=idx_start; j<idx_stop; j+=idx_step) 00395 { 00396 for (int32_t i=0; i<lhs_vectors_number; i++) 00397 { 00398 distance_matrix[j*lhs_vectors_number+i] = distance->compute(i,j); 00399 } 00400 } 00401 } 00402 } 00403 00404 return NULL; 00405 }