SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 // Math.cpp: implementation of the CMath class. 00013 // 00015 00016 00017 #include <shogun/lib/common.h> 00018 #include <shogun/mathematics/Math.h> 00019 #include <shogun/mathematics/lapack.h> 00020 #include <shogun/io/SGIO.h> 00021 00022 #include <stdio.h> 00023 #include <stdlib.h> 00024 #include <math.h> 00025 00026 using namespace shogun; 00027 00028 #ifdef USE_LOGCACHE 00029 #ifdef USE_HMMDEBUG 00030 #define MAX_LOG_TABLE_SIZE 10*1024*1024 00031 #define LOG_TABLE_PRECISION 1e-6 00032 #else //USE_HMMDEBUG 00033 #define MAX_LOG_TABLE_SIZE 123*1024*1024 00034 #define LOG_TABLE_PRECISION 1e-15 00035 #endif //USE_HMMDEBUG 00036 int32_t CMath::LOGACCURACY = 0; // 100000 steps per integer 00037 #endif // USE_LOGCACHE 00038 00039 int32_t CMath::LOGRANGE = 0; // range for logtable: log(1+exp(x)) -25 <= x <= 0 00040 00041 const float64_t CMath::INFTY = -log(0.0); // infinity 00042 const float64_t CMath::ALMOST_INFTY = +1e+20; //a large number 00043 const float64_t CMath::ALMOST_NEG_INFTY = -1000; 00044 const float64_t CMath::PI=PI; 00045 const float64_t CMath::MACHINE_EPSILON=5E-16; 00046 const float64_t CMath::MAX_REAL_NUMBER=1E300; 00047 const float64_t CMath::MIN_REAL_NUMBER=1E-300; 00048 00049 #ifdef USE_LOGCACHE 00050 float64_t* CMath::logtable = NULL; 00051 #endif 00052 char* CMath::rand_state = NULL; 00053 uint32_t CMath::seed = 0; 00054 00055 CMath::CMath() 00056 : CSGObject() 00057 { 00058 CMath::rand_state=SG_MALLOC(char, RNG_SEED_SIZE); 00059 init_random(); 00060 00061 #ifdef USE_LOGCACHE 00062 LOGRANGE=CMath::determine_logrange(); 00063 LOGACCURACY=CMath::determine_logaccuracy(LOGRANGE); 00064 CMath::logtable=SG_MALLOC(float64_t, LOGRANGE*LOGACCURACY); 00065 init_log_table(); 00066 #else 00067 int32_t i=0; 00068 while ((float64_t)log(1+((float64_t)exp(-float64_t(i))))) 00069 i++; 00070 00071 LOGRANGE=i; 00072 #endif 00073 } 00074 00075 CMath::~CMath() 00076 { 00077 SG_FREE(CMath::rand_state); 00078 CMath::rand_state=NULL; 00079 #ifdef USE_LOGCACHE 00080 SG_FREE(CMath::logtable); 00081 CMath::logtable=NULL; 00082 #endif 00083 } 00084 00085 namespace shogun 00086 { 00087 template <> 00088 void CMath::display_vector(const uint8_t* vector, int32_t n, const char* name) 00089 { 00090 ASSERT(n>=0); 00091 SG_SPRINT("%s=[", name); 00092 for (int32_t i=0; i<n; i++) 00093 SG_SPRINT("%d%s", vector[i], i==n-1? "" : ","); 00094 SG_SPRINT("]\n"); 00095 } 00096 00097 template <> 00098 void CMath::display_vector(const int32_t* vector, int32_t n, const char* name) 00099 { 00100 ASSERT(n>=0); 00101 SG_SPRINT("%s=[", name); 00102 for (int32_t i=0; i<n; i++) 00103 SG_SPRINT("%d%s", vector[i], i==n-1? "" : ","); 00104 SG_SPRINT("]\n"); 00105 } 00106 00107 template <> 00108 void CMath::display_vector(const int64_t* vector, int32_t n, const char* name) 00109 { 00110 ASSERT(n>=0); 00111 SG_SPRINT("%s=[", name); 00112 for (int32_t i=0; i<n; i++) 00113 SG_SPRINT("%lld%s", vector[i], i==n-1? "" : ","); 00114 SG_SPRINT("]\n"); 00115 } 00116 00117 template <> 00118 void CMath::display_vector(const uint64_t* vector, int32_t n, const char* name) 00119 { 00120 ASSERT(n>=0); 00121 SG_SPRINT("%s=[", name); 00122 for (int32_t i=0; i<n; i++) 00123 SG_SPRINT("%llu%s", vector[i], i==n-1? "" : ","); 00124 SG_SPRINT("]\n"); 00125 } 00126 00127 template <> 00128 void CMath::display_vector(const float32_t* vector, int32_t n, const char* name) 00129 { 00130 ASSERT(n>=0); 00131 SG_SPRINT("%s=[", name); 00132 for (int32_t i=0; i<n; i++) 00133 SG_SPRINT("%g%s", vector[i], i==n-1? "" : ","); 00134 SG_SPRINT("]\n"); 00135 } 00136 00137 template <> 00138 void CMath::display_vector(const float64_t* vector, int32_t n, const char* name) 00139 { 00140 ASSERT(n>=0); 00141 SG_SPRINT("%s=[", name); 00142 for (int32_t i=0; i<n; i++) 00143 SG_SPRINT("%.18g%s", vector[i], i==n-1? "" : ","); 00144 SG_SPRINT("]\n"); 00145 } 00146 00147 template <> 00148 void CMath::display_vector(const floatmax_t* vector, int32_t n, const char* name) 00149 { 00150 ASSERT(n>=0); 00151 SG_SPRINT("%s=[", name); 00152 for (int32_t i=0; i<n; i++) 00153 SG_SPRINT("%.36Lg%s", (long double) vector[i], i==n-1? "" : ","); 00154 SG_SPRINT("]\n"); 00155 } 00156 00157 template <> 00158 void CMath::display_matrix( 00159 const int32_t* matrix, int32_t rows, int32_t cols, const char* name) 00160 { 00161 ASSERT(rows>=0 && cols>=0); 00162 SG_SPRINT("%s=[\n", name); 00163 for (int32_t i=0; i<rows; i++) 00164 { 00165 SG_SPRINT("["); 00166 for (int32_t j=0; j<cols; j++) 00167 SG_SPRINT("\t%d%s", matrix[j*rows+i], 00168 j==cols-1? "" : ","); 00169 SG_SPRINT("]%s\n", i==rows-1? "" : ","); 00170 } 00171 SG_SPRINT("]\n"); 00172 } 00173 00174 template <> 00175 void CMath::display_matrix( 00176 const float64_t* matrix, int32_t rows, int32_t cols, const char* name) 00177 { 00178 ASSERT(rows>=0 && cols>=0); 00179 SG_SPRINT("%s=[\n", name); 00180 for (int32_t i=0; i<rows; i++) 00181 { 00182 SG_SPRINT("["); 00183 for (int32_t j=0; j<cols; j++) 00184 SG_SPRINT("\t%.18g%s", (double) matrix[j*rows+i], 00185 j==cols-1? "" : ","); 00186 SG_SPRINT("]%s\n", i==rows-1? "" : ","); 00187 } 00188 SG_SPRINT("]\n"); 00189 } 00190 00191 template <> 00192 void CMath::display_matrix( 00193 const float32_t* matrix, int32_t rows, int32_t cols, const char* name) 00194 { 00195 ASSERT(rows>=0 && cols>=0); 00196 SG_SPRINT("%s=[\n", name); 00197 for (int32_t i=0; i<rows; i++) 00198 { 00199 SG_SPRINT("["); 00200 for (int32_t j=0; j<cols; j++) 00201 SG_SPRINT("\t%.18g%s", (float) matrix[j*rows+i], 00202 j==cols-1? "" : ","); 00203 SG_SPRINT("]%s\n", i==rows-1? "" : ","); 00204 } 00205 SG_SPRINT("]\n"); 00206 } 00207 00208 } 00209 00210 SGVector<float64_t> CMath::fishers_exact_test_for_multiple_2x3_tables(SGMatrix<float64_t> tables) 00211 { 00212 SGMatrix<float64_t> table(NULL,2,3,false); 00213 int32_t len=tables.num_cols/3; 00214 00215 SGVector<float64_t> v(len); 00216 for (int32_t i=0; i<len; i++) 00217 { 00218 table.matrix=&tables.matrix[2*3*i]; 00219 v.vector[i]=fishers_exact_test_for_2x3_table(table); 00220 } 00221 return v; 00222 } 00223 00224 float64_t CMath::fishers_exact_test_for_2x3_table(SGMatrix<float64_t> table) 00225 { 00226 ASSERT(table.num_rows==2); 00227 ASSERT(table.num_cols==3); 00228 00229 int32_t m_len=3+2; 00230 float64_t* m=SG_MALLOC(float64_t, 3+2); 00231 m[0]=table.matrix[0]+table.matrix[2]+table.matrix[4]; 00232 m[1]=table.matrix[1]+table.matrix[3]+table.matrix[5]; 00233 m[2]=table.matrix[0]+table.matrix[1]; 00234 m[3]=table.matrix[2]+table.matrix[3]; 00235 m[4]=table.matrix[4]+table.matrix[5]; 00236 00237 float64_t n = CMath::sum(m, m_len) / 2.0; 00238 int32_t x_len=2*3* CMath::sq(CMath::max(m, m_len)); 00239 float64_t* x = SG_MALLOC(float64_t, x_len); 00240 CMath::fill_vector(x, x_len, 0.0); 00241 00242 float64_t log_nom=0.0; 00243 for (int32_t i=0; i<3+2; i++) 00244 log_nom+=CMath::lgamma(m[i]+1); 00245 log_nom-=CMath::lgamma(n+1.0); 00246 00247 float64_t log_denomf=0; 00248 floatmax_t log_denom=0; 00249 00250 for (int32_t i=0; i<3*2; i++) 00251 { 00252 log_denom+=CMath::lgammal((floatmax_t) table.matrix[i]+1); 00253 log_denomf+=gamma(table.matrix[i]+1); 00254 } 00255 00256 floatmax_t prob_table_log=log_nom - log_denom; 00257 00258 int32_t dim1 = CMath::min(m[0], m[2]); 00259 00260 //traverse all possible tables with given m 00261 int32_t counter = 0; 00262 for (int32_t k=0; k<=dim1; k++) 00263 { 00264 for (int32_t l=CMath::max(0.0,m[0]-m[4]-k); l<=CMath::min(m[0]-k, m[3]); l++) 00265 { 00266 x[0 + 0*2 + counter*2*3] = k; 00267 x[0 + 1*2 + counter*2*3] = l; 00268 x[0 + 2*2 + counter*2*3] = m[0] - x[0 + 0*2 + counter*2*3] - x[0 + 1*2 + counter*2*3]; 00269 x[1 + 0*2 + counter*2*3] = m[2] - x[0 + 0*2 + counter*2*3]; 00270 x[1 + 1*2 + counter*2*3] = m[3] - x[0 + 1*2 + counter*2*3]; 00271 x[1 + 2*2 + counter*2*3] = m[4] - x[0 + 2*2 + counter*2*3]; 00272 00273 counter++; 00274 } 00275 } 00276 00277 //#define DEBUG_FISHER_TABLE 00278 #ifdef DEBUG_FISHER_TABLE 00279 SG_SPRINT("counter=%d\n", counter); 00280 SG_SPRINT("dim1=%d\n", dim1); 00281 SG_SPRINT("l=%g...%g\n", CMath::max(0.0,m[0]-m[4]-0), CMath::min(m[0]-0, m[3])); 00282 SG_SPRINT("n=%g\n", n); 00283 SG_SPRINT("prob_table_log=%.18Lg\n", prob_table_log); 00284 SG_SPRINT("log_denomf=%.18g\n", log_denomf); 00285 SG_SPRINT("log_denom=%.18Lg\n", log_denom); 00286 SG_SPRINT("log_nom=%.18g\n", log_nom); 00287 display_vector(m, m_len, "marginals"); 00288 display_vector(x, 2*3*counter, "x"); 00289 #endif // DEBUG_FISHER_TABLE 00290 00291 00292 floatmax_t* log_denom_vec=SG_MALLOC(floatmax_t, counter); 00293 CMath::fill_vector(log_denom_vec, counter, (floatmax_t) 0.0); 00294 00295 for (int32_t k=0; k<counter; k++) 00296 { 00297 for (int32_t j=0; j<3; j++) 00298 { 00299 for (int32_t i=0; i<2; i++) 00300 log_denom_vec[k]+=CMath::lgammal(x[i + j*2 + k*2*3]+1.0); 00301 } 00302 } 00303 00304 for (int32_t i=0; i<counter; i++) 00305 log_denom_vec[i]=log_nom-log_denom_vec[i]; 00306 00307 #ifdef DEBUG_FISHER_TABLE 00308 display_vector(log_denom_vec, counter, "log_denom_vec"); 00309 #endif // DEBUG_FISHER_TABLE 00310 00311 00312 float64_t nonrand_p=-CMath::INFTY; 00313 for (int32_t i=0; i<counter; i++) 00314 { 00315 if (log_denom_vec[i]<=prob_table_log) 00316 nonrand_p=CMath::logarithmic_sum(nonrand_p, log_denom_vec[i]); 00317 } 00318 00319 #ifdef DEBUG_FISHER_TABLE 00320 SG_SPRINT("nonrand_p=%.18g\n", nonrand_p); 00321 SG_SPRINT("exp_nonrand_p=%.18g\n", CMath::exp(nonrand_p)); 00322 #endif // DEBUG_FISHER_TABLE 00323 00324 nonrand_p=CMath::exp(nonrand_p); 00325 00326 SG_FREE(log_denom_vec); 00327 SG_FREE(x); 00328 SG_FREE(m); 00329 00330 return nonrand_p; 00331 } 00332 00333 00334 #ifdef USE_LOGCACHE 00335 int32_t CMath::determine_logrange() 00336 { 00337 int32_t i; 00338 float64_t acc=0; 00339 for (i=0; i<50; i++) 00340 { 00341 acc=((float64_t)log(1+((float64_t)exp(-float64_t(i))))); 00342 if (acc<=(float64_t)LOG_TABLE_PRECISION) 00343 break; 00344 } 00345 00346 SG_SINFO( "determined range for x in table log(1+exp(-x)) is:%d (error:%G)\n",i,acc); 00347 return i; 00348 } 00349 00350 int32_t CMath::determine_logaccuracy(int32_t range) 00351 { 00352 range=MAX_LOG_TABLE_SIZE/range/((int)sizeof(float64_t)); 00353 SG_SINFO( "determined accuracy for x in table log(1+exp(-x)) is:%d (error:%G)\n",range,1.0/(double) range); 00354 return range; 00355 } 00356 00357 //init log table of form log(1+exp(x)) 00358 void CMath::init_log_table() 00359 { 00360 for (int32_t i=0; i< LOGACCURACY*LOGRANGE; i++) 00361 logtable[i]=log(1+exp(float64_t(-i)/float64_t(LOGACCURACY))); 00362 } 00363 #endif 00364 00365 void CMath::sort(int32_t *a, int32_t cols, int32_t sort_col) 00366 { 00367 int32_t changed=1; 00368 if (a[0]==-1) return ; 00369 while (changed) 00370 { 00371 changed=0; int32_t i=0 ; 00372 while ((a[(i+1)*cols]!=-1) && (a[(i+1)*cols+1]!=-1)) // to be sure 00373 { 00374 if (a[i*cols+sort_col]>a[(i+1)*cols+sort_col]) 00375 { 00376 for (int32_t j=0; j<cols; j++) 00377 CMath::swap(a[i*cols+j],a[(i+1)*cols+j]) ; 00378 changed=1 ; 00379 } ; 00380 i++ ; 00381 } ; 00382 } ; 00383 } 00384 00385 void CMath::sort(float64_t *a, int32_t* idx, int32_t N) 00386 { 00387 int32_t changed=1; 00388 while (changed) 00389 { 00390 changed=0; 00391 for (int32_t i=0; i<N-1; i++) 00392 { 00393 if (a[i]>a[i+1]) 00394 { 00395 swap(a[i],a[i+1]) ; 00396 swap(idx[i],idx[i+1]) ; 00397 changed=1 ; 00398 } ; 00399 } ; 00400 } ; 00401 00402 } 00403 00404 float64_t CMath::Align( 00405 char* seq1, char* seq2, int32_t l1, int32_t l2, float64_t gapCost) 00406 { 00407 float64_t actCost=0 ; 00408 int32_t i1, i2 ; 00409 float64_t* const gapCosts1 = SG_MALLOC(float64_t, l1 ); 00410 float64_t* const gapCosts2 = SG_MALLOC(float64_t, l2 ); 00411 float64_t* costs2_0 = SG_MALLOC(float64_t, l2 + 1 ); 00412 float64_t* costs2_1 = SG_MALLOC(float64_t, l2 + 1 ); 00413 00414 // initialize borders 00415 for( i1 = 0; i1 < l1; ++i1 ) { 00416 gapCosts1[ i1 ] = gapCost * i1; 00417 } 00418 costs2_1[ 0 ] = 0; 00419 for( i2 = 0; i2 < l2; ++i2 ) { 00420 gapCosts2[ i2 ] = gapCost * i2; 00421 costs2_1[ i2+1 ] = costs2_1[ i2 ] + gapCosts2[ i2 ]; 00422 } 00423 // compute alignment 00424 for( i1 = 0; i1 < l1; ++i1 ) { 00425 swap( costs2_0, costs2_1 ); 00426 actCost = costs2_0[ 0 ] + gapCosts1[ i1 ]; 00427 costs2_1[ 0 ] = actCost; 00428 for( i2 = 0; i2 < l2; ++i2 ) { 00429 const float64_t actMatch = costs2_0[ i2 ] + ( seq1[i1] == seq2[i2] ); 00430 const float64_t actGap1 = costs2_0[ i2+1 ] + gapCosts1[ i1 ]; 00431 const float64_t actGap2 = actCost + gapCosts2[ i2 ]; 00432 const float64_t actGap = min( actGap1, actGap2 ); 00433 actCost = min( actMatch, actGap ); 00434 costs2_1[ i2+1 ] = actCost; 00435 } 00436 } 00437 00438 delete [] gapCosts1; 00439 delete [] gapCosts2; 00440 delete [] costs2_0; 00441 delete [] costs2_1; 00442 00443 // return the final cost 00444 return actCost; 00445 } 00446 00447 float64_t CMath::mutual_info(float64_t* p1, float64_t* p2, int32_t len) 00448 { 00449 double e=0; 00450 00451 for (int32_t i=0; i<len; i++) 00452 for (int32_t j=0; j<len; j++) 00453 e+=exp(p2[j*len+i])*(p2[j*len+i]-p1[i]-p1[j]); 00454 00455 return (float64_t) e; 00456 } 00457 00458 float64_t CMath::relative_entropy(float64_t* p, float64_t* q, int32_t len) 00459 { 00460 double e=0; 00461 00462 for (int32_t i=0; i<len; i++) 00463 e+=exp(p[i])*(p[i]-q[i]); 00464 00465 return (float64_t) e; 00466 } 00467 00468 float64_t CMath::entropy(float64_t* p, int32_t len) 00469 { 00470 double e=0; 00471 00472 for (int32_t i=0; i<len; i++) 00473 e-=exp(p[i])*p[i]; 00474 00475 return (float64_t) e; 00476 } 00477 00478 00479 //Howto construct the pseudo inverse (from "The Matrix Cookbook") 00480 // 00481 //Assume A does not have full rank, i.e. A is n \times m and rank(A) = r < min(n;m). 00482 // 00483 //The matrix A+ known as the pseudo inverse is unique and does always exist. 00484 // 00485 //The pseudo inverse A+ can be constructed from the singular value 00486 //decomposition A = UDV^T , by A^+ = V(D+)U^T. 00487 00488 #ifdef HAVE_LAPACK 00489 float64_t* CMath::pinv( 00490 float64_t* matrix, int32_t rows, int32_t cols, float64_t* target) 00491 { 00492 if (!target) 00493 target=SG_MALLOC(float64_t, rows*cols); 00494 00495 char jobu='A'; 00496 char jobvt='A'; 00497 int m=rows; /* for calling external lib */ 00498 int n=cols; /* for calling external lib */ 00499 int lda=m; /* for calling external lib */ 00500 int ldu=m; /* for calling external lib */ 00501 int ldvt=n; /* for calling external lib */ 00502 int info=-1; /* for calling external lib */ 00503 int32_t lsize=CMath::min((int32_t) m, (int32_t) n); 00504 double* s=SG_MALLOC(double, lsize); 00505 double* u=SG_MALLOC(double, m*m); 00506 double* vt=SG_MALLOC(double, n*n); 00507 00508 wrap_dgesvd(jobu, jobvt, m, n, matrix, lda, s, u, ldu, vt, ldvt, &info); 00509 ASSERT(info==0); 00510 00511 for (int32_t i=0; i<n; i++) 00512 { 00513 for (int32_t j=0; j<lsize; j++) 00514 vt[i*n+j]=vt[i*n+j]/s[j]; 00515 } 00516 00517 cblas_dgemm(CblasColMajor, CblasTrans, CblasTrans, m, n, m, 1.0, vt, ldvt, u, ldu, 0, target, m); 00518 00519 SG_FREE(u); 00520 SG_FREE(vt); 00521 SG_FREE(s); 00522 00523 return target; 00524 } 00525 #endif