SHOGUN
v1.1.0
|
00001 /* 00002 * Compute the local alignment kernel 00003 * 00004 * Largely based on LAkernel.c (version 0.3) 00005 * 00006 * Copyright 2003 Jean-Philippe Vert 00007 * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo 00008 * 00009 * Shogun specific adjustments Written (W) 2007-2008,2010 Soeren Sonnenburg 00010 * 00011 * Reference: 00012 * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology 00013 * detection using string alignment kernels", Bioinformatics, 00014 * vol.20, p.1682-1689, 2004. 00015 * 00016 * This program is free software; you can redistribute it and/or modify 00017 * it under the terms of the GNU General Public License as published by 00018 * the Free Software Foundation; either version 3 of the License, or 00019 * (at your option) any later version. 00020 */ 00021 00022 #include <stdlib.h> 00023 #include <stdio.h> 00024 #include <math.h> 00025 #include <ctype.h> 00026 #include <string.h> 00027 #include <shogun/kernel/LocalAlignmentStringKernel.h> 00028 00029 using namespace shogun; 00030 00031 /****************/ 00032 /* The alphabet */ 00033 /****************/ 00034 00035 #define NAA 20 /* Number of amino-acids */ 00036 #define NLET 26 /* Number of letters in the alphabet */ 00037 const char* CLocalAlignmentStringKernel::aaList= "ARNDCQEGHILKMFPSTWYV"; /* The list of amino acids */ 00038 00039 /*****************/ 00040 /* SW parameters */ 00041 /*****************/ 00042 00043 /* mutation matrix */ 00044 const int32_t CLocalAlignmentStringKernel::blosum[] = { 00045 6, 00046 -2, 8, 00047 -2, -1, 9, 00048 -3, -2, 2, 9, 00049 -1, -5, -4, -5, 13, 00050 -1, 1, 0, 0, -4, 8, 00051 -1, 0, 0, 2, -5, 3, 7, 00052 0, -3, -1, -2, -4, -3, -3, 8, 00053 -2, 0, 1, -2, -4, 1, 0, -3, 11, 00054 -2, -5, -5, -5, -2, -4, -5, -6, -5, 6, 00055 -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6, 00056 -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7, 00057 -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8, 00058 -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9, 00059 -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11, 00060 2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6, 00061 0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7, 00062 -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16, 00063 -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10, 00064 0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6}; 00065 00066 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */ 00067 #define BINDEX(i,j) (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2)) 00068 00069 /********************* 00070 * Kernel parameters * 00071 *********************/ 00072 00073 #define SCALING 0.1 /* Factor to scale all SW parameters */ 00074 00075 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */ 00076 /* If x=log(a) and y=log(b), compute log(a+b) : */ 00077 /* 00078 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y)))) 00079 */ 00080 00081 #define LOGP(x,y) LogSum(x,y) 00082 00083 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */ 00084 /* 00085 #define LOGP(x,y) (((x)>(y))?(x):(y)) 00086 */ 00087 00088 /* Usefule constants */ 00089 #define LOG0 -10000 /* log(0) */ 00090 #define INTSCALE 1000.0 /* critical for speed and precise computation*/ 00091 00092 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL]; 00093 00094 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size) 00095 : CStringKernel<char>(size) 00096 { 00097 init(); 00098 init_static_variables(); 00099 } 00100 00101 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel( 00102 CStringFeatures<char>* l, CStringFeatures<char>* r, 00103 float64_t opening, float64_t extension) 00104 : CStringKernel<char>() 00105 { 00106 init(); 00107 m_opening=opening; 00108 m_extension=extension; 00109 init_static_variables(); 00110 init(l, r); 00111 } 00112 00113 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel() 00114 { 00115 cleanup(); 00116 } 00117 00118 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r) 00119 { 00120 CStringKernel<char>::init(l, r); 00121 initialized = true; 00122 return init_normalizer(); 00123 } 00124 00125 void CLocalAlignmentStringKernel::cleanup() 00126 { 00127 SG_FREE(scaled_blosum); 00128 scaled_blosum=NULL; 00129 00130 SG_FREE(isAA); 00131 isAA=NULL; 00132 SG_FREE(aaIndex); 00133 aaIndex=NULL; 00134 00135 CKernel::cleanup(); 00136 } 00137 00138 /* LogSum - default log funciotion. fast, but not exact */ 00139 /* LogSum2 - precise, but slow. Note that these two functions need different figure types */ 00140 00141 void CLocalAlignmentStringKernel::init_logsum(){ 00142 int32_t i; 00143 for (i = 0; i < LOGSUM_TBL; i++) 00144 logsum_lookup[i] = (int32_t) (INTSCALE* 00145 (log(1.+exp( (float32_t) -i/INTSCALE)))); 00146 } 00147 00148 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2) 00149 { 00150 int32_t diff; 00151 static int32_t firsttime=1; 00152 00153 if (firsttime) 00154 { 00155 init_logsum(); 00156 firsttime =0; 00157 } 00158 00159 diff=p1-p2; 00160 if (diff>=LOGSUM_TBL) return p1; 00161 else if (diff<=-LOGSUM_TBL) return p2; 00162 else if (diff>0) return p1+logsum_lookup[diff]; 00163 else return p2+logsum_lookup[-diff]; 00164 } 00165 00166 00167 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2) 00168 { 00169 if (p1 > p2) 00170 return (p1-p2 > 50.) ? p1 : p1 + log(1. + exp(p2-p1)); 00171 else 00172 return (p2-p1 > 50.) ? p2 : p2 + log(1. + exp(p1-p2)); 00173 } 00174 00175 00176 void CLocalAlignmentStringKernel::init_static_variables() 00177 /* Initialize all static variables. This function should be called once before computing the first pair HMM score */ 00178 { 00179 register int32_t i; 00180 00181 /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */ 00182 if ((aaIndex=(int32_t *)calloc(NLET,sizeof(int32_t))) == NULL) 00183 SG_ERROR("run out o memory"); 00184 for (i=0;i<NAA;i++) 00185 aaIndex[aaList[i]-'A']=i; 00186 00187 /* Initialization of the array which indicates whether a char is an amino-acid */ 00188 if ((isAA=(int32_t *)calloc(256,sizeof(int32_t))) == NULL) 00189 SG_ERROR("run out of memory"); 00190 for (i=0;i<NAA;i++) 00191 isAA[(int32_t)aaList[i]]=1; 00192 00193 /* Scale the blossum matrix */ 00194 for (i=0 ; i<NAA*(NAA+1)/2; i++) 00195 scaled_blosum[i] = (int32_t) floor(blosum[i]*SCALING*INTSCALE); 00196 00197 00198 /* Scale of gap penalties */ 00199 m_opening = (int32_t) floor(m_opening * SCALING*INTSCALE); 00200 m_extension = (int32_t) floor(m_extension * SCALING*INTSCALE); 00201 } 00202 00203 00204 00205 /* Implementation of the 00206 * convolution kernel which generalizes the Smith-Waterman algorithm 00207 */ 00208 float64_t CLocalAlignmentStringKernel::LAkernelcompute( 00209 int32_t* aaX, int32_t* aaY, /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the position of the amino-acid in the variable 'aaList') */ 00210 int32_t nX, int32_t nY /* the lengths of both sequences */) 00211 { 00212 register int32_t 00213 i,j, /* loop indexes */ 00214 cur, old, /* to indicate the array to use (0 or 1) */ 00215 curpos, frompos; /* position in an array */ 00216 00217 int32_t 00218 *logX, /* arrays to store the log-values of each state */ 00219 *logY, 00220 *logM, 00221 *logX2, 00222 *logY2, 00223 00224 aux , aux2;/* , aux3 , aux4 , aux5;*/ 00225 int32_t 00226 cl; /* length of a column for the dynamic programming */ 00227 00228 /* 00229 printf("now computing pairHMM between %d and %d:\n",nX,nY); 00230 for (i=0;i<nX;printf("%d ",aaX[i++])); 00231 printf("\n and \n"); 00232 for (i=0;i<nY;printf("%d ",aaY[i++])); 00233 printf("\n"); 00234 */ 00235 00236 /* Initialization of the arrays */ 00237 /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */ 00238 cl = nY+1; /* each column stores the positions in the aaY sequence, plus a position at zero */ 00239 00240 logM=SG_CALLOC(int32_t, 2*cl); 00241 logX=SG_CALLOC(int32_t, 2*cl); 00242 logY=SG_CALLOC(int32_t, 2*cl); 00243 logX2=SG_CALLOC(int32_t, 2*cl); 00244 logY2=SG_CALLOC(int32_t, 2*cl); 00245 00246 /************************************************/ 00247 /* First iteration : initialization of column 0 */ 00248 /************************************************/ 00249 /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */ 00250 00251 for (j=0;j<cl;j++) { 00252 logM[j]=LOG0; 00253 logX[j]=LOG0; 00254 logY[j]=LOG0; 00255 logX2[j]=LOG0; 00256 logY2[j]=LOG0; 00257 00258 } 00259 00260 /* Update column order */ 00261 cur = 1; /* Indexes [0..cl-1] are used to process the next column */ 00262 old = 0; /* Indexes [cl..2*cl-1] were used for column 0 */ 00263 00264 00265 /************************************************/ 00266 /* Next iterations : processing columns 1 .. nX */ 00267 /************************************************/ 00268 00269 /* Main loop to vary the position in aaX : i=1..nX */ 00270 for (i=1;i<=nX;i++) { 00271 00272 /* Special update for positions (i=1..nX,j=0) */ 00273 curpos = cur*cl; /* index of the state (i,0) */ 00274 logM[curpos] = LOG0; 00275 logX[curpos] = LOG0; 00276 logY[curpos] = LOG0; 00277 logX2[curpos] = LOG0; 00278 logY2[curpos] = LOG0; 00279 00280 /* Secondary loop to vary the position in aaY : j=1..nY */ 00281 for (j=1;j<=nY;j++) { 00282 00283 curpos = cur*cl + j; /* index of the state (i,j) */ 00284 00285 /* Update for states which emit X only */ 00286 /***************************************/ 00287 00288 frompos = old*cl + j; /* index of the state (i-1,j) */ 00289 00290 /* State RX */ 00291 logX[curpos] = LOGP( - m_opening + logM[frompos] , - m_extension + logX[frompos] ); 00292 /* printf("%.5f\n",logX[curpos]);*/ 00293 /* printf("%.5f\n",logX_B[curpos]);*/ 00294 /* State RX2 */ 00295 logX2[curpos] = LOGP( logM[frompos] , logX2[frompos] ); 00296 00297 /* Update for states which emit Y only */ 00298 /***************************************/ 00299 00300 frompos = cur*cl + j-1; /* index of the state (i,j-1) */ 00301 00302 /* State RY */ 00303 aux = LOGP( - m_opening + logM[frompos] , - m_extension + logY[frompos] ); 00304 logY[curpos] = LOGP( aux , - m_opening + logX[frompos] ); 00305 00306 /* State RY2 */ 00307 aux = LOGP( logM[frompos] , logY2[frompos] ); 00308 logY2[curpos] = LOGP( aux , logX2[frompos] ); 00309 00310 /* Update for states which emit X and Y */ 00311 /****************************************/ 00312 00313 frompos = old*cl + j-1; /* index of the state (i-1,j-1) */ 00314 00315 aux = LOGP( logX[frompos] , logY[frompos] ); 00316 aux2 = LOGP( 0 , logM[frompos] ); 00317 logM[curpos] = LOGP( aux , aux2 ) + scaled_blosum[ BINDEX( aaX[i-1] , aaY[j-1] ) ]; 00318 00319 /* printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]); 00320 */ 00321 00322 } /* end of j=1:nY loop */ 00323 00324 00325 /* Update the culumn order */ 00326 cur = 1-cur; 00327 old = 1-old; 00328 00329 } /* end of j=1:nX loop */ 00330 00331 00332 /* Termination */ 00333 /***************/ 00334 00335 curpos = old*cl + nY; /* index of the state (nX,nY) */ 00336 aux = LOGP( logX2[curpos] , logY2[curpos] ); 00337 aux2 = LOGP( 0 , logM[curpos] ); 00338 /* kernel_value = LOGP( aux , aux2 );*/ 00339 00340 /* Memory release */ 00341 SG_FREE(logM); 00342 SG_FREE(logX); 00343 SG_FREE(logY); 00344 SG_FREE(logX2); 00345 SG_FREE(logY2); 00346 00347 /* Return the logarithm of the kernel */ 00348 return (float32_t)LOGP(aux,aux2)/INTSCALE; 00349 } 00350 00351 /********************/ 00352 /* Public functions */ 00353 /********************/ 00354 00355 00356 /* Return the log-probability of two sequences x and y under a pair HMM model */ 00357 /* x and y are strings of aminoacid letters, e.g., "AABRS" */ 00358 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y) 00359 { 00360 int32_t *aax,*aay; /* to convert x and y into sequences of amino-acid indexes */ 00361 int32_t lx=0,ly=0; /* lengths of x and y */ 00362 int32_t i,j; 00363 00364 bool free_x, free_y; 00365 char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x); 00366 char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y); 00367 ASSERT(x && y); 00368 00369 if ((lx<1) || (ly<1)) 00370 SG_ERROR("empty chain"); 00371 00372 /* Create aax and aay */ 00373 00374 if ((aax=(int32_t *)calloc(lx,sizeof(int32_t))) == NULL) 00375 SG_ERROR("run out of memory"); 00376 if ((aay=(int32_t *)calloc(ly,sizeof(int32_t))) == NULL) 00377 SG_ERROR("run out of memory"); 00378 00379 /* Extract the characters corresponding to aminoacids and keep their indexes */ 00380 00381 j=0; 00382 for (i=0 ; i<lx ; i++) 00383 if (isAA[toupper(x[i])]) 00384 aax[j++] = aaIndex[toupper(x[i])-'A']; 00385 lx = j; 00386 j=0; 00387 for (i=0 ; i<ly ; i++) 00388 if (isAA[toupper(y[i])]) 00389 aay[j++] = aaIndex[toupper(y[i])-'A']; 00390 ly = j; 00391 00392 00393 /* Compute the pair HMM score */ 00394 float64_t result=LAkernelcompute(aax,aay,lx,ly); 00395 00396 /* Release memory */ 00397 SG_FREE(aax); 00398 SG_FREE(aay); 00399 00400 ((CStringFeatures<char>*) lhs)->free_feature_vector(x, idx_x, free_x); 00401 ((CStringFeatures<char>*) rhs)->free_feature_vector(y, idx_y, free_y); 00402 00403 return result; 00404 } 00405 00406 void CLocalAlignmentStringKernel::init() 00407 { 00408 initialized = false; 00409 isAA = NULL; 00410 aaIndex = NULL; 00411 00412 m_opening = 10; 00413 m_extension = 2; 00414 00415 scaled_blosum=SG_CALLOC(int32_t, sizeof(blosum)); 00416 init_logsum(); 00417 00418 m_parameters->add(&initialized, "initialized", "if kernel is initalized"); 00419 m_parameters->add(&m_opening, "opening", "opening gap opening penalty"); 00420 m_parameters->add(&m_extension, "extension", "extension gap extension penalty"); 00421 }