SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <vector> 00013 00014 #include <shogun/lib/common.h> 00015 #include <shogun/io/SGIO.h> 00016 #include <shogun/lib/Signal.h> 00017 #include <shogun/lib/Trie.h> 00018 #include <shogun/base/Parallel.h> 00019 00020 #include <shogun/kernel/SpectrumMismatchRBFKernel.h> 00021 #include <shogun/features/Features.h> 00022 #include <shogun/features/StringFeatures.h> 00023 00024 00025 #include <vector> 00026 #include <string> 00027 00028 #include <assert.h> 00029 00030 #ifndef WIN32 00031 #include <pthread.h> 00032 #endif 00033 00034 using namespace shogun; 00035 00036 CSpectrumMismatchRBFKernel::CSpectrumMismatchRBFKernel() 00037 :CStringKernel<char>(0) 00038 { 00039 init(); 00040 register_params(); 00041 } 00042 00043 CSpectrumMismatchRBFKernel::CSpectrumMismatchRBFKernel (int32_t size, 00044 float64_t* AA_matrix_, int32_t nr, int32_t nc, 00045 int32_t degree_, int32_t max_mismatch_, float64_t width_) : CStringKernel<char>(size), 00046 alphabet(NULL), degree(degree_), max_mismatch(max_mismatch_), width(width_) 00047 { 00048 lhs=NULL; 00049 rhs=NULL; 00050 00051 target_letter_0=-1 ; 00052 00053 AA_matrix=NULL; 00054 set_AA_matrix(AA_matrix_, nr, nc); 00055 register_params(); 00056 } 00057 00058 CSpectrumMismatchRBFKernel::CSpectrumMismatchRBFKernel( 00059 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t nr, int32_t nc, int32_t degree_, int32_t max_mismatch_, float64_t width_) 00060 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), max_mismatch(max_mismatch_), width(width_) 00061 { 00062 target_letter_0=-1 ; 00063 00064 AA_matrix=NULL; 00065 set_AA_matrix(AA_matrix_, nr, nc); 00066 init(l, r); 00067 register_params(); 00068 } 00069 00070 CSpectrumMismatchRBFKernel::~CSpectrumMismatchRBFKernel() 00071 { 00072 cleanup(); 00073 SG_FREE(AA_matrix); 00074 } 00075 00076 00077 void CSpectrumMismatchRBFKernel::remove_lhs() 00078 { 00079 00080 CKernel::remove_lhs(); 00081 } 00082 00083 bool CSpectrumMismatchRBFKernel::init(CFeatures* l, CFeatures* r) 00084 { 00085 int32_t lhs_changed=(lhs!=l); 00086 int32_t rhs_changed=(rhs!=r); 00087 00088 CStringKernel<char>::init(l,r); 00089 00090 SG_DEBUG("lhs_changed: %i\n", lhs_changed); 00091 SG_DEBUG("rhs_changed: %i\n", rhs_changed); 00092 00093 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l; 00094 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r; 00095 00096 SG_UNREF(alphabet); 00097 alphabet=sf_l->get_alphabet(); 00098 CAlphabet* ralphabet=sf_r->get_alphabet(); 00099 00100 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA))) 00101 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION); 00102 00103 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet()); 00104 SG_UNREF(ralphabet); 00105 00106 compute_all() ; 00107 00108 return init_normalizer(); 00109 } 00110 00111 void CSpectrumMismatchRBFKernel::cleanup() 00112 { 00113 00114 SG_UNREF(alphabet); 00115 alphabet=NULL; 00116 00117 CKernel::cleanup(); 00118 } 00119 00120 float64_t CSpectrumMismatchRBFKernel::AA_helper(std::string &path, const char* joint_seq, unsigned int index) 00121 { 00122 float64_t diff=0.0 ; 00123 00124 for (unsigned int i=0; i<path.size(); i++) 00125 { 00126 if (path[i]!=joint_seq[index+i]) 00127 { 00128 diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ; 00129 diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ; 00130 diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ; 00131 } 00132 } 00133 00134 return exp( - diff/width) ; 00135 } 00136 00137 /* 00138 float64_t CSpectrumMismatchRBFKernel::compute_helper(const char* joint_seq, 00139 std::vector<unsigned int> joint_index, std::vector<unsigned int> joint_mismatch, 00140 std::string path, unsigned int d, 00141 const int & alen) 00142 { 00143 const char* AA = "ACDEFGHIKLMNPQRSTVWY" ; 00144 const unsigned int num_AA = strlen(AA) ; 00145 00146 assert(path.size()==d) ; 00147 assert(joint_mismatch.size()==joint_index.size()) ; 00148 00149 float64_t res = 0.0 ; 00150 00151 for (unsigned int i=0; i<num_AA; i++) 00152 { 00153 std::vector<unsigned int> joint_mismatch_ ; 00154 std::vector<unsigned int> joint_index_ ; 00155 00156 for (unsigned int j=0; j<joint_index.size(); j++) 00157 { 00158 if (joint_seq[joint_index[j]+d] != AA[i]) 00159 { 00160 if (joint_mismatch[j]+1 <= (unsigned int) max_mismatch) 00161 { 00162 joint_mismatch_.push_back(joint_mismatch[j]+1) ; 00163 joint_index_.push_back(joint_index[j]) ; 00164 } 00165 } 00166 else 00167 { 00168 joint_mismatch_.push_back(joint_mismatch[j]) ; 00169 joint_index_.push_back(joint_index[j]) ; 00170 } 00171 } 00172 if (joint_mismatch_.size()>0) 00173 { 00174 std::string path_ = path + AA[i] ; 00175 00176 if (d+1 < (unsigned int) degree) 00177 { 00178 res += compute_helper(joint_seq, joint_index_, joint_mismatch_, path_, d+1, alen) ; 00179 } 00180 else 00181 { 00182 int anum=0, bnum=0; 00183 for (unsigned int j=0; j<joint_index_.size(); j++) 00184 if (joint_index_[j] < (unsigned int)alen) 00185 { 00186 if (1) 00187 { 00188 anum++ ; 00189 if (joint_mismatch_[j]==0) 00190 anum+=3 ; 00191 } 00192 else 00193 { 00194 if (joint_mismatch_[j]!=0) 00195 anum += AA_helper(path_, joint_seq, joint_index_[j]) ; 00196 else 00197 anum++ ; 00198 } 00199 } 00200 else 00201 { 00202 if (1) 00203 { 00204 bnum++ ; 00205 if (joint_mismatch_[j]==0) 00206 bnum+=3 ; 00207 } 00208 else 00209 { 00210 if (joint_mismatch_[j]!=0) 00211 bnum += AA_helper(path_, joint_seq, joint_index_[j]) ; 00212 else 00213 bnum++ ; 00214 } 00215 } 00216 00217 //fprintf(stdout, "%s: %i x %i\n", path_.c_str(), anum, bnum) ; 00218 00219 res+= anum*bnum ; 00220 } 00221 } 00222 } 00223 return res ; 00224 } 00225 */ 00226 00227 void CSpectrumMismatchRBFKernel::compute_helper_all(const char *joint_seq, 00228 std::vector<struct joint_list_struct> &joint_list, 00229 std::string path, unsigned int d) 00230 { 00231 const char* AA = "ACDEFGHIKLMNPQRSTVWY" ; 00232 const unsigned int num_AA = strlen(AA) ; 00233 00234 assert(path.size()==d) ; 00235 00236 for (unsigned int i=0; i<num_AA; i++) 00237 { 00238 std::vector<struct joint_list_struct> joint_list_ ; 00239 00240 if (d==0) 00241 fprintf(stderr, "i=%i: ", i) ; 00242 if (d==0 && target_letter_0!=-1 && (int)i != target_letter_0 ) 00243 continue ; 00244 00245 if (d==1) 00246 { 00247 fprintf(stdout, "*") ; 00248 fflush(stdout) ; 00249 } 00250 if (d==2) 00251 { 00252 fprintf(stdout, "+") ; 00253 fflush(stdout) ; 00254 } 00255 00256 for (unsigned int j=0; j<joint_list.size(); j++) 00257 { 00258 if (joint_seq[joint_list[j].index+d] != AA[i]) 00259 { 00260 if (joint_list[j].mismatch+1 <= (unsigned int) max_mismatch) 00261 { 00262 struct joint_list_struct list_item ; 00263 list_item = joint_list[j] ; 00264 list_item.mismatch = joint_list[j].mismatch+1 ; 00265 joint_list_.push_back(list_item) ; 00266 } 00267 } 00268 else 00269 joint_list_.push_back(joint_list[j]) ; 00270 } 00271 00272 if (joint_list_.size()>0) 00273 { 00274 std::string path_ = path + AA[i] ; 00275 00276 if (d+1 < (unsigned int) degree) 00277 { 00278 compute_helper_all(joint_seq, joint_list_, path_, d+1) ; 00279 } 00280 else 00281 { 00282 CArray<float64_t> feats ; 00283 feats.resize_array(kernel_matrix.get_dim1()) ; 00284 feats.zero() ; 00285 00286 for (unsigned int j=0; j<joint_list_.size(); j++) 00287 { 00288 if (width==0.0) 00289 { 00290 feats[joint_list_[j].ex_index]++ ; 00291 //if (joint_mismatch_[j]==0) 00292 // feats[joint_ex_index_[j]]+=3 ; 00293 } 00294 else 00295 { 00296 if (joint_list_[j].mismatch!=0) 00297 feats[joint_list_[j].ex_index] += AA_helper(path_, joint_seq, joint_list_[j].index) ; 00298 else 00299 feats[joint_list_[j].ex_index] ++ ; 00300 } 00301 } 00302 00303 std::vector<int> idx ; 00304 for (int r=0; r<feats.get_array_size(); r++) 00305 if (feats[r]!=0.0) 00306 idx.push_back(r) ; 00307 00308 for (unsigned int r=0; r<idx.size(); r++) 00309 for (unsigned int s=r; s<idx.size(); s++) 00310 if (s==r) 00311 kernel_matrix.set_element(feats[idx[r]]*feats[idx[s]] + kernel_matrix.get_element(idx[r],idx[s]), idx[r], idx[s]) ; 00312 else 00313 { 00314 kernel_matrix.set_element(feats[idx[r]]*feats[idx[s]] + kernel_matrix.get_element(idx[r],idx[s]), idx[r], idx[s]) ; 00315 kernel_matrix.set_element(feats[idx[r]]*feats[idx[s]] + kernel_matrix.get_element(idx[s],idx[r]), idx[s], idx[r]) ; 00316 } 00317 } 00318 } 00319 if (d==0) 00320 fprintf(stdout, "\n") ; 00321 } 00322 } 00323 00324 void CSpectrumMismatchRBFKernel::compute_all() 00325 { 00326 std::string joint_seq ; 00327 std::vector<struct joint_list_struct> joint_list ; 00328 00329 assert(lhs->get_num_vectors()==rhs->get_num_vectors()) ; 00330 kernel_matrix.resize_array(lhs->get_num_vectors(), lhs->get_num_vectors()) ; 00331 kernel_matrix_length = lhs->get_num_vectors()*rhs->get_num_vectors(); 00332 for (int i=0; i<lhs->get_num_vectors(); i++) 00333 for (int j=0; j<lhs->get_num_vectors(); j++) 00334 kernel_matrix.set_element(0, i, j) ; 00335 00336 for (int i=0; i<lhs->get_num_vectors(); i++) 00337 { 00338 int32_t alen ; 00339 bool free_avec ; 00340 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(i, alen, free_avec); 00341 00342 for (int apos=0; apos+degree-1<alen; apos++) 00343 { 00344 struct joint_list_struct list_item ; 00345 list_item.ex_index = i ; 00346 list_item.index = apos+joint_seq.size() ; 00347 list_item.mismatch = 0 ; 00348 00349 joint_list.push_back(list_item) ; 00350 } 00351 joint_seq += std::string(avec, alen) ; 00352 00353 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, i, free_avec); 00354 } 00355 00356 compute_helper_all(joint_seq.c_str(), joint_list, "", 0) ; 00357 } 00358 00359 00360 float64_t CSpectrumMismatchRBFKernel::compute(int32_t idx_a, int32_t idx_b) 00361 { 00362 return kernel_matrix.element(idx_a, idx_b) ; 00363 } 00364 /* 00365 bool CSpectrumMismatchRBFKernel::set_weights( 00366 float64_t* ws, int32_t d, int32_t len) 00367 { 00368 if (d==128 && len==128) 00369 { 00370 SG_DEBUG("Setting AA_matrix\n") ; 00371 memcpy(AA_matrix, ws, 128*128*sizeof(float64_t)) ; 00372 return true ; 00373 } 00374 00375 if (d==1 && len==1) 00376 { 00377 sigma=ws[0] ; 00378 SG_DEBUG("Setting sigma to %e\n", sigma) ; 00379 return true ; 00380 } 00381 00382 if (d==2 && len==2) 00383 { 00384 target_letter_0=ws[0] ; 00385 SG_DEBUG("Setting target letter to %c\n", target_letter_0) ; 00386 return true ; 00387 } 00388 00389 if (d!=degree || len<1) 00390 SG_ERROR("Dimension mismatch (should be de(seq_length | 1) x degree)\n"); 00391 00392 length=len; 00393 00394 if (length==0) 00395 length=1; 00396 00397 int32_t num_weights=degree*(length+max_mismatch); 00398 SG_FREE(weights); 00399 weights=SG_MALLOC(float64_t, num_weights); 00400 00401 if (weights) 00402 { 00403 for (int32_t i=0; i<num_weights; i++) { 00404 if (ws[i]) // len(ws) might be != num_weights? 00405 weights[i]=ws[i]; 00406 } 00407 return true; 00408 } 00409 else 00410 return false; 00411 } 00412 */ 00413 00414 bool CSpectrumMismatchRBFKernel::set_AA_matrix(float64_t* AA_matrix_, int32_t nr, int32_t nc) 00415 { 00416 if (AA_matrix_) 00417 { 00418 if (nr!=128 || nc!=128) 00419 SG_ERROR("AA_matrix should be of shape 128x128\n"); 00420 SG_FREE(AA_matrix); 00421 AA_matrix=SG_MALLOC(float64_t, nc*nr); 00422 memcpy(AA_matrix, AA_matrix_, nc*nr*sizeof(float64_t)) ; 00423 SG_DEBUG("Setting AA_matrix\n") ; 00424 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00425 return true ; 00426 } 00427 00428 return false; 00429 } 00430 00431 bool CSpectrumMismatchRBFKernel::set_max_mismatch(int32_t max) 00432 { 00433 max_mismatch=max; 00434 00435 if (lhs!=NULL && rhs!=NULL) 00436 return init(lhs, rhs); 00437 else 00438 return true; 00439 } 00440 00441 void CSpectrumMismatchRBFKernel::register_params() 00442 { 00443 m_parameters->add(°ree, "degree", "degree of the kernel"); 00444 m_parameters->add(&AA_matrix_length, "AA_matrix_length", "the length of AA matrix"); 00445 m_parameters->add_vector(&AA_matrix, &AA_matrix_length, "AA_matrix", "128*128 scalar product matrix"); 00446 m_parameters->add(&width,"width","width of Gaussian"); 00447 m_parameters->add(&target_letter_0, "target_letter_0","target letter 0"); 00448 m_parameters->add(&initialized, "initialized", "the mark of initialization status"); 00449 m_parameters->add_vector((SGString<float64_t>**)&kernel_matrix, &kernel_matrix_length, "kernel_matrix", "the kernel matrix with its length defined by the number of vectors of the string features"); 00450 } 00451 00452 void CSpectrumMismatchRBFKernel::register_alphabet() 00453 { 00454 m_parameters->add((CSGObject**)&alphabet, "alphabet", "the alphabet used by kernel"); 00455 } 00456 00457 void CSpectrumMismatchRBFKernel::init() 00458 { 00459 alphabet = NULL; 00460 degree = 0; 00461 max_mismatch = 0; 00462 AA_matrix = NULL; 00463 width = 0.0; 00464 00465 initialized = false; 00466 target_letter_0 = 0; 00467 } 00468