SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2011 Soeren Sonnenburg 00008 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society 00009 */ 00010 #include <shogun/distributions/PositionalPWM.h> 00011 #include <shogun/mathematics/Math.h> 00012 #include <shogun/base/Parameter.h> 00013 #include <shogun/features/Alphabet.h> 00014 #include <shogun/features/StringFeatures.h> 00015 00016 using namespace shogun; 00017 00018 CPositionalPWM::CPositionalPWM() : CDistribution(), 00019 m_pwm_rows(0), m_pwm_cols(0), m_pwm(NULL), 00020 m_sigma(0), m_mean(0), 00021 m_w_rows(0), m_w_cols(0), m_w(NULL), m_poim(NULL) 00022 00023 { 00024 register_params(); 00025 } 00026 00027 CPositionalPWM::~CPositionalPWM() 00028 { 00029 SG_FREE(m_pwm); 00030 SG_FREE(m_w); 00031 } 00032 00033 bool CPositionalPWM::train(CFeatures* data) 00034 { 00035 SG_NOTIMPLEMENTED; 00036 return true; 00037 } 00038 00039 int32_t CPositionalPWM::get_num_model_parameters() 00040 { 00041 return m_pwm_rows*m_pwm_cols+2; 00042 } 00043 00044 float64_t CPositionalPWM::get_log_model_parameter(int32_t num_param) 00045 { 00046 ASSERT(num_param>0 && num_param<=m_pwm_rows*m_pwm_cols+2); 00047 00048 if (num_param<m_pwm_rows*m_pwm_cols) 00049 { 00050 ASSERT(m_pwm); 00051 return m_pwm[num_param]; 00052 } 00053 else if (num_param<m_pwm_rows*m_pwm_cols+1) 00054 return CMath::log(m_sigma); 00055 else 00056 return CMath::log(m_mean); 00057 } 00058 00059 float64_t CPositionalPWM::get_log_derivative(int32_t num_param, int32_t num_example) 00060 { 00061 SG_NOTIMPLEMENTED; 00062 return 0; 00063 } 00064 00065 float64_t CPositionalPWM::get_log_likelihood_example(int32_t num_example) 00066 { 00067 ASSERT(features); 00068 ASSERT(features->get_feature_class() == C_STRING); 00069 ASSERT(features->get_feature_type()==F_BYTE); 00070 00071 CStringFeatures<uint8_t>* strs=(CStringFeatures<uint8_t>*) features; 00072 00073 float64_t lik=0; 00074 int32_t len=0; 00075 bool do_free=false; 00076 00077 uint8_t* str = strs->get_feature_vector(num_example, len, do_free); 00078 00079 if (!(m_w && m_w_cols==len)) 00080 return 0; //TODO 00081 00082 for (int32_t i=0; i<len; i++) 00083 lik+=m_w[4*i+str[i]]; 00084 00085 strs->free_feature_vector(str, num_example, do_free); 00086 return lik; 00087 } 00088 00089 float64_t CPositionalPWM::get_log_likelihood_window(uint8_t* window, int32_t len, float64_t pos) 00090 { 00091 ASSERT(m_pwm_cols == len); 00092 float64_t score = CMath::log(1/(m_sigma*CMath::sqrt(2*M_PI))) - 00093 CMath::sq(pos-m_mean)/(2*CMath::sq(m_sigma)); 00094 00095 for (int32_t i=0; i<m_pwm_cols; i++) 00096 score+=m_pwm[m_pwm_rows*i+window[i]]; 00097 00098 return score; 00099 } 00100 00101 void CPositionalPWM::compute_w(int32_t num_pos) 00102 { 00103 ASSERT(m_pwm && m_pwm_rows && m_pwm_cols); 00104 00105 m_w_rows=CMath::pow(m_pwm_rows, m_pwm_cols); 00106 m_w_cols=num_pos; 00107 00108 SG_FREE(m_w); 00109 m_w=SG_MALLOC(float64_t, m_w_cols*m_w_rows); 00110 00111 uint8_t* window=SG_MALLOC(uint8_t, m_pwm_cols); 00112 CMath::fill_vector(window, m_pwm_cols, (uint8_t) 0); 00113 00114 const int32_t last_idx=m_pwm_cols-1; 00115 for (int32_t i=0; i<m_w_rows; i++) 00116 { 00117 for (int32_t j=0; j<m_w_cols; j++) 00118 m_w[j*m_w_rows+i]=get_log_likelihood_window(window, m_pwm_cols, j); 00119 00120 window[last_idx]++; 00121 int32_t window_ptr=last_idx; 00122 while (window[window_ptr]==m_pwm_rows && window_ptr>0) 00123 { 00124 window[window_ptr]=0; 00125 window_ptr--; 00126 window[window_ptr]++; 00127 } 00128 00129 } 00130 } 00131 00132 void CPositionalPWM::register_params() 00133 { 00134 m_parameters->add_vector(&m_poim, &m_poim_len, "poim", "POIM Scoring Matrix"); 00135 m_parameters->add_matrix(&m_w, &m_w_rows, &m_w_cols, "w", "Scoring Matrix"); 00136 m_parameters->add_matrix(&m_pwm, &m_pwm_rows, &m_pwm_cols, "pwm", "Positional Weight Matrix."); 00137 m_parameters->add(&m_sigma, "sigma", "Standard Deviation."); 00138 m_parameters->add(&m_mean, "mean", "Mean."); 00139 } 00140 00141 void CPositionalPWM::compute_scoring(int32_t max_degree) 00142 { 00143 ASSERT(m_w); 00144 00145 int32_t num_feat=m_w_cols; 00146 int32_t num_sym=0; 00147 int32_t order=m_pwm_cols; 00148 int32_t num_words=m_pwm_cols; 00149 00150 CAlphabet* alpha=new CAlphabet(DNA); 00151 CStringFeatures<uint16_t>* str= new CStringFeatures<uint16_t>(alpha); 00152 int32_t num_bits=alpha->get_num_bits(); 00153 str->compute_symbol_mask_table(num_bits); 00154 00155 for (int32_t i=0; i<order; i++) 00156 num_sym+=CMath::pow((int32_t) num_words,i+1); 00157 00158 SG_FREE(m_poim); 00159 m_poim_len=num_feat*num_sym; 00160 m_poim=SG_MALLOC(float64_t, num_feat*num_sym); 00161 memset(m_poim,0, size_t(num_feat)*size_t(num_sym)); 00162 00163 uint32_t kmer_mask=0; 00164 uint32_t words=CMath::pow((int32_t) num_words,(int32_t) order); 00165 int32_t offset=0; 00166 00167 for (int32_t o=0; o<max_degree; o++) 00168 { 00169 float64_t* contrib=&m_poim[offset]; 00170 offset+=CMath::pow((int32_t) num_words,(int32_t) o+1); 00171 00172 kmer_mask=(kmer_mask<<(num_bits)) | str->get_masked_symbols(0xffff, 1); 00173 00174 for (int32_t p=-o; p<order; p++) 00175 { 00176 int32_t o_sym=0, m_sym=0, il=0,ir=0, jl=0; 00177 uint32_t imer_mask=kmer_mask; 00178 uint32_t jmer_mask=kmer_mask; 00179 00180 if (p<0) 00181 { 00182 il=-p; 00183 m_sym=order-o-p-1; 00184 o_sym=-p; 00185 } 00186 else if (p<order-o) 00187 { 00188 ir=p; 00189 m_sym=order-o-1; 00190 } 00191 else 00192 { 00193 ir=p; 00194 m_sym=p; 00195 o_sym=p-order+o+1; 00196 jl=order-ir; 00197 imer_mask=(kmer_mask>>(num_bits*o_sym)); 00198 jmer_mask=(kmer_mask>>(num_bits*jl)); 00199 } 00200 00201 float64_t marginalizer= 00202 1.0/CMath::pow((int32_t) num_words,(int32_t) m_sym); 00203 00204 for (uint32_t i=0; i<words; i++) 00205 { 00206 uint16_t x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask; 00207 00208 if (p>=0 && p<order-o) 00209 { 00210 contrib[x]+=m_w[m_w_cols*ir+i]*marginalizer; 00211 } 00212 else 00213 { 00214 for (uint32_t j=0; j< (uint32_t) CMath::pow((int32_t) num_words, (int32_t) o_sym); j++) 00215 { 00216 uint32_t c=x | ((j & jmer_mask) << (num_bits*jl)); 00217 contrib[c]+=m_w[m_w_cols*il+i]*marginalizer; 00218 } 00219 } 00220 } 00221 } 00222 } 00223 } 00224 00225 SGMatrix<float64_t> CPositionalPWM::get_scoring(int32_t d) 00226 { 00227 int32_t offs=0; 00228 for (int32_t i=0; i<d-1; i++) 00229 offs+=CMath::pow((int32_t) m_w_rows,i+1); 00230 int32_t rows=CMath::pow((int32_t) m_w_rows,d); 00231 int32_t cols=m_w_cols; 00232 return SGMatrix<float64_t>(&m_poim[offs],rows,cols); 00233 }