SHOGUN
v1.1.0
|
00001 #include <shogun/features/PolyFeatures.h> 00002 00003 using namespace shogun; 00004 00005 CPolyFeatures::CPolyFeatures() :CDotFeatures() 00006 { 00007 m_feat=NULL; 00008 m_degree=0; 00009 m_normalize=false; 00010 m_input_dimensions=0; 00011 m_multi_index=NULL; 00012 m_multinomial_coefficients=NULL; 00013 m_normalization_values=NULL; 00014 00015 register_parameters(); 00016 } 00017 00018 CPolyFeatures::CPolyFeatures(CSimpleFeatures<float64_t>* feat, int32_t degree, bool normalize) 00019 : CDotFeatures(), m_multi_index(NULL), m_multinomial_coefficients(NULL), 00020 m_normalization_values(NULL) 00021 { 00022 ASSERT(feat); 00023 00024 m_feat = feat; 00025 SG_REF(m_feat); 00026 m_degree=degree; 00027 m_normalize=normalize; 00028 m_input_dimensions=feat->get_num_features(); 00029 m_output_dimensions=calc_feature_space_dimensions(m_input_dimensions, m_degree); 00030 00031 store_multi_index(); 00032 store_multinomial_coefficients(); 00033 if (m_normalize) 00034 store_normalization_values(); 00035 00036 register_parameters(); 00037 } 00038 00039 00040 CPolyFeatures::~CPolyFeatures() 00041 { 00042 SG_FREE(m_multi_index); 00043 SG_FREE(m_multinomial_coefficients); 00044 SG_FREE(m_normalization_values); 00045 SG_UNREF(m_feat); 00046 } 00047 00048 CPolyFeatures::CPolyFeatures(const CPolyFeatures & orig) 00049 { 00050 SG_PRINT("CPolyFeatures:\n"); 00051 SG_NOTIMPLEMENTED; 00052 }; 00053 00054 int32_t CPolyFeatures::get_dim_feature_space() const 00055 { 00056 return m_output_dimensions; 00057 } 00058 00059 int32_t CPolyFeatures::get_nnz_features_for_vector(int32_t num) 00060 { 00061 return m_output_dimensions; 00062 } 00063 00064 EFeatureType CPolyFeatures::get_feature_type() 00065 { 00066 return F_UNKNOWN; 00067 } 00068 00069 EFeatureClass CPolyFeatures::get_feature_class() 00070 { 00071 return C_POLY; 00072 } 00073 00074 int32_t CPolyFeatures::get_num_vectors() const 00075 { 00076 if (m_feat) 00077 return m_feat->get_num_vectors(); 00078 else 00079 return 0; 00080 00081 } 00082 00083 int32_t CPolyFeatures::get_size() 00084 { 00085 return sizeof(float64_t); 00086 } 00087 00088 void* CPolyFeatures::get_feature_iterator(int32_t vector_index) 00089 { 00090 SG_NOTIMPLEMENTED; 00091 return NULL; 00092 } 00093 00094 bool CPolyFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00095 { 00096 SG_NOTIMPLEMENTED; 00097 return NULL; 00098 } 00099 00100 void CPolyFeatures::free_feature_iterator(void* iterator) 00101 { 00102 SG_NOTIMPLEMENTED; 00103 } 00104 00105 00106 00107 float64_t CPolyFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00108 { 00109 ASSERT(df); 00110 ASSERT(df->get_feature_type() == get_feature_type()); 00111 ASSERT(df->get_feature_class() == get_feature_class()); 00112 00113 CPolyFeatures* pf=(CPolyFeatures*) df; 00114 00115 int32_t len1; 00116 bool do_free1; 00117 float64_t* vec1 = m_feat->get_feature_vector(vec_idx1, len1, do_free1); 00118 00119 int32_t len2; 00120 bool do_free2; 00121 float64_t* vec2 = pf->m_feat->get_feature_vector(vec_idx2, len2, do_free2); 00122 00123 float64_t sum=0; 00124 int cnt=0; 00125 for (int j=0; j<m_output_dimensions; j++) 00126 { 00127 float64_t out1=m_multinomial_coefficients[j]; 00128 float64_t out2=m_multinomial_coefficients[j]; 00129 for (int k=0; k<m_degree; k++) 00130 { 00131 out1*=vec1[m_multi_index[cnt]]; 00132 out2*=vec2[m_multi_index[cnt]]; 00133 cnt++; 00134 } 00135 sum+=out1*out2; 00136 } 00137 m_feat->free_feature_vector(vec1, len1, do_free1); 00138 pf->m_feat->free_feature_vector(vec2, len2, do_free2); 00139 00140 return sum; 00141 } 00142 00143 float64_t CPolyFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len) 00144 { 00145 if (vec2_len != m_output_dimensions) 00146 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions); 00147 00148 int32_t len; 00149 bool do_free; 00150 float64_t* vec = m_feat->get_feature_vector(vec_idx1, len, do_free); 00151 00152 00153 int cnt=0; 00154 float64_t sum=0; 00155 for (int j=0; j<vec2_len; j++) 00156 { 00157 float64_t output=m_multinomial_coefficients[j]; 00158 for (int k=0; k<m_degree; k++) 00159 { 00160 output*=vec[m_multi_index[cnt]]; 00161 cnt++; 00162 } 00163 sum+=output*vec2[j]; 00164 } 00165 if (m_normalize) 00166 sum = sum/m_normalization_values[vec_idx1]; 00167 00168 m_feat->free_feature_vector(vec, len, do_free); 00169 return sum; 00170 } 00171 void CPolyFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00172 { 00173 if (vec2_len != m_output_dimensions) 00174 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions); 00175 00176 int32_t len; 00177 bool do_free; 00178 float64_t* vec = m_feat->get_feature_vector(vec_idx1, len, do_free); 00179 00180 00181 int cnt=0; 00182 float32_t norm_val=1; 00183 if (m_normalize) 00184 norm_val = m_normalization_values[vec_idx1]; 00185 alpha/=norm_val; 00186 for (int j=0; j<vec2_len; j++) 00187 { 00188 float64_t output=m_multinomial_coefficients[j]; 00189 for (int k=0; k<m_degree; k++) 00190 { 00191 output*=vec[m_multi_index[cnt]]; 00192 cnt++; 00193 } 00194 if (abs_val) 00195 output=CMath::abs(output); 00196 00197 vec2[j]+=alpha*output; 00198 } 00199 m_feat->free_feature_vector(vec, len, do_free); 00200 } 00201 void CPolyFeatures::store_normalization_values() 00202 { 00203 SG_FREE(m_normalization_values); 00204 00205 int32_t num_vec = this->get_num_vectors(); 00206 00207 m_normalization_values=SG_MALLOC(float32_t, num_vec); 00208 for (int i=0; i<num_vec; i++) 00209 { 00210 float64_t tmp = CMath::sqrt(dot(i, this,i)); 00211 if (tmp==0) 00212 // trap division by zero 00213 m_normalization_values[i]=1; 00214 else 00215 m_normalization_values[i]=tmp; 00216 } 00217 00218 } 00219 00220 void CPolyFeatures::store_multi_index() 00221 { 00222 SG_FREE(m_multi_index); 00223 00224 m_multi_index=SG_MALLOC(uint16_t, m_output_dimensions*m_degree); 00225 00226 uint16_t* exponents = SG_MALLOC(uint16_t, m_input_dimensions); 00227 if (!exponents) 00228 SG_ERROR( "Error allocating mem \n"); 00229 /*copy adress: otherwise it will be overwritten in recursion*/ 00230 uint16_t* index = m_multi_index; 00231 enumerate_multi_index(0, &index, exponents, m_degree); 00232 00233 SG_FREE(exponents); 00234 } 00235 00236 void CPolyFeatures::enumerate_multi_index(const int32_t feat_idx, uint16_t** index, uint16_t* exponents, const int32_t degree) 00237 { 00238 if (feat_idx==m_input_dimensions-1 || degree==0) 00239 { 00240 if (feat_idx==m_input_dimensions-1) 00241 exponents[feat_idx] = degree; 00242 if (degree==0) 00243 exponents[feat_idx] = 0; 00244 int32_t i, j; 00245 for (j=0; j<feat_idx+1; j++) 00246 for (i=0; i<exponents[j]; i++) 00247 { 00248 **index = j; 00249 (*index)++; 00250 } 00251 exponents[feat_idx] = 0; 00252 return; 00253 } 00254 int32_t k; 00255 for (k=0; k<=degree; k++) 00256 { 00257 exponents[feat_idx] = k; 00258 enumerate_multi_index(feat_idx+1, index, exponents, degree-k); 00259 } 00260 return; 00261 00262 } 00263 00264 void CPolyFeatures::store_multinomial_coefficients() 00265 { 00266 SG_FREE(m_multinomial_coefficients); 00267 00268 m_multinomial_coefficients = SG_MALLOC(float64_t, m_output_dimensions); 00269 int32_t* exponents = SG_MALLOC(int32_t, m_input_dimensions); 00270 if (!exponents) 00271 SG_ERROR( "Error allocating mem \n"); 00272 int32_t j=0; 00273 for (j=0; j<m_input_dimensions; j++) 00274 exponents[j] = 0; 00275 int32_t k, cnt=0; 00276 for (j=0; j<m_output_dimensions; j++) 00277 { 00278 for (k=0; k<m_degree; k++) 00279 { 00280 exponents[m_multi_index[cnt]] ++; 00281 cnt++; 00282 } 00283 m_multinomial_coefficients[j] = sqrt((double) multinomialcoef(exponents, m_input_dimensions)); 00284 for (k=0; k<m_input_dimensions; k++) 00285 { 00286 exponents[k]=0; 00287 } 00288 } 00289 SG_FREE(exponents); 00290 } 00291 00292 int32_t CPolyFeatures::bico2(int32_t n, int32_t k) 00293 { 00294 00295 /* for this problem k is usually small (<=degree), 00296 * thus it is efficient to 00297 * to use recursion and prune end recursions*/ 00298 if (n<k) 00299 return 0; 00300 if (k>n/2) 00301 k = n-k; 00302 if (k<0) 00303 return 0; 00304 if (k==0) 00305 return 1; 00306 if (k==1) 00307 return n; 00308 if (k<4) 00309 return bico2(n-1, k-1)+bico2(n-1, k); 00310 00311 /* call function as implemented in numerical recipies: 00312 * much more efficient for large binomial coefficients*/ 00313 return bico(n, k); 00314 00315 } 00316 00317 int32_t CPolyFeatures::calc_feature_space_dimensions(int32_t N, int32_t D) 00318 { 00319 if (N==1) 00320 return 1; 00321 if (D==0) 00322 return 1; 00323 int32_t d; 00324 int32_t ret = 0; 00325 for (d=0; d<=D; d++) 00326 ret += calc_feature_space_dimensions(N-1, d); 00327 00328 return ret; 00329 } 00330 00331 int32_t CPolyFeatures::multinomialcoef(int32_t* exps, int32_t len) 00332 { 00333 int32_t ret = 1, i; 00334 int32_t n = 0; 00335 for (i=0; i<len; i++) 00336 { 00337 n += exps[i]; 00338 ret *= bico2(n, exps[i]); 00339 } 00340 return ret; 00341 } 00342 00343 /* gammln as implemented in the 00344 * second edition of Numerical Recipes in C */ 00345 float64_t CPolyFeatures::gammln(float64_t xx) 00346 { 00347 float64_t x,y,tmp,ser; 00348 static float64_t cof[6]={76.18009172947146, -86.50532032941677, 00349 24.01409824083091, -1.231739572450155, 00350 0.1208650973866179e-2,-0.5395239384953e-5}; 00351 int32_t j; 00352 00353 y=x=xx; 00354 tmp=x+5.5; 00355 tmp -= (x+0.5)*log(tmp); 00356 ser=1.000000000190015; 00357 for (j=0;j<=5;j++) ser += cof[j]/++y; 00358 return -tmp+log(2.5066282746310005*ser/x); 00359 } 00360 00361 float64_t CPolyFeatures::factln(int32_t n) 00362 { 00363 static float64_t a[101]; 00364 00365 if (n < 0) SG_ERROR("Negative factorial in routine factln\n"); 00366 if (n <= 1) return 0.0; 00367 if (n <= 100) return a[n] ? a[n] : (a[n]=gammln(n+1.0)); 00368 else return gammln(n+1.0); 00369 } 00370 00371 int32_t CPolyFeatures::bico(int32_t n, int32_t k) 00372 { 00373 /* use floor to clean roundoff errors*/ 00374 return (int32_t) floor(0.5+exp(factln(n)-factln(k)-factln(n-k))); 00375 } 00376 CFeatures* CPolyFeatures::duplicate() const 00377 { 00378 return new CPolyFeatures(*this); 00379 } 00380 00381 void CPolyFeatures::register_parameters() 00382 { 00383 m_parameters->add((CSGObject**) &m_feat, "features", 00384 "Features in original space."); 00385 m_parameters->add(&m_degree, "degree", "Degree of the polynomial kernel."); 00386 m_parameters->add(&m_normalize, "normalize", "Normalize?"); 00387 m_parameters->add(&m_input_dimensions, "input_dimensions", 00388 "Dimensions of the input space."); 00389 m_parameters->add(&m_output_dimensions, "output_dimensions", 00390 "Dimensions of the feature space of the polynomial kernel."); 00391 00392 multi_index_length=m_output_dimensions*m_degree; 00393 m_parameters->add_vector( 00394 &m_multi_index, 00395 &multi_index_length, 00396 "multi_index", 00397 "Flattened matrix of all multi indices that sum do the" 00398 " degree of the polynomial kernel."); 00399 00400 multinomial_coefficients_length=m_output_dimensions; 00401 m_parameters->add_vector(&m_multinomial_coefficients, 00402 &multinomial_coefficients_length, "multinomial_coefficients", 00403 "Multinomial coefficients for all multi-indices."); 00404 00405 normalization_values_length=get_num_vectors(); 00406 m_parameters->add_vector(&m_normalization_values, 00407 &normalization_values_length, "normalization_values", 00408 "Norm of each training example."); 00409 }