SHOGUN
v1.1.0
|
00001 #include <shogun/features/SparseFeatures.h> 00002 #include <shogun/preprocessor/SparsePreprocessor.h> 00003 #include <shogun/mathematics/Math.h> 00004 #include <shogun/lib/DataType.h> 00005 #include <shogun/io/SGIO.h> 00006 00007 #include <string.h> 00008 #include <stdlib.h> 00009 00010 namespace shogun 00011 { 00012 00013 template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size) 00014 : CDotFeatures(size), num_vectors(0), num_features(0), 00015 sparse_feature_matrix(NULL), feature_cache(NULL) 00016 { 00017 init(); 00018 } 00019 00020 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseVector<ST>* src, 00021 int32_t num_feat, int32_t num_vec, bool copy) 00022 : CDotFeatures(0), num_vectors(0), num_features(0), 00023 sparse_feature_matrix(NULL), feature_cache(NULL) 00024 { 00025 init(); 00026 00027 if (!copy) 00028 set_sparse_feature_matrix(SGSparseMatrix<ST>(src, num_feat, num_vec)); 00029 else 00030 { 00031 sparse_feature_matrix = SG_MALLOC(SGSparseVector<ST>, num_vec); 00032 memcpy(sparse_feature_matrix, src, sizeof(SGSparseVector<ST>)*num_vec); 00033 for (int32_t i=0; i< num_vec; i++) 00034 { 00035 sparse_feature_matrix[i].features = SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries); 00036 memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries); 00037 00038 } 00039 } 00040 } 00041 00042 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGSparseMatrix<ST> sparse) 00043 : CDotFeatures(0), num_vectors(0), num_features(0), 00044 sparse_feature_matrix(NULL), feature_cache(NULL) 00045 { 00046 init(); 00047 00048 set_sparse_feature_matrix(sparse); 00049 } 00050 00051 template<class ST> CSparseFeatures<ST>::CSparseFeatures(SGMatrix<ST> dense) 00052 : CDotFeatures(0), num_vectors(0), num_features(0), 00053 sparse_feature_matrix(NULL), feature_cache(NULL) 00054 { 00055 init(); 00056 00057 set_full_feature_matrix(dense); 00058 } 00059 00060 template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig) 00061 : CDotFeatures(orig), num_vectors(orig.num_vectors), 00062 num_features(orig.num_features), 00063 sparse_feature_matrix(orig.sparse_feature_matrix), 00064 feature_cache(orig.feature_cache) 00065 { 00066 init(); 00067 00068 if (orig.sparse_feature_matrix) 00069 { 00070 free_sparse_feature_matrix(); 00071 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors); 00072 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(SGSparseVector<ST>)*num_vectors); 00073 for (int32_t i=0; i< num_vectors; i++) 00074 { 00075 sparse_feature_matrix[i].features=SG_MALLOC(SGSparseVectorEntry<ST>, sparse_feature_matrix[i].num_feat_entries); 00076 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries); 00077 00078 } 00079 } 00080 00081 m_subset=orig.m_subset->duplicate(); 00082 } 00083 template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader) 00084 : CDotFeatures(loader), num_vectors(0), num_features(0), 00085 sparse_feature_matrix(NULL), feature_cache(NULL) 00086 { 00087 init(); 00088 00089 load(loader); 00090 } 00091 00092 template<class ST> CSparseFeatures<ST>::~CSparseFeatures() 00093 { 00094 free_sparse_features(); 00095 } 00096 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_matrix() 00097 { 00098 clean_tsparse(sparse_feature_matrix, num_vectors); 00099 sparse_feature_matrix = NULL; 00100 num_vectors=0; 00101 num_features=0; 00102 remove_subset(); 00103 } 00104 template<class ST> void CSparseFeatures<ST>::free_sparse_features() 00105 { 00106 free_sparse_feature_matrix(); 00107 delete feature_cache; 00108 feature_cache = NULL; 00109 } 00110 template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const 00111 { 00112 return new CSparseFeatures<ST>(*this); 00113 } 00114 00115 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index) 00116 { 00117 ASSERT(index>=0 && index<num_features) ; 00118 ASSERT(num>=0 && num<get_num_vectors()) ; 00119 00120 int32_t i; 00121 SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00122 ST ret = 0 ; 00123 00124 if (sv.features) 00125 { 00126 for (i=0; i<sv.num_feat_entries; i++) 00127 if (sv.features[i].feat_index==index) 00128 ret+=sv.features[i].entry ; 00129 } 00130 00131 free_sparse_feature_vector(sv, num); 00132 00133 return ret ; 00134 } 00135 00136 template<class ST> ST* CSparseFeatures<ST>::get_full_feature_vector(int32_t num, int32_t& len) 00137 { 00138 int32_t i; 00139 len=0; 00140 SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00141 ST* fv=NULL; 00142 00143 if (sv.features) 00144 { 00145 len=num_features; 00146 fv=SG_MALLOC(ST, num_features); 00147 00148 for (i=0; i<num_features; i++) 00149 fv[i]=0; 00150 00151 for (i=0; i<sv.num_feat_entries; i++) 00152 fv[sv.features[i].feat_index]= sv.features[i].entry; 00153 } 00154 00155 free_sparse_feature_vector(sv, num); 00156 00157 return fv; 00158 } 00159 00160 template<class ST> SGVector<ST> CSparseFeatures<ST>::get_full_feature_vector(int32_t num) 00161 { 00162 if (num>=num_vectors) 00163 { 00164 SG_ERROR("Index out of bounds (number of vectors %d, you " 00165 "requested %d)\n", num_vectors, num); 00166 } 00167 00168 SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00169 00170 SGVector<ST> dense; 00171 00172 if (sv.features) 00173 { 00174 dense.do_free=true; 00175 dense.vlen=num_features; 00176 dense.vector=SG_MALLOC(ST, num_features); 00177 memset(dense.vector, 0, sizeof(ST)*num_features); 00178 00179 for (int32_t i=0; i<sv.num_feat_entries; i++) 00180 { 00181 dense.vector[sv.features[i].feat_index]= sv.features[i].entry; 00182 } 00183 } 00184 00185 free_sparse_feature_vector(sv, num); 00186 00187 return dense; 00188 } 00189 00190 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num) 00191 { 00192 SGSparseVector<ST> sv = get_sparse_feature_vector(num); 00193 int32_t len=sv.num_feat_entries; 00194 free_sparse_feature_vector(sv, num); 00195 return len; 00196 } 00197 00198 template<class ST> SGSparseVector<ST> CSparseFeatures<ST>::get_sparse_feature_vector(int32_t num) 00199 { 00200 ASSERT(num<get_num_vectors()); 00201 00202 index_t real_num=subset_idx_conversion(num); 00203 00204 SGSparseVector<ST> result; 00205 00206 if (sparse_feature_matrix) 00207 { 00208 result=sparse_feature_matrix[real_num]; 00209 result.do_free=false; 00210 return result; 00211 } 00212 else 00213 { 00214 result.do_free=false; 00215 00216 if (feature_cache) 00217 { 00218 result.features=feature_cache->lock_entry(num); 00219 00220 if (result.features) 00221 return result; 00222 else 00223 { 00224 result.features=feature_cache->set_entry(num); 00225 } 00226 } 00227 00228 if (!result.features) 00229 result.do_free=true; 00230 00231 result.features=compute_sparse_feature_vector(num, 00232 result.num_feat_entries, result.features); 00233 00234 00235 if (get_num_preprocessors()) 00236 { 00237 int32_t tmp_len=result.num_feat_entries; 00238 SGSparseVectorEntry<ST>* tmp_feat_before=result.features; 00239 SGSparseVectorEntry<ST>* tmp_feat_after = NULL; 00240 00241 for (int32_t i=0; i<get_num_preprocessors(); i++) 00242 { 00243 //tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len); 00244 00245 if (i!=0) // delete feature vector, except for the the first one, i.e., feat 00246 SG_FREE(tmp_feat_before); 00247 tmp_feat_before=tmp_feat_after; 00248 } 00249 00250 memcpy(result.features, tmp_feat_after, 00251 sizeof(SGSparseVectorEntry<ST>)*tmp_len); 00252 00253 SG_FREE(tmp_feat_after); 00254 result.num_feat_entries=tmp_len ; 00255 SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features); 00256 } 00257 result.vec_index=num; 00258 return result ; 00259 } 00260 } 00261 00262 template<class ST> ST CSparseFeatures<ST>::sparse_dot(ST alpha, SGSparseVectorEntry<ST>* avec, int32_t alen, SGSparseVectorEntry<ST>* bvec, int32_t blen) 00263 { 00264 ST result=0; 00265 00266 //result remains zero when one of the vectors is non existent 00267 if (avec && bvec) 00268 { 00269 if (alen<=blen) 00270 { 00271 int32_t j=0; 00272 for (int32_t i=0; i<alen; i++) 00273 { 00274 int32_t a_feat_idx=avec[i].feat_index; 00275 00276 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) ) 00277 j++; 00278 00279 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) ) 00280 { 00281 result+= avec[i].entry * bvec[j].entry; 00282 j++; 00283 } 00284 } 00285 } 00286 else 00287 { 00288 int32_t j=0; 00289 for (int32_t i=0; i<blen; i++) 00290 { 00291 int32_t b_feat_idx=bvec[i].feat_index; 00292 00293 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) ) 00294 j++; 00295 00296 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) ) 00297 { 00298 result+= bvec[i].entry * avec[j].entry; 00299 j++; 00300 } 00301 } 00302 } 00303 00304 result*=alpha; 00305 } 00306 00307 return result; 00308 } 00309 00310 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b) 00311 { 00312 ASSERT(vec); 00313 ASSERT(dim==num_features); 00314 ST result=b; 00315 00316 SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00317 00318 if (sv.features) 00319 { 00320 for (int32_t i=0; i<sv.num_feat_entries; i++) 00321 { 00322 result+=alpha*vec[sv.features[i].feat_index] 00323 *sv.features[i].entry; 00324 } 00325 } 00326 00327 free_sparse_feature_vector(sv, num); 00328 return result; 00329 } 00330 00331 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val) 00332 { 00333 ASSERT(vec); 00334 if (dim!=num_features) 00335 { 00336 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00337 dim, num_features); 00338 } 00339 00340 SGSparseVector<ST> sv=get_sparse_feature_vector(num); 00341 00342 if (sv.features) 00343 { 00344 if (abs_val) 00345 { 00346 for (int32_t i=0; i<sv.num_feat_entries; i++) 00347 { 00348 vec[sv.features[i].feat_index]+=alpha 00349 *CMath::abs(sv.features[i].entry); 00350 } 00351 } 00352 else 00353 { 00354 for (int32_t i=0; i<sv.num_feat_entries; i++) 00355 { 00356 vec[sv.features[i].feat_index]+=alpha 00357 *sv.features[i].entry; 00358 } 00359 } 00360 } 00361 00362 free_sparse_feature_vector(sv, num); 00363 } 00364 00365 template<class ST> void CSparseFeatures<ST>::free_sparse_feature_vector(SGSparseVector<ST> vec, int32_t num) 00366 { 00367 if (feature_cache) 00368 feature_cache->unlock_entry(subset_idx_conversion(num)); 00369 00370 vec.free_vector(); 00371 } 00372 00373 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec) 00374 { 00375 if (m_subset) 00376 SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n"); 00377 00378 num_feat=num_features; 00379 num_vec=num_vectors; 00380 00381 return sparse_feature_matrix; 00382 } 00383 00384 template<class ST> SGSparseMatrix<ST> CSparseFeatures<ST>::get_sparse_feature_matrix() 00385 { 00386 if (m_subset) 00387 SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n"); 00388 00389 SGSparseMatrix<ST> sm; 00390 sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors); 00391 return sm; 00392 } 00393 00394 template<class ST> void CSparseFeatures<ST>::clean_tsparse(SGSparseVector<ST>* sfm, int32_t num_vec) 00395 { 00396 if (sfm) 00397 { 00398 for (int32_t i=0; i<num_vec; i++) 00399 SG_FREE(sfm[i].features); 00400 00401 SG_FREE(sfm); 00402 } 00403 } 00404 00405 template<class ST> CSparseFeatures<ST>* CSparseFeatures<ST>::get_transposed() 00406 { 00407 int32_t num_feat; 00408 int32_t num_vec; 00409 SGSparseVector<ST>* s=get_transposed(num_feat, num_vec); 00410 return new CSparseFeatures<ST>(s, num_feat, num_vec); 00411 } 00412 00413 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec) 00414 { 00415 num_feat=get_num_vectors(); 00416 num_vec=num_features; 00417 00418 int32_t* hist=SG_MALLOC(int32_t, num_features); 00419 memset(hist, 0, sizeof(int32_t)*num_features); 00420 00421 // count how lengths of future feature vectors 00422 for (int32_t v=0; v<num_feat; v++) 00423 { 00424 SGSparseVector<ST> sv=get_sparse_feature_vector(v); 00425 00426 for (int32_t i=0; i<sv.num_feat_entries; i++) 00427 hist[sv.features[i].feat_index]++; 00428 00429 free_sparse_feature_vector(sv, v); 00430 } 00431 00432 // allocate room for future feature vectors 00433 SGSparseVector<ST>* sfm=SG_MALLOC(SGSparseVector<ST>, num_vec); 00434 for (int32_t v=0; v<num_vec; v++) 00435 { 00436 sfm[v].features= SG_MALLOC(SGSparseVectorEntry<ST>, hist[v]); 00437 sfm[v].num_feat_entries=hist[v]; 00438 sfm[v].vec_index=v; 00439 } 00440 00441 // fill future feature vectors with content 00442 memset(hist,0,sizeof(int32_t)*num_features); 00443 for (int32_t v=0; v<num_feat; v++) 00444 { 00445 SGSparseVector<ST> sv=get_sparse_feature_vector(v); 00446 00447 for (int32_t i=0; i<sv.num_feat_entries; i++) 00448 { 00449 int32_t vidx=sv.features[i].feat_index; 00450 int32_t fidx=v; 00451 sfm[vidx].features[hist[vidx]].feat_index=fidx; 00452 sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry; 00453 hist[vidx]++; 00454 } 00455 00456 free_sparse_feature_vector(sv, v); 00457 } 00458 00459 SG_FREE(hist); 00460 return sfm; 00461 } 00462 00463 template<class ST> void CSparseFeatures<ST>::set_sparse_feature_matrix(SGSparseMatrix<ST> sm) 00464 { 00465 if (m_subset) 00466 SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n"); 00467 00468 00469 free_sparse_feature_matrix(); 00470 sm.own_matrix(); 00471 00472 sparse_feature_matrix=sm.sparse_matrix; 00473 num_features=sm.num_features; 00474 num_vectors=sm.num_vectors; 00475 } 00476 00477 template<class ST> SGMatrix<ST> CSparseFeatures<ST>::get_full_feature_matrix() 00478 { 00479 SGMatrix<ST> full; 00480 00481 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features); 00482 full.num_rows=num_features; 00483 full.num_cols=get_num_vectors(); 00484 full.do_free=true; 00485 full.matrix=SG_MALLOC(ST, int64_t(num_features)*get_num_vectors()); 00486 00487 memset(full.matrix, 0, size_t(num_features)*size_t(get_num_vectors())*sizeof(ST)); 00488 00489 for (int32_t v=0; v<full.num_cols; v++) 00490 { 00491 SGSparseVector<ST> current= 00492 sparse_feature_matrix[subset_idx_conversion(v)]; 00493 00494 for (int32_t f=0; f<current.num_feat_entries; f++) 00495 { 00496 int64_t offs=(current.vec_index*num_features) 00497 +current.features[f].feat_index; 00498 00499 full.matrix[offs]=current.features[f].entry; 00500 } 00501 } 00502 00503 return full; 00504 } 00505 00506 template<class ST> bool CSparseFeatures<ST>::set_full_feature_matrix(SGMatrix<ST> full) 00507 { 00508 remove_subset(); 00509 00510 ST* src=full.matrix; 00511 int32_t num_feat=full.num_rows; 00512 int32_t num_vec=full.num_cols; 00513 00514 free_sparse_feature_matrix(); 00515 bool result=true; 00516 num_features=num_feat; 00517 num_vectors=num_vec; 00518 00519 SG_INFO("converting dense feature matrix to sparse one\n"); 00520 int32_t* num_feat_entries=SG_MALLOC(int, num_vectors); 00521 00522 if (num_feat_entries) 00523 { 00524 int64_t num_total_entries=0; 00525 00526 // count nr of non sparse features 00527 for (int32_t i=0; i< num_vec; i++) 00528 { 00529 num_feat_entries[i]=0; 00530 for (int32_t j=0; j< num_feat; j++) 00531 { 00532 if (src[i*((int64_t) num_feat) + j] != 0) 00533 num_feat_entries[i]++; 00534 } 00535 } 00536 00537 if (num_vec>0) 00538 { 00539 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec); 00540 00541 if (sparse_feature_matrix) 00542 { 00543 for (int32_t i=0; i< num_vec; i++) 00544 { 00545 sparse_feature_matrix[i].vec_index=i; 00546 sparse_feature_matrix[i].num_feat_entries=0; 00547 sparse_feature_matrix[i].features= NULL; 00548 00549 if (num_feat_entries[i]>0) 00550 { 00551 sparse_feature_matrix[i].features= SG_MALLOC(SGSparseVectorEntry<ST>, num_feat_entries[i]); 00552 00553 if (!sparse_feature_matrix[i].features) 00554 { 00555 SG_INFO( "allocation of features failed\n"); 00556 return false; 00557 } 00558 00559 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i]; 00560 int32_t sparse_feat_idx=0; 00561 00562 for (int32_t j=0; j< num_feat; j++) 00563 { 00564 int64_t pos= i*num_feat + j; 00565 00566 if (src[pos] != 0) 00567 { 00568 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos]; 00569 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j; 00570 sparse_feat_idx++; 00571 num_total_entries++; 00572 } 00573 } 00574 } 00575 } 00576 } 00577 else 00578 { 00579 SG_ERROR( "allocation of sparse feature matrix failed\n"); 00580 result=false; 00581 } 00582 00583 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n", 00584 num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec)); 00585 } 00586 else 00587 { 00588 SG_ERROR( "huh ? zero size matrix given ?\n"); 00589 result=false; 00590 } 00591 } 00592 SG_FREE(num_feat_entries); 00593 return result; 00594 } 00595 00596 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing) 00597 { 00598 SG_INFO( "force: %d\n", force_preprocessing); 00599 00600 if ( sparse_feature_matrix && get_num_preprocessors() ) 00601 { 00602 for (int32_t i=0; i<get_num_preprocessors(); i++) 00603 { 00604 if ( (!is_preprocessed(i) || force_preprocessing) ) 00605 { 00606 set_preprocessed(i); 00607 SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name()); 00608 if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL) 00609 return false; 00610 } 00611 return true; 00612 } 00613 return true; 00614 } 00615 else 00616 { 00617 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n"); 00618 return false; 00619 } 00620 } 00621 00622 template<class ST> int32_t CSparseFeatures<ST>::get_size() 00623 { 00624 return sizeof(ST); 00625 } 00626 00627 template<class ST> bool CSparseFeatures<ST>::obtain_from_simple(CSimpleFeatures<ST>* sf) 00628 { 00629 SGMatrix<ST> fm=sf->get_feature_matrix(); 00630 ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0); 00631 00632 return set_full_feature_matrix(fm); 00633 } 00634 00635 template<class ST> int32_t CSparseFeatures<ST>::get_num_vectors() const 00636 { 00637 return m_subset ? m_subset->get_size() : num_vectors; 00638 } 00639 00640 template<class ST> int32_t CSparseFeatures<ST>::get_num_features() 00641 { 00642 return num_features; 00643 } 00644 00645 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num) 00646 { 00647 int32_t n=num_features; 00648 ASSERT(n<=num); 00649 num_features=num; 00650 return num_features; 00651 } 00652 00653 template<class ST> EFeatureClass CSparseFeatures<ST>::get_feature_class() 00654 { 00655 return C_SPARSE; 00656 } 00657 00658 template<class ST> void CSparseFeatures<ST>::free_feature_vector(SGSparseVector<ST> vec, int32_t num) 00659 { 00660 if (feature_cache) 00661 feature_cache->unlock_entry(subset_idx_conversion(num)); 00662 00663 vec.free_vector(); 00664 } 00665 00666 template<class ST> int64_t CSparseFeatures<ST>::get_num_nonzero_entries() 00667 { 00668 int64_t num=0; 00669 index_t num_vec=get_num_vectors(); 00670 for (int32_t i=0; i<num_vec; i++) 00671 num+=sparse_feature_matrix[subset_idx_conversion(i)].num_feat_entries; 00672 00673 return num; 00674 } 00675 00676 template<class ST> float64_t* CSparseFeatures<ST>::compute_squared(float64_t* sq) 00677 { 00678 ASSERT(sq); 00679 00680 index_t num_vec=get_num_vectors(); 00681 for (int32_t i=0; i<num_vec; i++) 00682 { 00683 sq[i]=0; 00684 SGSparseVector<ST> vec=get_sparse_feature_vector(i); 00685 00686 for (int32_t j=0; j<vec.num_feat_entries; j++) 00687 sq[i]+=vec.features[j].entry*vec.features[j].entry; 00688 00689 free_feature_vector(vec, i); 00690 } 00691 00692 return sq; 00693 } 00694 00695 template<class ST> float64_t CSparseFeatures<ST>::compute_squared_norm( 00696 CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, 00697 CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b) 00698 { 00699 int32_t i,j; 00700 ASSERT(lhs); 00701 ASSERT(rhs); 00702 00703 SGSparseVector<float64_t> avec=lhs->get_sparse_feature_vector(idx_a); 00704 SGSparseVector<float64_t> bvec=rhs->get_sparse_feature_vector(idx_b); 00705 ASSERT(avec.features); 00706 ASSERT(bvec.features); 00707 00708 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b]; 00709 00710 if (avec.num_feat_entries<=bvec.num_feat_entries) 00711 { 00712 j=0; 00713 for (i=0; i<avec.num_feat_entries; i++) 00714 { 00715 int32_t a_feat_idx=avec.features[i].feat_index; 00716 00717 while ((j<bvec.num_feat_entries) 00718 &&(bvec.features[j].feat_index<a_feat_idx)) 00719 j++; 00720 00721 if ((j<bvec.num_feat_entries) 00722 &&(bvec.features[j].feat_index==a_feat_idx)) 00723 { 00724 result-=2*(avec.features[i].entry*bvec.features[j].entry); 00725 j++; 00726 } 00727 } 00728 } 00729 else 00730 { 00731 j=0; 00732 for (i=0; i<bvec.num_feat_entries; i++) 00733 { 00734 int32_t b_feat_idx=bvec.features[i].feat_index; 00735 00736 while ((j<avec.num_feat_entries) 00737 &&(avec.features[j].feat_index<b_feat_idx)) 00738 j++; 00739 00740 if ((j<avec.num_feat_entries) 00741 &&(avec.features[j].feat_index==b_feat_idx)) 00742 { 00743 result-=2*(bvec.features[i].entry*avec.features[j].entry); 00744 j++; 00745 } 00746 } 00747 } 00748 00749 ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a); 00750 ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b); 00751 00752 return CMath::abs(result); 00753 } 00754 00755 template<class ST> CLabels* CSparseFeatures<ST>::load_svmlight_file(char* fname, 00756 bool do_sort_features) 00757 { 00758 remove_subset(); 00759 00760 CLabels* lab=NULL; 00761 00762 size_t blocksize=1024*1024; 00763 size_t required_blocksize=blocksize; 00764 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); 00765 FILE* f=fopen(fname, "ro"); 00766 00767 if (f) 00768 { 00769 free_sparse_feature_matrix(); 00770 num_vectors=0; 00771 num_features=0; 00772 00773 SG_INFO("counting line numbers in file %s\n", fname); 00774 size_t sz=blocksize; 00775 size_t block_offs=0; 00776 size_t old_block_offs=0; 00777 fseek(f, 0, SEEK_END); 00778 size_t fsize=ftell(f); 00779 rewind(f); 00780 00781 while (sz == blocksize) 00782 { 00783 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00784 for (size_t i=0; i<sz; i++) 00785 { 00786 block_offs++; 00787 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00788 { 00789 num_vectors++; 00790 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); 00791 old_block_offs=block_offs; 00792 } 00793 } 00794 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 00795 } 00796 00797 SG_INFO("found %d feature vectors\n", num_vectors); 00798 SG_FREE(dummy); 00799 blocksize=required_blocksize; 00800 dummy = SG_MALLOC(uint8_t, blocksize+1); //allow setting of '\0' at EOL 00801 00802 lab=new CLabels(num_vectors); 00803 sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors); 00804 00805 rewind(f); 00806 sz=blocksize; 00807 int32_t lines=0; 00808 while (sz == blocksize) 00809 { 00810 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00811 00812 size_t old_sz=0; 00813 for (size_t i=0; i<sz; i++) 00814 { 00815 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) 00816 { 00817 size_t len=i-old_sz+1; 00818 uint8_t* data=&dummy[old_sz]; 00819 00820 for (size_t j=0; j<len; j++) 00821 dummy[j]=data[j]; 00822 00823 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f); 00824 i=0; 00825 old_sz=0; 00826 sz+=len; 00827 } 00828 00829 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00830 { 00831 00832 size_t len=i-old_sz; 00833 uint8_t* data=&dummy[old_sz]; 00834 00835 int32_t dims=0; 00836 for (size_t j=0; j<len; j++) 00837 { 00838 if (data[j]==':') 00839 dims++; 00840 } 00841 00842 if (dims<=0) 00843 { 00844 SG_ERROR("Error in line %d - number of" 00845 " dimensions is %d line is %d characters" 00846 " long\n line_content:'%.*s'\n", lines, 00847 dims, len, len, (const char*) data); 00848 } 00849 00850 SGSparseVectorEntry<ST>* feat=SG_MALLOC(SGSparseVectorEntry<ST>, dims); 00851 size_t j=0; 00852 for (; j<len; j++) 00853 { 00854 if (data[j]==' ') 00855 { 00856 data[j]='\0'; 00857 00858 lab->set_label(lines, atof((const char*) data)); 00859 break; 00860 } 00861 } 00862 00863 int32_t d=0; 00864 j++; 00865 uint8_t* start=&data[j]; 00866 for (; j<len; j++) 00867 { 00868 if (data[j]==':') 00869 { 00870 data[j]='\0'; 00871 00872 feat[d].feat_index=(int32_t) atoi((const char*) start)-1; 00873 num_features=CMath::max(num_features, feat[d].feat_index+1); 00874 00875 j++; 00876 start=&data[j]; 00877 for (; j<len; j++) 00878 { 00879 if (data[j]==' ' || data[j]=='\n') 00880 { 00881 data[j]='\0'; 00882 feat[d].entry=(ST) atof((const char*) start); 00883 d++; 00884 break; 00885 } 00886 } 00887 00888 if (j==len) 00889 { 00890 data[j]='\0'; 00891 feat[dims-1].entry=(ST) atof((const char*) start); 00892 } 00893 00894 j++; 00895 start=&data[j]; 00896 } 00897 } 00898 00899 sparse_feature_matrix[lines].vec_index=lines; 00900 sparse_feature_matrix[lines].num_feat_entries=dims; 00901 sparse_feature_matrix[lines].features=feat; 00902 00903 old_sz=i+1; 00904 lines++; 00905 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t"); 00906 } 00907 } 00908 } 00909 SG_INFO("file successfully read\n"); 00910 fclose(f); 00911 } 00912 00913 SG_FREE(dummy); 00914 00915 if (do_sort_features) 00916 sort_features(); 00917 00918 return lab; 00919 } 00920 00921 template<class ST> void CSparseFeatures<ST>::sort_features() 00922 { 00923 if (m_subset) 00924 SG_ERROR("sort_features() not allowed with subset\n"); 00925 00926 ASSERT(get_num_preprocessors()==0); 00927 00928 if (!sparse_feature_matrix) 00929 SG_ERROR("Requires sparse feature matrix to be available in-memory\n"); 00930 00931 for (int32_t i=0; i<num_vectors; i++) 00932 { 00933 int32_t len=sparse_feature_matrix[i].num_feat_entries; 00934 00935 if (!len) 00936 continue; 00937 00938 SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features; 00939 int32_t* feat_idx=SG_MALLOC(int32_t, len); 00940 int32_t* orig_idx=SG_MALLOC(int32_t, len); 00941 00942 for (int j=0; j<len; j++) 00943 { 00944 feat_idx[j]=sf_orig[j].feat_index; 00945 orig_idx[j]=j; 00946 } 00947 00948 CMath::qsort_index(feat_idx, orig_idx, len); 00949 00950 SGSparseVectorEntry<ST>* sf_new= SG_MALLOC(SGSparseVectorEntry<ST>, len); 00951 for (int j=0; j<len; j++) 00952 sf_new[j]=sf_orig[orig_idx[j]]; 00953 00954 sparse_feature_matrix[i].features=sf_new; 00955 00956 // sanity check 00957 for (int j=0; j<len-1; j++) 00958 ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index); 00959 00960 SG_FREE(orig_idx); 00961 SG_FREE(feat_idx); 00962 SG_FREE(sf_orig); 00963 } 00964 } 00965 00966 template<class ST> bool CSparseFeatures<ST>::write_svmlight_file(char* fname, 00967 CLabels* label) 00968 { 00969 if (m_subset) 00970 SG_ERROR("write_svmlight_file() not allowed with subset\n"); 00971 00972 ASSERT(label); 00973 int32_t num=label->get_num_labels(); 00974 ASSERT(num>0); 00975 ASSERT(num==num_vectors); 00976 00977 FILE* f=fopen(fname, "wb"); 00978 00979 if (f) 00980 { 00981 for (int32_t i=0; i<num; i++) 00982 { 00983 fprintf(f, "%d ", (int32_t) label->get_int_label(i)); 00984 00985 SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features; 00986 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries; 00987 00988 for (int32_t j=0; j<num_feat; j++) 00989 { 00990 if (j<num_feat-1) 00991 fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry); 00992 else 00993 fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry); 00994 } 00995 } 00996 00997 fclose(f); 00998 return true; 00999 } 01000 return false; 01001 } 01002 01003 template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const 01004 { 01005 return num_features; 01006 } 01007 01008 template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1, 01009 CDotFeatures* df, int32_t vec_idx2) 01010 { 01011 ASSERT(df); 01012 ASSERT(df->get_feature_type() == get_feature_type()); 01013 ASSERT(df->get_feature_class() == get_feature_class()); 01014 CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df; 01015 01016 SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1); 01017 SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2); 01018 01019 float64_t result=sparse_dot(1, avec.features, avec.num_feat_entries, 01020 bvec.features, bvec.num_feat_entries); 01021 01022 free_sparse_feature_vector(avec, vec_idx1); 01023 sf->free_sparse_feature_vector(bvec, vec_idx2); 01024 01025 return result; 01026 } 01027 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len) 01028 { 01029 ASSERT(vec2); 01030 if (vec2_len!=num_features) 01031 { 01032 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n", 01033 vec2_len, num_features); 01034 } 01035 float64_t result=0; 01036 01037 SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1); 01038 01039 if (sv.features) 01040 { 01041 for (int32_t i=0; i<sv.num_feat_entries; i++) 01042 result+=vec2[sv.features[i].feat_index]*sv.features[i].entry; 01043 } 01044 01045 free_sparse_feature_vector(sv, vec_idx1); 01046 01047 return result; 01048 } 01049 01050 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index) 01051 { 01052 if (vector_index>=get_num_vectors()) 01053 { 01054 SG_ERROR("Index out of bounds (number of vectors %d, you " 01055 "requested %d)\n", get_num_vectors(), vector_index); 01056 } 01057 01058 if (!sparse_feature_matrix) 01059 SG_ERROR("Requires a in-memory feature matrix\n"); 01060 01061 sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1); 01062 it->sv=get_sparse_feature_vector(vector_index); 01063 it->index=0; 01064 01065 return it; 01066 } 01067 01068 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator) 01069 { 01070 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator; 01071 if (!it || it->index>=it->sv.num_feat_entries) 01072 return false; 01073 01074 int32_t i=it->index++; 01075 01076 index=it->sv.features[i].feat_index; 01077 value=(float64_t) it->sv.features[i].entry; 01078 01079 return true; 01080 } 01081 01082 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator) 01083 { 01084 if (!iterator) 01085 return; 01086 01087 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator; 01088 free_sparse_feature_vector(it->sv, it->sv.vec_index); 01089 SG_FREE(it); 01090 } 01091 01092 template<class ST> CFeatures* CSparseFeatures<ST>::copy_subset(SGVector<index_t> indices) 01093 { 01094 SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen, 01095 get_dim_feature_space()); 01096 01097 for (index_t i=0; i<indices.vlen; ++i) 01098 { 01099 /* index to copy */ 01100 index_t index=indices.vector[i]; 01101 01102 /* copy sparse vector TODO THINK ABOUT VECTOR INDEX (i or vec.index*/ 01103 SGSparseVector<ST> current=get_sparse_feature_vector(index); 01104 matrix_copy.sparse_matrix[i]=SGSparseVector<ST>( 01105 current.num_feat_entries, current.vec_index); 01106 01107 /* copy entries */ 01108 memcpy(matrix_copy.sparse_matrix[i].features, current.features, 01109 sizeof(SGSparseVectorEntry<ST>)*current.num_feat_entries); 01110 01111 free_sparse_feature_vector(current, index); 01112 } 01113 01114 return new CSparseFeatures<ST>(matrix_copy); 01115 } 01116 01117 template<class ST> SGSparseVectorEntry<ST>* CSparseFeatures<ST>::compute_sparse_feature_vector(int32_t num, 01118 int32_t& len, SGSparseVectorEntry<ST>* target) 01119 { 01120 SG_NOTIMPLEMENTED; 01121 01122 len=0; 01123 return NULL; 01124 } 01125 01126 template<class ST> void CSparseFeatures<ST>::init() 01127 { 01128 set_generic<ST>(); 01129 01130 m_parameters->add_vector(&sparse_feature_matrix, &num_vectors, 01131 "sparse_feature_matrix", 01132 "Array of sparse vectors."); 01133 m_parameters->add(&num_features, "num_features", 01134 "Total number of features."); 01135 } 01136 01137 #define GET_FEATURE_TYPE(sg_type, f_type) \ 01138 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() \ 01139 { \ 01140 return f_type; \ 01141 } 01142 GET_FEATURE_TYPE(bool, F_BOOL) 01143 GET_FEATURE_TYPE(char, F_CHAR) 01144 GET_FEATURE_TYPE(uint8_t, F_BYTE) 01145 GET_FEATURE_TYPE(int8_t, F_BYTE) 01146 GET_FEATURE_TYPE(int16_t, F_SHORT) 01147 GET_FEATURE_TYPE(uint16_t, F_WORD) 01148 GET_FEATURE_TYPE(int32_t, F_INT) 01149 GET_FEATURE_TYPE(uint32_t, F_UINT) 01150 GET_FEATURE_TYPE(int64_t, F_LONG) 01151 GET_FEATURE_TYPE(uint64_t, F_ULONG) 01152 GET_FEATURE_TYPE(float32_t, F_SHORTREAL) 01153 GET_FEATURE_TYPE(float64_t, F_DREAL) 01154 GET_FEATURE_TYPE(floatmax_t, F_LONGREAL) 01155 #undef GET_FEATURE_TYPE 01156 01157 #define LOAD(fname, sg_type) \ 01158 template<> void CSparseFeatures<sg_type>::load(CFile* loader) \ 01159 { \ 01160 remove_subset(); \ 01161 SG_SET_LOCALE_C; \ 01162 ASSERT(loader); \ 01163 SGSparseVector<sg_type>* matrix=NULL; \ 01164 int32_t num_feat=0; \ 01165 int32_t num_vec=0; \ 01166 loader->fname(matrix, num_feat, num_vec); \ 01167 set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \ 01168 SG_RESET_LOCALE; \ 01169 } 01170 LOAD(get_sparse_matrix, bool) 01171 LOAD(get_sparse_matrix, char) 01172 LOAD(get_sparse_matrix, uint8_t) 01173 LOAD(get_int8_sparsematrix, int8_t) 01174 LOAD(get_sparse_matrix, int16_t) 01175 LOAD(get_sparse_matrix, uint16_t) 01176 LOAD(get_sparse_matrix, int32_t) 01177 LOAD(get_uint_sparsematrix, uint32_t) 01178 LOAD(get_long_sparsematrix, int64_t) 01179 LOAD(get_ulong_sparsematrix, uint64_t) 01180 LOAD(get_sparse_matrix, float32_t) 01181 LOAD(get_sparse_matrix, float64_t) 01182 LOAD(get_longreal_sparsematrix, floatmax_t) 01183 #undef LOAD 01184 01185 #define WRITE(fname, sg_type) \ 01186 template<> void CSparseFeatures<sg_type>::save(CFile* writer) \ 01187 { \ 01188 if (m_subset) \ 01189 SG_ERROR("save() not allowed with subset\n"); \ 01190 SG_SET_LOCALE_C; \ 01191 ASSERT(writer); \ 01192 writer->fname(sparse_feature_matrix, num_features, num_vectors); \ 01193 SG_RESET_LOCALE; \ 01194 } 01195 WRITE(set_sparse_matrix, bool) 01196 WRITE(set_sparse_matrix, char) 01197 WRITE(set_sparse_matrix, uint8_t) 01198 WRITE(set_int8_sparsematrix, int8_t) 01199 WRITE(set_sparse_matrix, int16_t) 01200 WRITE(set_sparse_matrix, uint16_t) 01201 WRITE(set_sparse_matrix, int32_t) 01202 WRITE(set_uint_sparsematrix, uint32_t) 01203 WRITE(set_long_sparsematrix, int64_t) 01204 WRITE(set_ulong_sparsematrix, uint64_t) 01205 WRITE(set_sparse_matrix, float32_t) 01206 WRITE(set_sparse_matrix, float64_t) 01207 WRITE(set_longreal_sparsematrix, floatmax_t) 01208 #undef WRITE 01209 01210 template class CSparseFeatures<bool>; 01211 template class CSparseFeatures<char>; 01212 template class CSparseFeatures<int8_t>; 01213 template class CSparseFeatures<uint8_t>; 01214 template class CSparseFeatures<int16_t>; 01215 template class CSparseFeatures<uint16_t>; 01216 template class CSparseFeatures<int32_t>; 01217 template class CSparseFeatures<uint32_t>; 01218 template class CSparseFeatures<int64_t>; 01219 template class CSparseFeatures<uint64_t>; 01220 template class CSparseFeatures<float32_t>; 01221 template class CSparseFeatures<float64_t>; 01222 template class CSparseFeatures<floatmax_t>; 01223 }