SHOGUN
v1.1.0
|
00001 #include <shogun/features/StringFeatures.h> 00002 #include <shogun/preprocessor/Preprocessor.h> 00003 #include <shogun/preprocessor/StringPreprocessor.h> 00004 #include <shogun/io/MemoryMappedFile.h> 00005 #include <shogun/io/SGIO.h> 00006 #include <shogun/mathematics/Math.h> 00007 #include <shogun/base/Parameter.h> 00008 00009 #include <sys/types.h> 00010 #include <sys/stat.h> 00011 #include <dirent.h> 00012 #include <stdio.h> 00013 #include <stdlib.h> 00014 #include <unistd.h> 00015 00016 00017 namespace shogun 00018 { 00019 00020 template<class ST> CStringFeatures<ST>::CStringFeatures() : CFeatures(0) 00021 { 00022 init(); 00023 alphabet=new CAlphabet(); 00024 } 00025 00026 template<class ST> CStringFeatures<ST>::CStringFeatures(EAlphabet alpha) : CFeatures(0) 00027 { 00028 init(); 00029 00030 alphabet=new CAlphabet(alpha); 00031 SG_REF(alphabet); 00032 num_symbols=alphabet->get_num_symbols(); 00033 original_num_symbols=num_symbols; 00034 } 00035 00036 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha) 00037 : CFeatures(0) 00038 { 00039 init(); 00040 00041 alphabet=new CAlphabet(alpha); 00042 SG_REF(alphabet); 00043 num_symbols=alphabet->get_num_symbols(); 00044 original_num_symbols=num_symbols; 00045 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length); 00046 } 00047 00048 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha) 00049 : CFeatures(0) 00050 { 00051 init(); 00052 00053 alphabet=new CAlphabet(alpha); 00054 SG_REF(alphabet); 00055 num_symbols=alphabet->get_num_symbols(); 00056 original_num_symbols=num_symbols; 00057 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length); 00058 } 00059 00060 template<class ST> CStringFeatures<ST>::CStringFeatures(CAlphabet* alpha) 00061 : CFeatures(0) 00062 { 00063 init(); 00064 00065 ASSERT(alpha); 00066 SG_REF(alpha); 00067 alphabet=alpha; 00068 num_symbols=alphabet->get_num_symbols(); 00069 original_num_symbols=num_symbols; 00070 } 00071 00072 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig) 00073 : CFeatures(orig), num_vectors(orig.num_vectors), 00074 single_string(orig.single_string), 00075 length_of_single_string(orig.length_of_single_string), 00076 max_string_length(orig.max_string_length), 00077 num_symbols(orig.num_symbols), 00078 original_num_symbols(orig.original_num_symbols), 00079 order(orig.order), preprocess_on_get(false), 00080 feature_cache(NULL) 00081 { 00082 init(); 00083 00084 ASSERT(orig.single_string == NULL); //not implemented 00085 00086 alphabet=orig.alphabet; 00087 SG_REF(alphabet); 00088 00089 if (orig.features) 00090 { 00091 features=SG_MALLOC(SGString<ST>, orig.num_vectors); 00092 00093 for (int32_t i=0; i<num_vectors; i++) 00094 { 00095 features[i].string=SG_MALLOC(ST, orig.features[i].slen); 00096 features[i].slen=orig.features[i].slen; 00097 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen); 00098 } 00099 } 00100 00101 if (orig.symbol_mask_table) 00102 { 00103 symbol_mask_table=SG_MALLOC(ST, 256); 00104 for (int32_t i=0; i<256; i++) 00105 symbol_mask_table[i]=orig.symbol_mask_table[i]; 00106 } 00107 00108 m_subset=orig.m_subset->duplicate(); 00109 } 00110 00111 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha) 00112 : CFeatures(loader), num_vectors(0), 00113 features(NULL), single_string(NULL), length_of_single_string(0), 00114 max_string_length(0), order(0), 00115 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL) 00116 { 00117 init(); 00118 00119 alphabet=new CAlphabet(alpha); 00120 SG_REF(alphabet); 00121 num_symbols=alphabet->get_num_symbols(); 00122 original_num_symbols=num_symbols; 00123 load(loader); 00124 } 00125 00126 template<class ST> CStringFeatures<ST>::~CStringFeatures() 00127 { 00128 cleanup(); 00129 00130 SG_UNREF(alphabet); 00131 } 00132 00133 template<class ST> void CStringFeatures<ST>::cleanup() 00134 { 00135 remove_subset(); 00136 00137 if (single_string) 00138 { 00139 SG_FREE(single_string); 00140 single_string=NULL; 00141 } 00142 else 00143 cleanup_feature_vectors(0, num_vectors-1); 00144 00145 num_vectors=0; 00146 SG_FREE(features); 00147 SG_FREE(symbol_mask_table); 00148 features=NULL; 00149 symbol_mask_table=NULL; 00150 00151 /* start with a fresh alphabet, but instead of emptying the histogram 00152 * create a new object (to leave the alphabet object alone if it is used 00153 * by others) 00154 */ 00155 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00156 SG_UNREF(alphabet); 00157 alphabet=alpha; 00158 SG_REF(alphabet); 00159 } 00160 00161 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num) 00162 { 00163 ASSERT(num<get_num_vectors()); 00164 00165 if (features) 00166 { 00167 int32_t real_num=subset_idx_conversion(num); 00168 SG_FREE(features[real_num].string); 00169 features[real_num].string=NULL; 00170 features[real_num].slen=0; 00171 00172 determine_maximum_string_length(); 00173 } 00174 } 00175 00176 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop) 00177 { 00178 if (features && get_num_vectors()) 00179 { 00180 ASSERT(start<get_num_vectors()); 00181 ASSERT(stop<get_num_vectors()); 00182 00183 for (int32_t i=start; i<=stop; i++) 00184 { 00185 int32_t real_num=subset_idx_conversion(i); 00186 SG_FREE(features[real_num].string); 00187 features[real_num].string=NULL; 00188 features[real_num].slen=0; 00189 } 00190 determine_maximum_string_length(); 00191 } 00192 } 00193 00194 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() { return C_STRING; } 00195 00196 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() { return F_UNKNOWN; } 00197 00198 template<class ST> CAlphabet* CStringFeatures<ST>::get_alphabet() 00199 { 00200 SG_REF(alphabet); 00201 return alphabet; 00202 } 00203 00204 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const 00205 { 00206 return new CStringFeatures<ST>(*this); 00207 } 00208 00209 template<class ST> SGVector<ST> CStringFeatures<ST>::get_feature_vector(int32_t num) 00210 { 00211 ASSERT(features); 00212 if (num>=get_num_vectors()) 00213 { 00214 SG_ERROR("Index out of bounds (number of strings %d, you " 00215 "requested %d)\n", get_num_vectors(), num); 00216 } 00217 00218 int32_t l; 00219 bool free_vec; 00220 ST* vec=get_feature_vector(num, l, free_vec); 00221 ST* dst=SG_MALLOC(ST, l); 00222 memcpy(dst, vec, l*sizeof(ST)); 00223 free_feature_vector(vec, num, free_vec); 00224 return SGVector<ST>(dst, l); 00225 } 00226 00227 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num) 00228 { 00229 ASSERT(features); 00230 00231 if (m_subset) 00232 SG_ERROR("A subset is set, cannot set feature vector\n"); 00233 00234 if (num>=num_vectors) 00235 { 00236 SG_ERROR("Index out of bounds (number of strings %d, you " 00237 "requested %d)\n", num_vectors, num); 00238 } 00239 00240 if (vector.vlen<=0) 00241 SG_ERROR("String has zero or negative length\n"); 00242 00243 cleanup_feature_vector(num); 00244 features[num].slen=vector.vlen; 00245 features[num].string=SG_MALLOC(ST, vector.vlen); 00246 memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST)); 00247 00248 determine_maximum_string_length(); 00249 } 00250 00251 template<class ST> void CStringFeatures<ST>::enable_on_the_fly_preprocessing() 00252 { 00253 preprocess_on_get=true; 00254 } 00255 00256 template<class ST> void CStringFeatures<ST>::disable_on_the_fly_preprocessing() 00257 { 00258 preprocess_on_get=false; 00259 } 00260 00261 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree) 00262 { 00263 ASSERT(features); 00264 ASSERT(num<get_num_vectors()); 00265 00266 00267 int32_t real_num=subset_idx_conversion(num); 00268 00269 if (!preprocess_on_get) 00270 { 00271 dofree=false; 00272 len=features[real_num].slen; 00273 return features[real_num].string; 00274 } 00275 else 00276 { 00277 SG_DEBUG( "computing feature vector!\n") ; 00278 ST* feat=compute_feature_vector(num, len); 00279 dofree=true; 00280 00281 if (get_num_preprocessors()) 00282 { 00283 ST* tmp_feat_before=feat; 00284 00285 for (int32_t i=0; i<get_num_preprocessors(); i++) 00286 { 00287 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i); 00288 feat=p->apply_to_string(tmp_feat_before, len); 00289 SG_UNREF(p); 00290 SG_FREE(tmp_feat_before); 00291 tmp_feat_before=feat; 00292 } 00293 } 00294 // TODO: implement caching 00295 return feat; 00296 } 00297 } 00298 00299 template<class ST> CStringFeatures<ST>* CStringFeatures<ST>::get_transposed() 00300 { 00301 int32_t num_feat; 00302 int32_t num_vec; 00303 SGString<ST>* s=get_transposed(num_feat, num_vec); 00304 SGStringList<ST> string_list; 00305 string_list.strings = s; 00306 string_list.num_strings = num_vec; 00307 string_list.max_string_length = num_feat; 00308 00309 return new CStringFeatures<ST>(string_list, alphabet); 00310 } 00311 00312 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec) 00313 { 00314 num_feat=get_num_vectors(); 00315 num_vec=get_max_vector_length(); 00316 ASSERT(have_same_length()); 00317 00318 SG_DEBUG("Allocating memory for transposed string features of size %ld\n", 00319 int64_t(num_feat)*num_vec); 00320 00321 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec); 00322 00323 for (int32_t i=0; i<num_vec; i++) 00324 { 00325 sf[i].string=SG_MALLOC(ST, num_feat); 00326 sf[i].slen=num_feat; 00327 } 00328 00329 for (int32_t i=0; i<num_feat; i++) 00330 { 00331 int32_t len=0; 00332 bool free_vec=false; 00333 ST* vec=get_feature_vector(i, len, free_vec); 00334 00335 for (int32_t j=0; j<num_vec; j++) 00336 sf[j].string[i]=vec[j]; 00337 00338 free_feature_vector(vec, i, free_vec); 00339 } 00340 return sf; 00341 } 00342 00343 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree) 00344 { 00345 if (num>=get_num_vectors()) 00346 { 00347 SG_ERROR( 00348 "Trying to access string[%d] but num_str=%d\n", num, 00349 get_num_vectors()); 00350 } 00351 00352 int32_t real_num=subset_idx_conversion(num); 00353 00354 if (feature_cache) 00355 feature_cache->unlock_entry(real_num); 00356 00357 if (dofree) 00358 SG_FREE(feat_vec); 00359 } 00360 00361 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num) 00362 { 00363 if (num>=get_num_vectors()) 00364 { 00365 SG_ERROR( 00366 "Trying to access string[%d] but num_str=%d\n", num, 00367 get_num_vectors()); 00368 } 00369 00370 int32_t real_num=subset_idx_conversion(num); 00371 00372 if (feature_cache) 00373 feature_cache->unlock_entry(real_num); 00374 00375 if (feat_vec.do_free) 00376 SG_FREE(feat_vec.vector); 00377 } 00378 00379 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num) 00380 { 00381 ASSERT(vec_num<get_num_vectors()); 00382 00383 int32_t len; 00384 bool free_vec; 00385 ST* vec=get_feature_vector(vec_num, len, free_vec); 00386 ASSERT(feat_num<len); 00387 ST result=vec[feat_num]; 00388 free_feature_vector(vec, vec_num, free_vec); 00389 00390 return result; 00391 } 00392 00393 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num) 00394 { 00395 ASSERT(vec_num<get_num_vectors()); 00396 00397 int32_t len; 00398 bool free_vec; 00399 ST* vec=get_feature_vector(vec_num, len, free_vec); 00400 free_feature_vector(vec, vec_num, free_vec); 00401 return len; 00402 } 00403 00404 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length() 00405 { 00406 return max_string_length; 00407 } 00408 00409 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const 00410 { 00411 return m_subset ? m_subset->get_size() : num_vectors; 00412 } 00413 00414 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; } 00415 00416 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); } 00417 00418 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; } 00419 00420 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; } 00421 00422 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask) 00423 { 00424 ASSERT(symbol_mask_table); 00425 return symbol_mask_table[mask] & symbol; 00426 } 00427 00428 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount) 00429 { 00430 ASSERT(alphabet); 00431 return (offset << (amount*alphabet->get_num_bits())); 00432 } 00433 00434 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount) 00435 { 00436 ASSERT(alphabet); 00437 return (symbol >> (amount*alphabet->get_num_bits())); 00438 } 00439 00440 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin, 00441 EAlphabet ascii_alphabet, EAlphabet binary_alphabet) 00442 { 00443 remove_subset(); 00444 00445 size_t blocksize=1024*1024; 00446 size_t required_blocksize=0; 00447 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); 00448 uint8_t* overflow=NULL; 00449 int32_t overflow_len=0; 00450 00451 cleanup(); 00452 00453 CAlphabet* alpha=new CAlphabet(ascii_alphabet); 00454 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet); 00455 00456 FILE* f=fopen(fname, "ro"); 00457 00458 if (f) 00459 { 00460 num_vectors=0; 00461 max_string_length=0; 00462 00463 SG_INFO("counting line numbers in file %s\n", fname); 00464 size_t block_offs=0; 00465 size_t old_block_offs=0; 00466 fseek(f, 0, SEEK_END); 00467 size_t fsize=ftell(f); 00468 rewind(f); 00469 00470 if (blocksize>fsize) 00471 blocksize=fsize; 00472 00473 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize); 00474 00475 size_t sz=blocksize; 00476 while (sz == blocksize) 00477 { 00478 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00479 for (size_t i=0; i<sz; i++) 00480 { 00481 block_offs++; 00482 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00483 { 00484 num_vectors++; 00485 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs); 00486 old_block_offs=block_offs; 00487 } 00488 } 00489 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 00490 } 00491 00492 SG_INFO("found %d strings\n", num_vectors); 00493 SG_FREE(dummy); 00494 blocksize=required_blocksize; 00495 dummy=SG_MALLOC(uint8_t, blocksize); 00496 overflow=SG_MALLOC(uint8_t, blocksize); 00497 features=SG_MALLOC(SGString<ST>, num_vectors); 00498 00499 rewind(f); 00500 sz=blocksize; 00501 int32_t lines=0; 00502 while (sz == blocksize) 00503 { 00504 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00505 00506 size_t old_sz=0; 00507 for (size_t i=0; i<sz; i++) 00508 { 00509 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00510 { 00511 int32_t len=i-old_sz; 00512 //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz); 00513 max_string_length=CMath::max(max_string_length, len+overflow_len); 00514 00515 features[lines].slen=len; 00516 features[lines].string=SG_MALLOC(ST, len); 00517 00518 if (remap_to_bin) 00519 { 00520 for (int32_t j=0; j<overflow_len; j++) 00521 features[lines].string[j]=alpha->remap_to_bin(overflow[j]); 00522 for (int32_t j=0; j<len; j++) 00523 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]); 00524 alpha->add_string_to_histogram(&dummy[old_sz], len); 00525 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen); 00526 } 00527 else 00528 { 00529 for (int32_t j=0; j<overflow_len; j++) 00530 features[lines].string[j]=overflow[j]; 00531 for (int32_t j=0; j<len; j++) 00532 features[lines].string[j+overflow_len]=dummy[old_sz+j]; 00533 alpha->add_string_to_histogram(&dummy[old_sz], len); 00534 alpha->add_string_to_histogram(features[lines].string, features[lines].slen); 00535 } 00536 00537 // clear overflow 00538 overflow_len=0; 00539 00540 //CMath::display_vector(features[lines].string, len); 00541 old_sz=i+1; 00542 lines++; 00543 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t"); 00544 } 00545 } 00546 for (size_t i=old_sz; i<sz; i++) 00547 overflow[i-old_sz]=dummy[i]; 00548 00549 overflow_len=sz-old_sz; 00550 } 00551 00552 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00553 { 00554 SG_INFO("file successfully read\n"); 00555 SG_INFO("max_string_length=%d\n", max_string_length); 00556 SG_INFO("num_strings=%d\n", num_vectors); 00557 } 00558 fclose(f); 00559 } 00560 00561 SG_FREE(dummy); 00562 00563 SG_UNREF(alphabet); 00564 00565 if (remap_to_bin) 00566 alphabet=alpha_bin; 00567 else 00568 alphabet=alpha; 00569 SG_REF(alphabet); 00570 num_symbols=alphabet->get_num_symbols(); 00571 } 00572 00573 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid) 00574 { 00575 remove_subset(); 00576 00577 int32_t i=0; 00578 uint64_t len=0; 00579 uint64_t offs=0; 00580 int32_t num=0; 00581 int32_t max_len=0; 00582 00583 CMemoryMappedFile<char> f(fname); 00584 00585 while (true) 00586 { 00587 char* s=f.get_line(len, offs); 00588 if (!s) 00589 break; 00590 00591 if (len>0 && s[0]=='>') 00592 num++; 00593 } 00594 00595 if (num==0) 00596 SG_ERROR("No fasta hunks (lines starting with '>') found\n"); 00597 00598 cleanup(); 00599 SG_UNREF(alphabet); 00600 alphabet=new CAlphabet(DNA); 00601 num_symbols=alphabet->get_num_symbols(); 00602 00603 SGString<ST>* strings=SG_MALLOC(SGString<ST>, num); 00604 offs=0; 00605 00606 for (i=0;i<num; i++) 00607 { 00608 uint64_t id_len=0; 00609 char* id=f.get_line(id_len, offs); 00610 00611 char* fasta=f.get_line(len, offs); 00612 char* s=fasta; 00613 int32_t fasta_len=0; 00614 int32_t spanned_lines=0; 00615 00616 while (true) 00617 { 00618 if (!s || len==0) 00619 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len); 00620 00621 if (s[0]=='>' || offs==f.get_size()) 00622 { 00623 offs-=len+1; // seek to beginning 00624 if (offs==f.get_size()) 00625 { 00626 SG_DEBUG("at EOF\n"); 00627 fasta_len+=len; 00628 } 00629 00630 len=fasta_len-spanned_lines; 00631 strings[i].string=SG_MALLOC(ST, len); 00632 strings[i].slen=len; 00633 00634 ST* str=strings[i].string; 00635 int32_t idx=0; 00636 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines); 00637 00638 for (int32_t j=0; j<fasta_len; j++) 00639 { 00640 if (fasta[j]=='\n') 00641 continue; 00642 00643 ST c=(ST) fasta[j]; 00644 00645 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j])) 00646 c=(ST) 'A'; 00647 00648 if (uint64_t(idx)>=len) 00649 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str); 00650 str[idx++]=c; 00651 } 00652 max_len=CMath::max(max_len, strings[i].slen); 00653 00654 00655 break; 00656 } 00657 00658 spanned_lines++; 00659 fasta_len+=len+1; // including '\n' 00660 s=f.get_line(len, offs); 00661 } 00662 } 00663 return set_features(strings, num, max_len); 00664 } 00665 00666 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname, 00667 bool ignore_invalid, bool bitremap_in_single_string) 00668 { 00669 remove_subset(); 00670 00671 CMemoryMappedFile<char> f(fname); 00672 00673 int32_t i=0; 00674 uint64_t len=0; 00675 uint64_t offs=0; 00676 00677 int32_t num=f.get_num_lines(); 00678 int32_t max_len=0; 00679 00680 if (num%4) 00681 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n"); 00682 num/=4; 00683 00684 cleanup(); 00685 SG_UNREF(alphabet); 00686 alphabet=new CAlphabet(DNA); 00687 00688 SGString<ST>* strings; 00689 00690 ST* str=NULL; 00691 if (bitremap_in_single_string) 00692 { 00693 strings=SG_MALLOC(SGString<ST>, 1); 00694 strings[0].string=SG_MALLOC(ST, num); 00695 strings[0].slen=num; 00696 f.get_line(len, offs); 00697 f.get_line(len, offs); 00698 order=len; 00699 max_len=num; 00700 offs=0; 00701 original_num_symbols=alphabet->get_num_symbols(); 00702 str=SG_MALLOC(ST, len); 00703 } 00704 else 00705 strings=SG_MALLOC(SGString<ST>, num); 00706 00707 for (i=0;i<num; i++) 00708 { 00709 if (!f.get_line(len, offs)) 00710 SG_ERROR("Error reading 'read' identifier in line %d", 4*i); 00711 00712 char* s=f.get_line(len, offs); 00713 if (!s || len==0) 00714 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len); 00715 00716 if (bitremap_in_single_string) 00717 { 00718 if (len!=(uint64_t) order) 00719 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len); 00720 for (int32_t j=0; j<order; j++) 00721 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]); 00722 00723 strings[0].string[i]=embed_word(str, order); 00724 } 00725 else 00726 { 00727 strings[i].string=SG_MALLOC(ST, len); 00728 strings[i].slen=len; 00729 str=strings[i].string; 00730 00731 if (ignore_invalid) 00732 { 00733 for (uint64_t j=0; j<len; j++) 00734 { 00735 if (alphabet->is_valid((uint8_t) s[j])) 00736 str[j]= (ST) s[j]; 00737 else 00738 str[j]= (ST) 'A'; 00739 } 00740 } 00741 else 00742 { 00743 for (uint64_t j=0; j<len; j++) 00744 str[j]= (ST) s[j]; 00745 } 00746 max_len=CMath::max(max_len, (int32_t) len); 00747 } 00748 00749 00750 if (!f.get_line(len, offs)) 00751 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2); 00752 00753 if (!f.get_line(len, offs)) 00754 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3); 00755 } 00756 00757 if (bitremap_in_single_string) 00758 num=1; 00759 00760 num_vectors=num; 00761 max_string_length=max_len; 00762 features=strings; 00763 00764 return true; 00765 } 00766 00767 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname) 00768 { 00769 remove_subset(); 00770 00771 struct dirent **namelist; 00772 int32_t n; 00773 00774 SGIO::set_dirname(dirname); 00775 00776 SG_DEBUG("dirname '%s'\n", dirname); 00777 00778 n=scandir(dirname, &namelist, &SGIO::filter, alphasort); 00779 if (n <= 0) 00780 { 00781 SG_ERROR("error calling scandir - no files found\n"); 00782 return false; 00783 } 00784 else 00785 { 00786 SGString<ST>* strings=NULL; 00787 00788 int32_t num=0; 00789 int32_t max_len=-1; 00790 00791 //usually n==num_vec, but it might not in race conditions 00792 //(file perms modified, file erased) 00793 strings=SG_MALLOC(SGString<ST>, n); 00794 00795 for (int32_t i=0; i<n; i++) 00796 { 00797 char* fname=SGIO::concat_filename(namelist[i]->d_name); 00798 00799 struct stat s; 00800 off_t filesize=0; 00801 00802 if (!stat(fname, &s) && s.st_size>0) 00803 { 00804 filesize=s.st_size/sizeof(ST); 00805 00806 FILE* f=fopen(fname, "ro"); 00807 if (f) 00808 { 00809 ST* str=SG_MALLOC(ST, filesize); 00810 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize); 00811 if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize) 00812 SG_ERROR("failed to read file\n"); 00813 strings[num].string=str; 00814 strings[num].slen=filesize; 00815 max_len=CMath::max(max_len, strings[num].slen); 00816 00817 num++; 00818 fclose(f); 00819 } 00820 } 00821 else 00822 SG_ERROR("empty or non readable file \'%s\'\n", fname); 00823 00824 SG_FREE(namelist[i]); 00825 } 00826 SG_FREE(namelist); 00827 00828 if (num>0 && strings) 00829 { 00830 set_features(strings, num, max_len); 00831 return true; 00832 } 00833 } 00834 return false; 00835 } 00836 00837 template<class ST> void CStringFeatures<ST>::set_features(SGStringList<ST> feats) 00838 { 00839 set_features(feats.strings, feats.num_strings, feats.max_string_length); 00840 } 00841 00842 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 00843 { 00844 if (m_subset) 00845 SG_ERROR("Cannot call set_features() with subset.\n"); 00846 00847 if (p_features) 00848 { 00849 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00850 00851 //compute histogram for char/byte 00852 for (int32_t i=0; i<p_num_vectors; i++) 00853 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen); 00854 00855 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()); 00856 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()); 00857 00858 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00859 { 00860 cleanup(); 00861 SG_UNREF(alphabet); 00862 00863 alphabet=alpha; 00864 SG_REF(alphabet); 00865 00866 features=p_features; 00867 num_vectors=p_num_vectors; 00868 max_string_length=p_max_string_length; 00869 00870 return true; 00871 } 00872 else 00873 SG_UNREF(alpha); 00874 } 00875 00876 return false; 00877 } 00878 00879 template<class ST> bool CStringFeatures<ST>::append_features(CStringFeatures<ST>* sf) 00880 { 00881 ASSERT(sf); 00882 00883 if (m_subset) 00884 SG_ERROR("Cannot call set_features() with subset.\n"); 00885 00886 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors()); 00887 00888 index_t sf_num_str=sf->get_num_vectors(); 00889 for (int32_t i=0; i<sf_num_str; i++) 00890 { 00891 int32_t real_i = sf->subset_idx_conversion(i); 00892 int32_t length=sf->features[real_i].slen; 00893 new_features[i].string=SG_MALLOC(ST, length); 00894 memcpy(new_features[i].string, sf->features[real_i].string, length); 00895 new_features[i].slen=length; 00896 } 00897 return append_features(new_features, sf_num_str, 00898 sf->max_string_length); 00899 } 00900 00901 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 00902 { 00903 if (m_subset) 00904 SG_ERROR("Cannot call set_features() with subset.\n"); 00905 00906 if (!features) 00907 return set_features(p_features, p_num_vectors, p_max_string_length); 00908 00909 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00910 00911 //compute histogram for char/byte 00912 for (int32_t i=0; i<p_num_vectors; i++) 00913 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen); 00914 00915 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()); 00916 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()); 00917 00918 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00919 { 00920 SG_UNREF(alpha); 00921 for (int32_t i=0; i<p_num_vectors; i++) 00922 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen); 00923 00924 int32_t old_num_vectors=num_vectors; 00925 num_vectors=old_num_vectors+p_num_vectors; 00926 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors); 00927 00928 for (int32_t i=0; i<num_vectors; i++) 00929 { 00930 if (i<old_num_vectors) 00931 { 00932 new_features[i].string=features[i].string; 00933 new_features[i].slen=features[i].slen; 00934 } 00935 else 00936 { 00937 new_features[i].string=p_features[i-old_num_vectors].string; 00938 new_features[i].slen=p_features[i-old_num_vectors].slen; 00939 } 00940 } 00941 SG_FREE(features); 00942 SG_FREE(p_features); // free now obsolete features 00943 00944 this->features=new_features; 00945 max_string_length=CMath::max(max_string_length, p_max_string_length); 00946 00947 return true; 00948 } 00949 SG_UNREF(alpha); 00950 00951 return false; 00952 } 00953 00954 template<class ST> SGStringList<ST> CStringFeatures<ST>::get_features() 00955 { 00956 SGStringList<ST> sl; 00957 00958 sl.strings=get_features(sl.num_strings, sl.max_string_length); 00959 return sl; 00960 } 00961 00962 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len) 00963 { 00964 if (m_subset) 00965 SG_ERROR("get features() is not possible on subset"); 00966 00967 num_str=num_vectors; 00968 max_str_len=max_string_length; 00969 return features; 00970 } 00971 00972 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len) 00973 { 00974 ASSERT(num_vectors>0); 00975 00976 num_str=get_num_vectors(); 00977 max_str_len=max_string_length; 00978 SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str); 00979 00980 for (int32_t i=0; i<num_str; i++) 00981 { 00982 int32_t len; 00983 bool free_vec; 00984 ST* vec=get_feature_vector(i, len, free_vec); 00985 new_feat[i].string=SG_MALLOC(ST, len); 00986 new_feat[i].slen=len; 00987 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST)); 00988 free_feature_vector(vec, i, free_vec); 00989 } 00990 00991 return new_feat; 00992 } 00993 00994 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str) 00995 { 00996 int32_t num_vec; 00997 int32_t max_str_len; 00998 *dst=copy_features(num_vec, max_str_len); 00999 *num_str=num_vec; 01000 } 01001 01002 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress) 01003 { 01004 remove_subset(); 01005 01006 FILE* file=NULL; 01007 01008 if (!(file=fopen(src, "r"))) 01009 return false; 01010 cleanup(); 01011 01012 // header shogun v0 01013 char id[4]; 01014 if (fread(&id[0], sizeof(char), 1, file)!=1) 01015 SG_ERROR("failed to read header"); 01016 ASSERT(id[0]=='S'); 01017 if (fread(&id[1], sizeof(char), 1, file)!=1) 01018 SG_ERROR("failed to read header"); 01019 ASSERT(id[1]=='G'); 01020 if (fread(&id[2], sizeof(char), 1, file)!=1) 01021 SG_ERROR("failed to read header"); 01022 ASSERT(id[2]=='V'); 01023 if (fread(&id[3], sizeof(char), 1, file)!=1) 01024 SG_ERROR("failed to read header"); 01025 ASSERT(id[3]=='0'); 01026 01027 //compression type 01028 uint8_t c; 01029 if (fread(&c, sizeof(uint8_t), 1, file)!=1) 01030 SG_ERROR("failed to read compression type"); 01031 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c); 01032 //alphabet 01033 uint8_t a; 01034 delete alphabet; 01035 if (fread(&a, sizeof(uint8_t), 1, file)!=1) 01036 SG_ERROR("failed to read compression alphabet"); 01037 alphabet=new CAlphabet((EAlphabet) a); 01038 // number of vectors 01039 if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1) 01040 SG_ERROR("failed to read compression number of vectors"); 01041 ASSERT(num_vectors>0); 01042 // maximum string length 01043 if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1) 01044 SG_ERROR("failed to read maximum string length"); 01045 ASSERT(max_string_length>0); 01046 01047 features=SG_MALLOC(SGString<ST>, num_vectors); 01048 01049 // vectors 01050 for (int32_t i=0; i<num_vectors; i++) 01051 { 01052 // vector len compressed 01053 int32_t len_compressed; 01054 if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1) 01055 SG_ERROR("failed to read vector length compressed"); 01056 // vector len uncompressed 01057 int32_t len_uncompressed; 01058 if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1) 01059 SG_ERROR("failed to read vector length uncompressed"); 01060 01061 // vector raw data 01062 if (decompress) 01063 { 01064 features[i].string=SG_MALLOC(ST, len_uncompressed); 01065 features[i].slen=len_uncompressed; 01066 uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed); 01067 if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed) 01068 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed); 01069 uint64_t uncompressed_size=len_uncompressed; 01070 uncompressed_size*=sizeof(ST); 01071 compressor->decompress(compressed, len_compressed, 01072 (uint8_t*) features[i].string, uncompressed_size); 01073 SG_FREE(compressed); 01074 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST)); 01075 } 01076 else 01077 { 01078 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST)); 01079 features[i].string=SG_MALLOC(ST, len_compressed+offs); 01080 features[i].slen=len_compressed+offs; 01081 int32_t* feat32ptr=((int32_t*) (features[i].string)); 01082 memset(features[i].string, 0, offs*sizeof(ST)); 01083 feat32ptr[0]=(int32_t) len_compressed; 01084 feat32ptr[1]=(int32_t) len_uncompressed; 01085 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]); 01086 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed) 01087 SG_ERROR("failed to read uncompressed data"); 01088 } 01089 } 01090 01091 delete compressor; 01092 fclose(file); 01093 01094 return false; 01095 } 01096 01097 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level) 01098 { 01099 if (m_subset) 01100 SG_ERROR("save_compressed() is not possible on subset"); 01101 01102 FILE* file=NULL; 01103 01104 if (!(file=fopen(dest, "wb"))) 01105 return false; 01106 01107 CCompressor* compressor= new CCompressor(compression); 01108 01109 // header shogun v0 01110 const char* id="SGV0"; 01111 fwrite(&id[0], sizeof(char), 1, file); 01112 fwrite(&id[1], sizeof(char), 1, file); 01113 fwrite(&id[2], sizeof(char), 1, file); 01114 fwrite(&id[3], sizeof(char), 1, file); 01115 01116 //compression type 01117 uint8_t c=(uint8_t) compression; 01118 fwrite(&c, sizeof(uint8_t), 1, file); 01119 //alphabet 01120 uint8_t a=(uint8_t) alphabet->get_alphabet(); 01121 fwrite(&a, sizeof(uint8_t), 1, file); 01122 // number of vectors 01123 fwrite(&num_vectors, sizeof(int32_t), 1, file); 01124 // maximum string length 01125 fwrite(&max_string_length, sizeof(int32_t), 1, file); 01126 01127 // vectors 01128 for (int32_t i=0; i<num_vectors; i++) 01129 { 01130 int32_t len=-1; 01131 bool vfree; 01132 ST* vec=get_feature_vector(i, len, vfree); 01133 01134 uint8_t* compressed=NULL; 01135 uint64_t compressed_size=0; 01136 01137 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST), 01138 compressed, compressed_size, level); 01139 01140 int32_t len_compressed=(int32_t) compressed_size; 01141 // vector len compressed in bytes 01142 fwrite(&len_compressed, sizeof(int32_t), 1, file); 01143 // vector len uncompressed in number of elements of type ST 01144 fwrite(&len, sizeof(int32_t), 1, file); 01145 // vector raw data 01146 fwrite(compressed, compressed_size, 1, file); 01147 SG_FREE(compressed); 01148 01149 free_feature_vector(vec, i, vfree); 01150 } 01151 01152 delete compressor; 01153 fclose(file); 01154 return true; 01155 } 01156 01157 template<class ST> int32_t CStringFeatures<ST>::get_size() { return sizeof(ST); } 01158 01159 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing) 01160 { 01161 SG_DEBUG( "force: %d\n", force_preprocessing); 01162 01163 for (int32_t i=0; i<get_num_preprocessors(); i++) 01164 { 01165 if ( (!is_preprocessed(i) || force_preprocessing) ) 01166 { 01167 set_preprocessed(i); 01168 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i); 01169 SG_INFO( "preprocessing using preproc %s\n", p->get_name()); 01170 01171 if (!p->apply_to_string_features(this)) 01172 { 01173 SG_UNREF(p); 01174 return false; 01175 } 01176 else 01177 SG_UNREF(p); 01178 } 01179 } 01180 return true; 01181 } 01182 01183 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip) 01184 { 01185 if (m_subset) 01186 SG_NOTIMPLEMENTED; 01187 01188 ASSERT(step_size>0); 01189 ASSERT(window_size>0); 01190 ASSERT(num_vectors==1 || single_string); 01191 ASSERT(max_string_length>=window_size || 01192 (single_string && length_of_single_string>=window_size)); 01193 01194 //in case we are dealing with a single remapped string 01195 //allow remapping 01196 if (single_string) 01197 num_vectors= (length_of_single_string-window_size)/step_size + 1; 01198 else if (num_vectors==1) 01199 { 01200 num_vectors= (max_string_length-window_size)/step_size + 1; 01201 length_of_single_string=max_string_length; 01202 } 01203 01204 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors); 01205 int32_t offs=0; 01206 for (int32_t i=0; i<num_vectors; i++) 01207 { 01208 f[i].string=&features[0].string[offs+skip]; 01209 f[i].slen=window_size-skip; 01210 offs+=step_size; 01211 } 01212 single_string=features[0].string; 01213 SG_FREE(features); 01214 features=f; 01215 max_string_length=window_size-skip; 01216 01217 return num_vectors; 01218 } 01219 01220 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, 01221 int32_t skip) 01222 { 01223 if (m_subset) 01224 SG_NOTIMPLEMENTED; 01225 01226 ASSERT(positions); 01227 ASSERT(window_size>0); 01228 ASSERT(num_vectors==1 || single_string); 01229 ASSERT(max_string_length>=window_size || 01230 (single_string && length_of_single_string>=window_size)); 01231 01232 num_vectors= positions->get_num_elements(); 01233 ASSERT(num_vectors>0); 01234 01235 int32_t len; 01236 01237 //in case we are dealing with a single remapped string 01238 //allow remapping 01239 if (single_string) 01240 len=length_of_single_string; 01241 else 01242 { 01243 single_string=features[0].string; 01244 len=max_string_length; 01245 length_of_single_string=max_string_length; 01246 } 01247 01248 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors); 01249 for (int32_t i=0; i<num_vectors; i++) 01250 { 01251 int32_t p=positions->get_element(i); 01252 01253 if (p>=0 && p<=len-window_size) 01254 { 01255 f[i].string=&features[0].string[p+skip]; 01256 f[i].slen=window_size-skip; 01257 } 01258 else 01259 { 01260 num_vectors=1; 01261 max_string_length=len; 01262 features[0].slen=len; 01263 single_string=NULL; 01264 SG_FREE(f); 01265 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n", 01266 window_size, i, p, len); 01267 return -1; 01268 } 01269 } 01270 01271 SG_FREE(features); 01272 features=f; 01273 max_string_length=window_size-skip; 01274 01275 return num_vectors; 01276 } 01277 01278 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01279 { 01280 return obtain_from_char_features(sf, start, p_order, gap, rev); 01281 } 01282 01283 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len) 01284 { 01285 if (len!=-1) 01286 { 01287 if (len!=max_string_length) 01288 return false; 01289 } 01290 len=max_string_length; 01291 01292 index_t num_str=get_num_vectors(); 01293 for (int32_t i=0; i<num_str; i++) 01294 { 01295 if (get_vector_length(i)!=len) 01296 return false; 01297 } 01298 01299 return true; 01300 } 01301 01302 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order) 01303 { 01304 if (m_subset) 01305 SG_NOTIMPLEMENTED; 01306 01307 ASSERT(alphabet->get_num_symbols_in_histogram() > 0); 01308 01309 order=p_order; 01310 original_num_symbols=alphabet->get_num_symbols(); 01311 int32_t max_val=alphabet->get_num_bits(); 01312 01313 if (p_order>1) 01314 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order); 01315 else 01316 num_symbols=original_num_symbols; 01317 01318 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols); 01319 01320 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 01321 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val); 01322 01323 ST mask=0; 01324 for (int32_t i=0; i<p_order*max_val; i++) 01325 mask= (mask<<1) | ((ST) 1); 01326 01327 for (int32_t i=0; i<num_vectors; i++) 01328 { 01329 int32_t len=features[i].slen; 01330 01331 if (len < p_order) 01332 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order); 01333 01334 ST* str=features[i].string; 01335 01336 // convert first word 01337 for (int32_t j=0; j<p_order; j++) 01338 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01339 str[0]=embed_word(&str[0], p_order); 01340 01341 // convert the rest 01342 int32_t idx=0; 01343 for (int32_t j=p_order; j<len; j++) 01344 { 01345 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01346 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask; 01347 idx++; 01348 } 01349 01350 features[i].slen=len-p_order+1; 01351 } 01352 01353 compute_symbol_mask_table(max_val); 01354 } 01355 01356 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val) 01357 { 01358 if (m_subset) 01359 SG_NOTIMPLEMENTED; 01360 01361 SG_FREE(symbol_mask_table); 01362 symbol_mask_table=SG_MALLOC(ST, 256); 01363 01364 uint64_t mask=0; 01365 for (int32_t i=0; i< (int64_t) max_val; i++) 01366 mask=(mask<<1) | 1; 01367 01368 for (int32_t i=0; i<256; i++) 01369 { 01370 uint8_t bits=(uint8_t) i; 01371 symbol_mask_table[i]=0; 01372 01373 for (int32_t j=0; j<8; j++) 01374 { 01375 if (bits & 1) 01376 symbol_mask_table[i]|=mask<<(max_val*j); 01377 01378 bits>>=1; 01379 } 01380 } 01381 } 01382 01383 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len) 01384 { 01385 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01386 01387 ST mask=0; 01388 for (uint32_t i=0; i<nbits; i++) 01389 mask=(mask<<1) | (ST) 1; 01390 01391 for (int32_t i=0; i<len; i++) 01392 { 01393 ST w=(word & mask); 01394 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w); 01395 word>>=nbits; 01396 } 01397 } 01398 01399 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len) 01400 { 01401 ST value=(ST) 0; 01402 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01403 for (int32_t i=0; i<len; i++) 01404 { 01405 value<<=nbits; 01406 value|=seq[i]; 01407 } 01408 01409 return value; 01410 } 01411 01412 template<class ST> void CStringFeatures<ST>::determine_maximum_string_length() 01413 { 01414 max_string_length=0; 01415 index_t num_str=get_num_vectors(); 01416 01417 for (int32_t i=0; i<num_str; i++) 01418 { 01419 max_string_length=CMath::max(max_string_length, 01420 features[subset_idx_conversion(i)].slen); 01421 } 01422 } 01423 01424 template<class ST> ST* CStringFeatures<ST>::get_zero_terminated_string_copy(SGString<ST> str) 01425 { 01426 int32_t l=str.slen; 01427 ST* s=SG_MALLOC(ST, l+1); 01428 memcpy(s, str.string, sizeof(ST)*l); 01429 s[l]='\0'; 01430 return s; 01431 } 01432 01433 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len) 01434 { 01435 ASSERT(features); 01436 ASSERT(num<get_num_vectors()); 01437 01438 int32_t real_num=subset_idx_conversion(num); 01439 01440 01441 features[real_num].slen=len ; 01442 features[real_num].string=string ; 01443 01444 max_string_length=CMath::max(len, max_string_length); 01445 } 01446 01447 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize) 01448 { 01449 int32_t nsym=get_num_symbols(); 01450 int32_t slen=get_max_vector_length(); 01451 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t); 01452 float64_t* h= SG_MALLOC(float64_t, sz); 01453 memset(h, 0, sz); 01454 01455 float64_t* h_normalizer=SG_MALLOC(float64_t, slen); 01456 memset(h_normalizer, 0, slen*sizeof(float64_t)); 01457 int32_t num_str=get_num_vectors(); 01458 for (int32_t i=0; i<num_str; i++) 01459 { 01460 int32_t len; 01461 bool free_vec; 01462 ST* vec=get_feature_vector(i, len, free_vec); 01463 for (int32_t j=0; j<len; j++) 01464 { 01465 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++; 01466 h_normalizer[j]++; 01467 } 01468 free_feature_vector(vec, i, free_vec); 01469 } 01470 01471 if (normalize) 01472 { 01473 for (int32_t i=0; i<slen; i++) 01474 { 01475 for (int32_t j=0; j<nsym; j++) 01476 { 01477 if (h_normalizer && h_normalizer[i]) 01478 h[int64_t(i)*nsym+j]/=h_normalizer[i]; 01479 } 01480 } 01481 } 01482 SG_FREE(h_normalizer); 01483 01484 *hist=h; 01485 *rows=nsym; 01486 *cols=slen; 01487 } 01488 01489 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec) 01490 { 01491 ASSERT(rows == get_num_symbols()); 01492 cleanup(); 01493 float64_t* randoms=SG_MALLOC(float64_t, cols); 01494 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec); 01495 01496 for (int32_t i=0; i<num_vec; i++) 01497 { 01498 sf[i].string=SG_MALLOC(ST, cols); 01499 sf[i].slen=cols; 01500 01501 CMath::random_vector(randoms, cols, 0.0, 1.0); 01502 01503 for (int32_t j=0; j<cols; j++) 01504 { 01505 float64_t lik=hist[int64_t(j)*rows+0]; 01506 01507 int32_t c; 01508 for (c=0; c<rows-1; c++) 01509 { 01510 if (randoms[j]<=lik) 01511 break; 01512 lik+=hist[int64_t(j)*rows+c+1]; 01513 } 01514 sf[i].string[j]=alphabet->remap_to_char(c); 01515 } 01516 } 01517 SG_FREE(randoms); 01518 set_features(sf, num_vec, cols); 01519 } 01520 01521 /* 01522 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2) 01523 { 01524 int *s; 01525 int32_t nStr=get_num_vectors(); 01526 01527 int32_t nfeat=0; 01528 for (int32_t i=0; i < nStr; ++i) 01529 nfeat += get_vector_length[i] - d1 -d2; 01530 SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat); 01531 int32_t c=0; 01532 for (int32_t i=0; i < nStr; ++i) 01533 { 01534 int32_t len; 01535 bool free_vec; 01536 ST* S=get_feature_vector(vec_num, len, free_vec); 01537 free_feature_vector(vec, vec_num, free_vec); 01538 int32_t n=len - d1 - d2; 01539 s=S[i]; 01540 for (int32_t j=0; j < n; ++j) 01541 { 01542 F[c].feature1=s[j]; 01543 F[c].feature2=s[j+d1]; 01544 F[c].feature3=s[j+d1+d2]; 01545 F[c].group=i; 01546 c++; 01547 } 01548 } 01549 ASSERT(nfeat==c); 01550 return F; 01551 } 01552 01553 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1) 01554 { 01555 int i, j; 01556 int n, nfeat; 01557 int *group; 01558 int *features; 01559 int *s; 01560 int c; 01561 SSKFeatures *F; 01562 01563 nfeat=0; 01564 for (i=0; i < nStr; ++i) 01565 nfeat += len[i] - d1; 01566 group=(int *)SG_MALLOC(nfeat*sizeof(int)); 01567 features=(int *)SG_MALLOC(nfeat*2*sizeof(int *)); 01568 c=0; 01569 for (i=0; i < nStr; ++i) 01570 { 01571 n=len[i] - d1; 01572 s=S[i]; 01573 for (j=0; j < n; ++j) 01574 { 01575 features[c]=s[j]; 01576 features[c+nfeat]=s[j+d1]; 01577 group[c]=i; 01578 c++; 01579 } 01580 } 01581 if (nfeat!=c) 01582 printf("Something is wrong...\n"); 01583 F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures)); 01584 (*F).features=features; 01585 (*F).group=group; 01586 (*F).n=nfeat; 01587 return F; 01588 } 01589 */ 01590 01591 template<class ST> CFeatures* CStringFeatures<ST>::copy_subset(SGVector<index_t> indices) 01592 { 01593 /* string list to create new CStringFeatures from */ 01594 SGStringList<ST> list_copy(indices.vlen, max_string_length); 01595 01596 /* copy all features */ 01597 for (index_t i=0; i<indices.vlen; ++i) 01598 { 01599 /* index with respect to possible subset */ 01600 index_t real_idx=subset_idx_conversion(indices.vector[i]); 01601 01602 /* copy string */ 01603 SGString<ST> current_string=features[real_idx]; 01604 SGString<ST> string_copy(current_string.slen); 01605 memcpy(string_copy.string, current_string.string, 01606 current_string.slen*sizeof(ST)); 01607 list_copy.strings[i]=string_copy; 01608 } 01609 01610 /* create copy instance */ 01611 CStringFeatures* result=new CStringFeatures(list_copy, alphabet); 01612 01613 /* max string length may have changed */ 01614 result->determine_maximum_string_length(); 01615 01616 return result; 01617 } 01618 01619 template<class ST> void CStringFeatures<ST>::subset_changed_post() 01620 { 01621 /* max string length has to be updated */ 01622 determine_maximum_string_length(); 01623 } 01624 01625 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len) 01626 { 01627 ASSERT(features && num<get_num_vectors()); 01628 01629 int32_t real_num=subset_idx_conversion(num); 01630 01631 len=features[real_num].slen; 01632 if (len<=0) 01633 return NULL; 01634 01635 ST* target=SG_MALLOC(ST, len); 01636 memcpy(target, features[real_num].string, len*sizeof(ST)); 01637 return target; 01638 } 01639 01640 template<class ST> void CStringFeatures<ST>::init() 01641 { 01642 set_generic<ST>(); 01643 01644 alphabet=NULL; 01645 num_vectors=0; 01646 features=NULL; 01647 single_string=NULL; 01648 length_of_single_string=0; 01649 max_string_length=0; 01650 order=0; 01651 symbol_mask_table=0; 01652 preprocess_on_get=false; 01653 feature_cache=NULL; 01654 01655 m_parameters->add((CSGObject**) &alphabet, "alphabet"); 01656 m_parameters->add_vector(&features, &num_vectors, "features", 01657 "This contains the array of features."); 01658 m_parameters->add_vector(&single_string, 01659 &length_of_single_string, 01660 "single_string", 01661 "Created by sliding window."); 01662 m_parameters->add(&max_string_length, "max_string_length", 01663 "Length of longest string."); 01664 m_parameters->add(&num_symbols, "num_symbols", 01665 "Number of used symbols."); 01666 m_parameters->add(&original_num_symbols, "original_num_symbols", 01667 "Original number of used symbols."); 01668 m_parameters->add(&order, "order", 01669 "Order used in higher order mapping."); 01670 m_parameters->add(&preprocess_on_get, "preprocess_on_get", 01671 "Preprocess on-the-fly?"); 01672 01673 /* TODO M_PARAMETERS->ADD? 01674 * /// order used in higher order mapping 01675 * ST* symbol_mask_table; 01676 */ 01677 } 01678 01683 template<> EFeatureType CStringFeatures<bool>::get_feature_type() 01684 { 01685 return F_BOOL; 01686 } 01687 01692 template<> EFeatureType CStringFeatures<char>::get_feature_type() 01693 { 01694 return F_CHAR; 01695 } 01696 01701 template<> EFeatureType CStringFeatures<uint8_t>::get_feature_type() 01702 { 01703 return F_BYTE; 01704 } 01705 01710 template<> EFeatureType CStringFeatures<int16_t>::get_feature_type() 01711 { 01712 return F_SHORT; 01713 } 01714 01719 template<> EFeatureType CStringFeatures<uint16_t>::get_feature_type() 01720 { 01721 return F_WORD; 01722 } 01723 01728 template<> EFeatureType CStringFeatures<int32_t>::get_feature_type() 01729 { 01730 return F_INT; 01731 } 01732 01737 template<> EFeatureType CStringFeatures<uint32_t>::get_feature_type() 01738 { 01739 return F_UINT; 01740 } 01741 01746 template<> EFeatureType CStringFeatures<int64_t>::get_feature_type() 01747 { 01748 return F_LONG; 01749 } 01750 01755 template<> EFeatureType CStringFeatures<uint64_t>::get_feature_type() 01756 { 01757 return F_ULONG; 01758 } 01759 01764 template<> EFeatureType CStringFeatures<float32_t>::get_feature_type() 01765 { 01766 return F_SHORTREAL; 01767 } 01768 01773 template<> EFeatureType CStringFeatures<float64_t>::get_feature_type() 01774 { 01775 return F_DREAL; 01776 } 01777 01782 template<> EFeatureType CStringFeatures<floatmax_t>::get_feature_type() 01783 { 01784 return F_LONGREAL; 01785 } 01786 01787 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask) 01788 { 01789 return symbol; 01790 } 01791 template<> float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask) 01792 { 01793 return symbol; 01794 } 01795 template<> float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask) 01796 { 01797 return symbol; 01798 } 01799 template<> floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask) 01800 { 01801 return symbol; 01802 } 01803 01804 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount) 01805 { 01806 return false; 01807 } 01808 template<> float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount) 01809 { 01810 return 0; 01811 } 01812 template<> float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount) 01813 { 01814 return 0; 01815 } 01816 template<> floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount) 01817 { 01818 return 0; 01819 } 01820 01821 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount) 01822 { 01823 return symbol; 01824 } 01825 template<> float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount) 01826 { 01827 return symbol; 01828 } 01829 template<> float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount) 01830 { 01831 return symbol; 01832 } 01833 template<> floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount) 01834 { 01835 return symbol; 01836 } 01837 01838 #ifndef SUNOS 01839 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01840 { 01841 return false; 01842 } 01843 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01844 { 01845 return false; 01846 } 01847 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01848 { 01849 return false; 01850 } 01851 #endif 01852 01853 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order) 01854 { 01855 } 01856 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order) 01857 { 01858 } 01859 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order) 01860 { 01861 } 01862 01863 template<> void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val) 01864 { 01865 } 01866 template<> void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val) 01867 { 01868 } 01869 template<> void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val) 01870 { 01871 } 01872 01873 template<> float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len) 01874 { 01875 return 0; 01876 } 01877 template<> float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len) 01878 { 01879 return 0; 01880 } 01881 template<> floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len) 01882 { 01883 return 0; 01884 } 01885 01886 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len) 01887 { 01888 } 01889 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len) 01890 { 01891 } 01892 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len) 01893 { 01894 } 01895 #define LOAD(f_load, sg_type) \ 01896 template<> void CStringFeatures<sg_type>::load(CFile* loader) \ 01897 { \ 01898 SG_INFO( "loading...\n"); \ 01899 \ 01900 SG_SET_LOCALE_C; \ 01901 SGString<sg_type>* strs; \ 01902 int32_t num_str; \ 01903 int32_t max_len; \ 01904 loader->f_load(strs, num_str, max_len); \ 01905 set_features(strs, num_str, max_len); \ 01906 SG_RESET_LOCALE; \ 01907 } 01908 01909 LOAD(get_string_list, bool) 01910 LOAD(get_string_list, char) 01911 LOAD(get_int8_string_list, int8_t) 01912 LOAD(get_string_list, uint8_t) 01913 LOAD(get_string_list, int16_t) 01914 LOAD(get_string_list, uint16_t) 01915 LOAD(get_string_list, int32_t) 01916 LOAD(get_uint_string_list, uint32_t) 01917 LOAD(get_long_string_list, int64_t) 01918 LOAD(get_ulong_string_list, uint64_t) 01919 LOAD(get_string_list, float32_t) 01920 LOAD(get_string_list, float64_t) 01921 LOAD(get_longreal_string_list, floatmax_t) 01922 #undef LOAD 01923 01924 #define SAVE(f_write, sg_type) \ 01925 template<> void CStringFeatures<sg_type>::save(CFile* writer) \ 01926 { \ 01927 if (m_subset) \ 01928 SG_ERROR("save() is not possible on subset"); \ 01929 SG_SET_LOCALE_C; \ 01930 ASSERT(writer); \ 01931 writer->f_write(features, num_vectors); \ 01932 SG_RESET_LOCALE; \ 01933 } 01934 01935 SAVE(set_string_list, bool) 01936 SAVE(set_string_list, char) 01937 SAVE(set_int8_string_list, int8_t) 01938 SAVE(set_string_list, uint8_t) 01939 SAVE(set_string_list, int16_t) 01940 SAVE(set_string_list, uint16_t) 01941 SAVE(set_string_list, int32_t) 01942 SAVE(set_uint_string_list, uint32_t) 01943 SAVE(set_long_string_list, int64_t) 01944 SAVE(set_ulong_string_list, uint64_t) 01945 SAVE(set_string_list, float32_t) 01946 SAVE(set_string_list, float64_t) 01947 SAVE(set_longreal_string_list, floatmax_t) 01948 #undef SAVE 01949 01950 template <class ST> template <class CT> 01951 bool CStringFeatures<ST>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, 01952 int32_t p_order, int32_t gap, bool rev) 01953 { 01954 remove_subset(); 01955 ASSERT(sf); 01956 01957 CAlphabet* alpha=sf->get_alphabet(); 01958 ASSERT(alpha->get_num_symbols_in_histogram() > 0); 01959 01960 this->order=p_order; 01961 cleanup(); 01962 01963 num_vectors=sf->get_num_vectors(); 01964 ASSERT(num_vectors>0); 01965 max_string_length=sf->get_max_vector_length()-start; 01966 features=SG_MALLOC(SGString<ST>, num_vectors); 01967 01968 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(), 01969 alpha->get_num_symbols_in_histogram()); 01970 01971 for (int32_t i=0; i<num_vectors; i++) 01972 { 01973 int32_t len=-1; 01974 bool vfree; 01975 CT* c=sf->get_feature_vector(i, len, vfree); 01976 ASSERT(!vfree); // won't work when preprocessors are attached 01977 01978 features[i].string=SG_MALLOC(ST, len); 01979 features[i].slen=len; 01980 01981 ST* str=features[i].string; 01982 for (int32_t j=0; j<len; j++) 01983 str[j]=(ST) alpha->remap_to_bin(c[j]); 01984 } 01985 01986 original_num_symbols=alpha->get_num_symbols(); 01987 int32_t max_val=alpha->get_num_bits(); 01988 01989 SG_UNREF(alpha); 01990 01991 if (p_order>1) 01992 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order); 01993 else 01994 num_symbols=original_num_symbols; 01995 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols); 01996 01997 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 01998 { 01999 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val); 02000 return false; 02001 } 02002 02003 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ; 02004 for (int32_t line=0; line<num_vectors; line++) 02005 { 02006 int32_t len=0; 02007 bool vfree; 02008 ST* fv=get_feature_vector(line, len, vfree); 02009 ASSERT(!vfree); // won't work when preprocessors are attached 02010 02011 if (rev) 02012 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap); 02013 else 02014 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap); 02015 02016 /* fix the length of the string -- hacky */ 02017 features[line].slen-=start+gap ; 02018 if (features[line].slen<0) 02019 features[line].slen=0 ; 02020 } 02021 02022 compute_symbol_mask_table(max_val); 02023 02024 return true; 02025 } 02026 02027 template class CStringFeatures<bool>; 02028 template class CStringFeatures<char>; 02029 template class CStringFeatures<int8_t>; 02030 template class CStringFeatures<uint8_t>; 02031 template class CStringFeatures<int16_t>; 02032 template class CStringFeatures<uint16_t>; 02033 template class CStringFeatures<int32_t>; 02034 template class CStringFeatures<uint32_t>; 02035 template class CStringFeatures<int64_t>; 02036 template class CStringFeatures<uint64_t>; 02037 template class CStringFeatures<float32_t>; 02038 template class CStringFeatures<float64_t>; 02039 template class CStringFeatures<floatmax_t>; 02040 02041 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02042 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02043 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02044 02045 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02046 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02047 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02048 }