// SHOGUN v1.1.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Subset support written (W) 2011 Heiko Strathmann 00010 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00011 */ 00012 00013 #ifndef _CSTRINGFEATURES__H__ 00014 #define _CSTRINGFEATURES__H__ 00015 00016 #include <shogun/lib/common.h> 00017 #include <shogun/lib/Cache.h> 00018 #include <shogun/lib/DynamicArray.h> 00019 #include <shogun/lib/Compressor.h> 00020 #include <shogun/io/File.h> 00021 00022 #include <shogun/features/Features.h> 00023 #include <shogun/features/Alphabet.h> 00024 00025 namespace shogun 00026 { 00027 class CAlphabet; 00028 template <class T> class CDynamicArray; 00029 class CFile; 00030 template <class T> class SGString; 00031 00032 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00033 struct SSKDoubleFeature 00034 { 00035 int feature1; 00036 int feature2; 00037 int group; 00038 }; 00039 00040 struct SSKTripleFeature 00041 { 00042 int feature1; 00043 int feature2; 00044 int feature3; 00045 int group; 00046 }; 00047 #endif 00048 00071 template <class ST> class CStringFeatures : public CFeatures 00072 { 00073 public: 00077 CStringFeatures(); 00078 00083 CStringFeatures(EAlphabet alpha); 00084 00089 CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha); 00090 00095 CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha); 00096 00101 CStringFeatures(CAlphabet* alpha); 00102 00104 CStringFeatures(const CStringFeatures & orig); 00105 00111 CStringFeatures(CFile* loader, EAlphabet alpha=DNA); 00112 00113 virtual ~CStringFeatures(); 00114 00120 virtual void cleanup(); 00121 00128 virtual void cleanup_feature_vector(int32_t num); 00129 00137 
virtual void cleanup_feature_vectors(int32_t start, int32_t stop); 00138 00143 virtual EFeatureClass get_feature_class(); 00144 00149 virtual EFeatureType get_feature_type(); 00150 00155 CAlphabet* get_alphabet(); 00156 00161 virtual CFeatures* duplicate() const; 00162 00169 SGVector<ST> get_feature_vector(int32_t num); 00170 00178 void set_feature_vector(SGVector<ST> vector, int32_t num); 00179 00182 void enable_on_the_fly_preprocessing(); 00183 00187 void disable_on_the_fly_preprocessing(); 00188 00199 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree); 00200 00207 CStringFeatures<ST>* get_transposed(); 00208 00222 SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec); 00223 00232 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree); 00233 00241 void free_feature_vector(SGVector<ST> feat_vec, int32_t num); 00242 00251 virtual ST get_feature(int32_t vec_num, int32_t feat_num); 00252 00260 virtual int32_t get_vector_length(int32_t vec_num); 00261 00268 virtual int32_t get_max_vector_length(); 00269 00271 virtual int32_t get_num_vectors() const; 00272 00279 floatmax_t get_num_symbols(); 00280 00288 floatmax_t get_max_num_symbols(); 00289 00290 // these functions are necessary to find out about a former conversion process 00291 00296 floatmax_t get_original_num_symbols(); 00297 00302 int32_t get_order(); 00303 00311 ST get_masked_symbols(ST symbol, uint8_t mask); 00312 00319 ST shift_offset(ST offset, int32_t amount); 00320 00327 ST shift_symbol(ST symbol, int32_t amount); 00328 00333 virtual void load(CFile* loader); 00334 00345 void load_ascii_file(char* fname, bool remap_to_bin=true, 00346 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA); 00347 00356 bool load_fasta_file(const char* fname, bool ignore_invalid=false); 00357 00367 bool load_fastq_file(const char* fname, 00368 bool ignore_invalid=false, bool bitremap_in_single_string=false); 00369 00377 bool load_from_directory(char* dirname); 00378 00384 void 
set_features(SGStringList<ST> feats); 00385 00395 bool set_features(SGString<ST>* p_features, int32_t p_num_vectors, 00396 int32_t p_max_string_length); 00397 00406 bool append_features(CStringFeatures<ST>* sf); 00407 00420 bool append_features(SGString<ST>* p_features, int32_t p_num_vectors, 00421 int32_t p_max_string_length); 00422 00426 SGStringList<ST> get_features(); 00427 00436 virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len); 00437 00446 virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len); 00447 00455 virtual void get_features(SGString<ST>** dst, int32_t* num_str); 00456 00463 virtual void save(CFile* writer); 00464 00473 virtual bool load_compressed(char* src, bool decompress); 00474 00484 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level); 00485 00490 virtual int32_t get_size(); 00491 00497 virtual bool apply_preprocessor(bool force_preprocessing=false); 00498 00511 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0); 00512 00523 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, 00524 int32_t skip=0); 00525 00539 bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, 00540 int32_t p_order, int32_t gap, bool rev); 00541 00553 template <class CT> 00554 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, 00555 int32_t p_order, int32_t gap, bool rev); 00556 00566 bool have_same_length(int32_t len=-1); 00567 00573 void embed_features(int32_t p_order); 00574 00581 void compute_symbol_mask_table(int64_t max_val); 00582 00589 void unembed_word(ST word, uint8_t* seq, int32_t len); 00590 00596 ST embed_word(ST* seq, int32_t len); 00597 00602 void determine_maximum_string_length(); 00603 00611 static ST* get_zero_terminated_string_copy(SGString<ST> str); 00612 00621 virtual void set_feature_vector(int32_t num, ST* string, int32_t len); 00622 00627 virtual void 
get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, 00628 bool normalize=true); 00629 00634 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, 00635 int32_t num_vec); 00636 00645 virtual CFeatures* copy_subset(SGVector<index_t> indices); 00646 00648 inline virtual const char* get_name() const { return "StringFeatures"; } 00649 00651 virtual void subset_changed_post(); 00652 00653 protected: 00664 virtual ST* compute_feature_vector(int32_t num, int32_t& len); 00665 00666 private: 00667 void init(); 00668 00669 protected: 00670 00672 CAlphabet* alphabet; 00673 00675 int32_t num_vectors; 00676 00678 SGString<ST>* features; 00679 00681 ST* single_string; 00682 00684 int32_t length_of_single_string; 00685 00687 int32_t max_string_length; 00688 00690 floatmax_t num_symbols; 00691 00693 floatmax_t original_num_symbols; 00694 00696 int32_t order; 00697 00699 ST* symbol_mask_table; 00700 00702 bool preprocess_on_get; 00703 00705 CCache<ST>* feature_cache; 00706 }; 00707 } 00708 #endif // _CSTRINGFEATURES__H__