SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2006-2009 Soeren Sonnenburg 00008 * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #ifndef _CALPHABET__H__ 00012 #define _CALPHABET__H__ 00013 00014 #include <shogun/base/SGObject.h> 00015 #include <shogun/lib/common.h> 00016 00017 namespace shogun 00018 { 00020 enum EAlphabet 00021 { 00023 DNA=0, 00024 00026 RAWDNA=1, 00027 00029 RNA=2, 00030 00032 PROTEIN=3, 00033 00034 // BINARY just 0 and 1 00035 BINARY=4, 00036 00038 ALPHANUM=5, 00039 00041 CUBE=6, 00042 00044 RAWBYTE=7, 00045 00047 IUPAC_NUCLEIC_ACID=8, 00048 00050 IUPAC_AMINO_ACID=9, 00051 00053 NONE=10, 00054 00056 DIGIT=11, 00057 00059 DIGIT2=12, 00060 00062 RAWDIGIT=13, 00063 00065 RAWDIGIT2=14, 00066 00068 UNKNOWN=15, 00069 00071 SNP=16, 00072 00074 RAWSNP=17 00075 }; 00076 00077 00088 class CAlphabet : public CSGObject 00089 { 00090 public: 00091 00095 CAlphabet(); 00096 00102 CAlphabet(char* alpha, int32_t len); 00103 00108 CAlphabet(EAlphabet alpha); 00109 00114 CAlphabet(CAlphabet* alpha); 00115 virtual ~CAlphabet(); 00116 00121 bool set_alphabet(EAlphabet alpha); 00122 00127 inline EAlphabet get_alphabet() const 00128 { 00129 return alphabet; 00130 } 00131 00136 inline int32_t get_num_symbols() const 00137 { 00138 return num_symbols; 00139 } 00140 00146 inline int32_t get_num_bits() const 00147 { 00148 return num_bits; 00149 } 00150 00156 inline uint8_t remap_to_bin(uint8_t c) 00157 { 00158 return maptable_to_bin[c]; 00159 } 00160 00166 inline uint8_t remap_to_char(uint8_t c) 00167 { 00168 return maptable_to_char[c]; 00169 } 00170 00172 void clear_histogram(); 00173 00179 template <class T> 00180 void add_string_to_histogram(T* p, int64_t len) 00181 { 00182 for (int64_t i=0; i<len; i++) 00183 add_byte_to_histogram((uint8_t) (p[i])); 00184 } 00185 00190 inline void add_byte_to_histogram(uint8_t p) 00191 { 00192 histogram[p]++; 00193 } 00194 00196 void print_histogram(); 00197 00202 SGVector<int64_t> get_histogram(); 00203 00210 bool check_alphabet(bool print_error=true); 00211 00218 inline bool is_valid(uint8_t c) 00219 { 00220 return valid_chars[c]; 00221 } 00222 00228 bool check_alphabet_size(bool print_error=true); 00229 00234 int32_t get_num_symbols_in_histogram(); 00235 00240 int32_t get_max_value_in_histogram(); 00241 00248 int32_t get_num_bits_in_histogram(); 00249 00254 static const char* get_alphabet_name(EAlphabet alphabet); 00255 00256 00258 inline virtual const char* get_name() const { return "Alphabet"; } 00259 00268 template <class ST> 00269 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00270 00279 template <class ST> 00280 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00281 00291 template <class ST> 00292 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00293 00303 template <class ST> 00304 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00305 00306 private: 00309 void init(); 00310 00311 protected: 00313 void init_map_table(); 00314 00319 void copy_histogram(CAlphabet* src); 00320 00321 public: 00323 static const uint8_t B_A; 00325 static const uint8_t B_C; 00327 static const uint8_t B_G; 00329 static const uint8_t B_T; 00331 static const uint8_t B_0; 00333 static const uint8_t MAPTABLE_UNDEF; 00335 static const char* alphabet_names[18]; 00336 00337 protected: 00346 virtual void load_serializable_post() throw (ShogunException); 00347 00348 protected: 00350 EAlphabet alphabet; 00352 int32_t num_symbols; 00354 int32_t num_bits; 00356 bool valid_chars[1 << (sizeof(uint8_t)*8)]; 00358 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)]; 00360 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)]; 00362 int64_t histogram[1 << (sizeof(uint8_t)*8)]; 00363 }; 00364 } 00365 #endif