SHOGUN
v1.1.0
|
00001 #include <shogun/features/StreamingStringFeatures.h> 00002 00003 namespace shogun 00004 { 00005 00006 00007 template <class T> 00008 CStreamingStringFeatures<T>::CStreamingStringFeatures() : CStreamingFeatures() 00009 { 00010 init(); 00011 set_read_functions(); 00012 remap_to_bin=false; 00013 } 00014 00015 template <class T> 00016 CStreamingStringFeatures<T>::CStreamingStringFeatures(CStreamingFile* file, 00017 bool is_labelled, 00018 int32_t size) 00019 : CStreamingFeatures() 00020 { 00021 init(file, is_labelled, size); 00022 set_read_functions(); 00023 remap_to_bin=false; 00024 } 00025 00026 template <class T> 00027 CStreamingStringFeatures<T>::~CStreamingStringFeatures() 00028 { 00029 parser.end_parser(); 00030 SG_UNREF(alphabet); 00031 } 00032 00033 template <class T> 00034 void CStreamingStringFeatures<T>::use_alphabet(EAlphabet alpha) 00035 { 00036 SG_UNREF(alphabet); 00037 00038 alphabet=new CAlphabet(alpha); 00039 SG_REF(alphabet); 00040 num_symbols=alphabet->get_num_symbols(); 00041 } 00042 00043 template <class T> 00044 void CStreamingStringFeatures<T>::use_alphabet(CAlphabet* alpha) 00045 { 00046 SG_UNREF(alphabet); 00047 00048 alphabet=new CAlphabet(alpha); 00049 SG_REF(alphabet); 00050 num_symbols=alphabet->get_num_symbols(); 00051 } 00052 00053 template <class T> 00054 void CStreamingStringFeatures<T>::set_remap(CAlphabet* ascii_alphabet, CAlphabet* binary_alphabet) 00055 { 00056 remap_to_bin=true; 00057 alpha_ascii=new CAlphabet(ascii_alphabet); 00058 alpha_bin=new CAlphabet(binary_alphabet); 00059 } 00060 00061 template <class T> 00062 void CStreamingStringFeatures<T>::set_remap(EAlphabet ascii_alphabet, EAlphabet binary_alphabet) 00063 { 00064 remap_to_bin=true; 00065 alpha_ascii=new CAlphabet(ascii_alphabet); 00066 alpha_bin=new CAlphabet(binary_alphabet); 00067 } 00068 00069 template <class T> 00070 CAlphabet* CStreamingStringFeatures<T>::get_alphabet() 00071 { 00072 SG_REF(alphabet); 00073 return alphabet; 00074 } 00075 00076 template <class T> 00077 floatmax_t CStreamingStringFeatures<T>::get_num_symbols() 00078 { 00079 return num_symbols; 00080 } 00081 00082 template <class T> 00083 CFeatures* CStreamingStringFeatures<T>::duplicate() const 00084 { 00085 return new CStreamingStringFeatures<T>(*this); 00086 } 00087 00088 template <class T> 00089 int32_t CStreamingStringFeatures<T>::get_num_vectors() const 00090 { 00091 if (current_string) 00092 return 1; 00093 return 0; 00094 } 00095 00096 template <class T> 00097 int32_t CStreamingStringFeatures<T>::get_size() 00098 { 00099 return sizeof(T); 00100 } 00101 00102 template <class T> 00103 int32_t CStreamingStringFeatures<T>::get_num_features() 00104 { 00105 return current_length; 00106 } 00107 00108 template <class T> void CStreamingStringFeatures<T>::set_vector_reader() 00109 { 00110 parser.set_read_vector(&CStreamingFile::get_string); 00111 } 00112 00113 template <class T> void CStreamingStringFeatures<T>::set_vector_and_label_reader() 00114 { 00115 parser.set_read_vector_and_label 00116 (&CStreamingFile::get_string_and_label); 00117 } 00118 00119 #define GET_FEATURE_TYPE(f_type, sg_type) \ 00120 template<> EFeatureType CStreamingStringFeatures<sg_type>::get_feature_type() \ 00121 { \ 00122 return f_type; \ 00123 } 00124 00125 GET_FEATURE_TYPE(F_BOOL, bool) 00126 GET_FEATURE_TYPE(F_CHAR, char) 00127 GET_FEATURE_TYPE(F_BYTE, uint8_t) 00128 GET_FEATURE_TYPE(F_BYTE, int8_t) 00129 GET_FEATURE_TYPE(F_SHORT, int16_t) 00130 GET_FEATURE_TYPE(F_WORD, uint16_t) 00131 GET_FEATURE_TYPE(F_INT, int32_t) 00132 GET_FEATURE_TYPE(F_UINT, uint32_t) 00133 GET_FEATURE_TYPE(F_LONG, int64_t) 00134 GET_FEATURE_TYPE(F_ULONG, uint64_t) 00135 GET_FEATURE_TYPE(F_SHORTREAL, float32_t) 00136 GET_FEATURE_TYPE(F_DREAL, float64_t) 00137 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t) 00138 #undef GET_FEATURE_TYPE 00139 00140 00141 template <class T> 00142 void CStreamingStringFeatures<T>::init() 00143 { 00144 working_file=NULL; 00145 alphabet=new CAlphabet(); 00146 00147 current_string=NULL; 00148 current_length=-1; 00149 current_sgstring.string=current_string; 00150 current_sgstring.slen=current_length; 00151 } 00152 00153 template <class T> 00154 void CStreamingStringFeatures<T>::init(CStreamingFile* file, 00155 bool is_labelled, 00156 int32_t size) 00157 { 00158 init(); 00159 has_labels=is_labelled; 00160 working_file=file; 00161 parser.init(file, is_labelled, size); 00162 parser.set_free_vector_after_release(false); 00163 parser.set_free_vectors_on_destruct(false); 00164 } 00165 00166 template <class T> 00167 void CStreamingStringFeatures<T>::start_parser() 00168 { 00169 if (!remap_to_bin) 00170 alpha_ascii=alphabet; 00171 00172 if (!parser.is_running()) 00173 parser.start_parser(); 00174 } 00175 00176 template <class T> 00177 void CStreamingStringFeatures<T>::end_parser() 00178 { 00179 parser.end_parser(); 00180 } 00181 00182 template <class T> 00183 bool CStreamingStringFeatures<T>::get_next_example() 00184 { 00185 bool ret_value; 00186 00187 ret_value = (bool) parser.get_next_example(current_string, 00188 current_length, 00189 current_label); 00190 00191 if (!ret_value) 00192 return false; 00193 00194 int32_t i; 00195 if (remap_to_bin) 00196 { 00197 alpha_ascii->add_string_to_histogram(current_string, current_length); 00198 00199 for (i=0; i<current_length; i++) 00200 current_string[i]=alpha_ascii->remap_to_bin(current_string[i]); 00201 alpha_bin->add_string_to_histogram(current_string, current_length); 00202 } 00203 else 00204 { 00205 alpha_ascii->add_string_to_histogram(current_string, current_length); 00206 } 00207 00208 /* Check the input using src alphabet, alpha_ascii */ 00209 if ( !(alpha_ascii->check_alphabet_size() && alpha_ascii->check_alphabet()) ) 00210 { 00211 SG_ERROR("StreamingStringFeatures: The given input was found to be incompatible with the alphabet!\n"); 00212 return 0; 00213 } 00214 00215 //SG_UNREF(alphabet); 00216 00217 if (remap_to_bin) 00218 alphabet=alpha_bin; 00219 else 00220 alphabet=alpha_ascii; 00221 00222 //SG_REF(alphabet); 00223 num_symbols=alphabet->get_num_symbols(); 00224 00225 return ret_value; 00226 } 00227 00228 template <class T> 00229 SGString<T> CStreamingStringFeatures<T>::get_vector() 00230 { 00231 current_sgstring.string=current_string; 00232 current_sgstring.slen=current_length; 00233 00234 return current_sgstring; 00235 } 00236 00237 template <class T> 00238 float64_t CStreamingStringFeatures<T>::get_label() 00239 { 00240 ASSERT(has_labels); 00241 00242 return current_label; 00243 } 00244 00245 template <class T> 00246 void CStreamingStringFeatures<T>::release_example() 00247 { 00248 parser.finalize_example(); 00249 } 00250 00251 template <class T> 00252 int32_t CStreamingStringFeatures<T>::get_vector_length() 00253 { 00254 return current_length; 00255 } 00256 00257 template <class T> 00258 EFeatureClass CStreamingStringFeatures<T>::get_feature_class() 00259 { 00260 return C_STREAMING_STRING; 00261 } 00262 00263 template class CStreamingStringFeatures<bool>; 00264 template class CStreamingStringFeatures<char>; 00265 template class CStreamingStringFeatures<int8_t>; 00266 template class CStreamingStringFeatures<uint8_t>; 00267 template class CStreamingStringFeatures<int16_t>; 00268 template class CStreamingStringFeatures<uint16_t>; 00269 template class CStreamingStringFeatures<int32_t>; 00270 template class CStreamingStringFeatures<uint32_t>; 00271 template class CStreamingStringFeatures<int64_t>; 00272 template class CStreamingStringFeatures<uint64_t>; 00273 template class CStreamingStringFeatures<float32_t>; 00274 template class CStreamingStringFeatures<float64_t>; 00275 template class CStreamingStringFeatures<floatmax_t>; 00276 00277 }