// SHOGUN v1.1.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Gunnar Raetsch 00008 * Written (W) 1999-2009 Soeren Sonnenburg 00009 * Written (W) 2008-2009 Jonas Behr 00010 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00011 */ 00012 00013 #ifndef __CDYNPROG_H__ 00014 #define __CDYNPROG_H__ 00015 00016 #include <shogun/mathematics/Math.h> 00017 #include <shogun/lib/common.h> 00018 #include <shogun/base/SGObject.h> 00019 #include <shogun/io/SGIO.h> 00020 #include <shogun/lib/config.h> 00021 #include <shogun/structure/PlifMatrix.h> 00022 #include <shogun/structure/PlifBase.h> 00023 #include <shogun/structure/Plif.h> 00024 #include <shogun/structure/IntronList.h> 00025 #include <shogun/structure/SegmentLoss.h> 00026 #include <shogun/features/StringFeatures.h> 00027 #include <shogun/features/SparseFeatures.h> 00028 #include <shogun/distributions/Distribution.h> 00029 #include <shogun/lib/DynamicArray.h> 00030 #include <shogun/lib/Array.h> 00031 #include <shogun/lib/Array2.h> 00032 #include <shogun/lib/Array3.h> 00033 #include <shogun/lib/Time.h> 00034 00035 #include <stdio.h> 00036 #include <limits.h> 00037 00038 namespace shogun 00039 { 00040 template <class T> class CSparseFeatures; 00041 class CIntronList; 00042 class CPlifMatrix; 00043 class CSegmentLoss; 00044 template <class T> class CArray; 00045 00046 //#define DYNPROG_TIMING 00047 00048 #ifdef USE_BIGSTATES 00049 typedef uint16_t T_STATES ; 00050 #else 00051 typedef uint8_t T_STATES ; 00052 #endif 00053 typedef T_STATES* P_STATES ; 00054 00055 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00056 00057 struct segment_loss_struct 00058 { 00060 int32_t maxlookback; 00062 int32_t seqlen; 00064 int32_t *segments_changed; 00066 float64_t 
*num_segment_id; 00068 int32_t *length_segment_id ; 00069 }; 00070 #endif 00071 00077 class CDynProg : public CSGObject 00078 { 00079 public: 00084 CDynProg(int32_t p_num_svms=8); 00085 virtual ~CDynProg(); 00086 00087 // model related functions 00093 void set_num_states(int32_t N); 00094 00096 int32_t get_num_states(); 00097 00099 int32_t get_num_svms(); 00100 00106 void init_content_svm_value_array(const int32_t p_num_svms); 00107 00115 void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes); 00116 00123 void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs); 00124 00129 void resize_lin_feat(int32_t num_new_feat); 00134 void set_p_vector(SGVector<float64_t> p); 00135 00140 void set_q_vector(SGVector<float64_t> q); 00141 00146 void set_a(SGMatrix<float64_t> a); 00147 00152 void set_a_id(SGMatrix<int32_t> a); 00153 00158 void set_a_trans_matrix(SGMatrix<float64_t> a_trans); 00159 00164 void init_mod_words_array(SGMatrix<int32_t> p_mod_words_array); 00165 00171 bool check_svm_arrays(); 00172 00177 void set_observation_matrix(SGNDArray<float64_t> seq); 00178 00185 int32_t get_num_positions(); 00186 00196 void set_content_type_array(SGMatrix<float64_t> seg_path); 00197 00202 void set_pos(SGVector<int32_t> pos); 00203 00209 void set_orf_info(SGMatrix<int32_t> orf_info); 00210 00215 void set_gene_string(SGVector<char> genestr); 00216 00217 00222 void set_dict_weights(SGMatrix<float64_t> dictionary_weights); 00223 00228 void best_path_set_segment_loss(SGMatrix<float64_t> segment_loss); 00229 00236 void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m); 00237 00239 void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2); 00240 00245 void set_plif_matrices(CPlifMatrix* pm); 00246 00247 // best_path result retrieval functions 00252 SGVector<float64_t> get_scores(); 00253 00258 SGMatrix<int32_t> 
get_states(); 00259 00264 SGMatrix<int32_t> get_positions(); 00265 00266 00275 void compute_nbest_paths(int32_t max_num_signals, 00276 bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences); 00277 00279 00291 void best_path_trans_deriv( 00292 int32_t* my_state_seq, int32_t *my_pos_seq, 00293 int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals); 00294 00295 // additional best_path_trans_deriv functions 00300 void set_my_state_seq(int32_t* my_state_seq); 00301 00306 void set_my_pos_seq(int32_t* my_pos_seq); 00307 00315 void get_path_scores(float64_t** my_scores, int32_t* seq_len); 00316 00324 void get_path_losses(float64_t** my_losses, int32_t* seq_len); 00325 00326 00328 inline T_STATES get_N() const 00329 { 00330 return m_N ; 00331 } 00332 00337 inline void set_q(T_STATES offset, float64_t value) 00338 { 00339 m_end_state_distribution_q[offset]=value; 00340 } 00341 00346 inline void set_p(T_STATES offset, float64_t value) 00347 { 00348 m_initial_state_distribution_p[offset]=value; 00349 } 00350 00357 inline void set_a(T_STATES line_, T_STATES column, float64_t value) 00358 { 00359 m_transition_matrix_a.element(line_,column)=value; // look also best_path! 
00360 } 00361 00367 inline float64_t get_q(T_STATES offset) const 00368 { 00369 return m_end_state_distribution_q[offset]; 00370 } 00371 00377 inline float64_t get_q_deriv(T_STATES offset) const 00378 { 00379 return m_end_state_distribution_q_deriv[offset]; 00380 } 00381 00387 inline float64_t get_p(T_STATES offset) const 00388 { 00389 return m_initial_state_distribution_p[offset]; 00390 } 00391 00397 inline float64_t get_p_deriv(T_STATES offset) const 00398 { 00399 return m_initial_state_distribution_p_deriv[offset]; 00400 } 00401 00405 void precompute_content_values(); 00406 00413 inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2) 00414 { 00415 m_lin_feat.get_array_size(dim1, dim2); 00416 return m_lin_feat.get_array(); 00417 } 00426 inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len) 00427 { 00428 m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true); 00429 } 00434 void create_word_string(); 00435 00438 void precompute_stop_codons(); 00439 00446 inline float64_t get_a(T_STATES line_, T_STATES column) const 00447 { 00448 return m_transition_matrix_a.element(line_, column); // look also best_path()! 00449 } 00450 00457 inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const 00458 { 00459 return m_transition_matrix_a_deriv.element(line_, column); // look also best_path()! 
00460 } 00462 00467 void set_intron_list(CIntronList* intron_list, int32_t num_plifs); 00468 00470 CSegmentLoss* get_segment_loss_object() 00471 { 00472 return m_seg_loss_obj; 00473 } 00474 00481 void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len) 00482 { 00483 m_long_transitions = use_long_transitions; 00484 m_long_transition_threshold = threshold; 00485 SG_DEBUG("ignoring max_len\n") ; 00486 //m_long_transition_max = max_len; 00487 } 00488 00489 protected: 00490 00491 /* helper functions */ 00492 00502 void lookup_content_svm_values(const int32_t from_state, 00503 const int32_t to_state, const int32_t from_pos, const int32_t to_pos, 00504 float64_t* svm_values, int32_t frame); 00505 00513 inline void lookup_tiling_plif_values(const int32_t from_state, 00514 const int32_t to_state, const int32_t len, float64_t* svm_values); 00515 00520 inline int32_t find_frame(const int32_t from_state); 00521 00530 inline int32_t raw_intensities_interval_query( 00531 const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type); 00532 00533 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00534 00535 struct svm_values_struct 00536 { 00538 int32_t maxlookback; 00540 int32_t seqlen; 00541 00543 int32_t* start_pos; 00545 float64_t ** svm_values_unnormalized; 00547 float64_t * svm_values; 00549 bool *** word_used; 00551 int32_t **num_unique_words; 00552 }; 00553 #endif // DOXYGEN_SHOULD_SKIP_THIS 00554 00563 bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to); 00564 00566 inline virtual const char* get_name() const { return "DynProg"; } 00567 00568 private: 00569 00570 T_STATES trans_list_len; 00571 T_STATES **trans_list_forward; 00572 T_STATES *trans_list_forward_cnt; 00573 float64_t **trans_list_forward_val; 00574 int32_t **trans_list_forward_id; 00575 bool mem_initialized; 00576 00577 #ifdef DYNPROG_TIMING 00578 CTime MyTime; 00579 CTime MyTime2; 00580 CTime MyTime3; 00581 00582 float64_t 
segment_init_time; 00583 float64_t segment_pos_time; 00584 float64_t segment_clean_time; 00585 float64_t segment_extend_time; 00586 float64_t orf_time; 00587 float64_t content_time; 00588 float64_t content_penalty_time; 00589 float64_t content_svm_values_time ; 00590 float64_t content_plifs_time ; 00591 float64_t svm_init_time; 00592 float64_t svm_pos_time; 00593 float64_t inner_loop_time; 00594 float64_t inner_loop_max_time ; 00595 float64_t svm_clean_time; 00596 float64_t long_transition_time ; 00597 #endif 00598 00599 00600 protected: 00605 00606 int32_t m_N; 00607 00609 CArray2<int32_t> m_transition_matrix_a_id; 00610 CArray2<float64_t> m_transition_matrix_a; 00611 CArray2<float64_t> m_transition_matrix_a_deriv; 00612 00614 CArray<float64_t> m_initial_state_distribution_p; 00615 CArray<float64_t> m_initial_state_distribution_p_deriv; 00616 00618 CArray<float64_t> m_end_state_distribution_q; 00619 CArray<float64_t> m_end_state_distribution_q_deriv; 00620 00622 00624 int32_t m_num_degrees; 00626 int32_t m_num_svms; 00627 00629 CArray<int32_t> m_word_degree; 00631 CArray<int32_t> m_cum_num_words; 00633 int32_t * m_cum_num_words_array; 00635 CArray<int32_t> m_num_words; 00637 int32_t* m_num_words_array; 00639 CArray2<int32_t> m_mod_words; 00641 int32_t* m_mod_words_array; 00643 CArray<bool> m_sign_words; 00645 bool* m_sign_words_array; 00647 CArray<int32_t> m_string_words; 00649 int32_t* m_string_words_array; 00650 00652 // CArray<int32_t> m_svm_pos_start; 00654 CArray<int32_t> m_num_unique_words; 00656 bool m_svm_arrays_clean; 00658 int32_t m_max_a_id; 00659 00660 // input arguments 00662 CArray3<float64_t> m_observation_matrix; 00664 CArray<int32_t> m_pos; 00666 int32_t m_seq_len; 00668 CArray2<int32_t> m_orf_info; 00670 CArray2<float64_t> m_segment_sum_weights; 00672 CArray<CPlifBase*> m_plif_list; 00674 CArray2<CPlifBase*> m_PEN; 00676 CArray2<CPlifBase*> m_PEN_state_signals; 00678 CArray<char> m_genestr; 00693 uint16_t*** m_wordstr; 00695 CArray2<float64_t> 
m_dict_weights; 00697 CArray3<float64_t> m_segment_loss; 00699 CArray<int32_t> m_segment_ids; 00701 CArray<float64_t> m_segment_mask; 00703 CArray<int32_t> m_my_state_seq; 00705 CArray<int32_t> m_my_pos_seq; 00707 CArray<float64_t> m_my_scores; 00709 CArray<float64_t> m_my_losses; 00710 00713 CSegmentLoss* m_seg_loss_obj; 00714 00715 // output arguments 00717 CArray<float64_t> m_scores; 00719 CArray2<int32_t> m_states; 00721 CArray2<int32_t> m_positions; 00722 00724 CSparseFeatures<float64_t>* m_seq_sparse1; 00726 CSparseFeatures<float64_t>* m_seq_sparse2; 00728 CPlifMatrix* m_plif_matrices; 00729 00733 CArray<bool> m_genestr_stop; 00734 00737 CIntronList* m_intron_list; 00738 00740 int32_t m_num_intron_plifs; 00741 00746 CArray2<float64_t> m_lin_feat; 00747 00749 float64_t *m_raw_intensities; 00751 int32_t* m_probe_pos; 00753 int32_t* m_num_probes_cum; 00755 int32_t* m_num_lin_feat_plifs_cum; 00757 int32_t m_num_raw_data; 00758 00760 bool m_long_transitions ; 00763 int32_t m_long_transition_threshold ; 00768 //int32_t m_long_transition_max ; 00769 00773 static int32_t word_degree_default[4]; 00774 00778 static int32_t cum_num_words_default[5]; 00779 00782 static int32_t frame_plifs[3]; 00783 00786 static int32_t num_words_default[4]; 00787 00789 static int32_t mod_words_default[32]; 00790 00792 static bool sign_words_default[16]; 00793 00795 static int32_t string_words_default[16]; 00796 }; 00797 } 00798 #endif