SHOGUN
v1.1.0
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2011 Sergey Lisitsyn
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */

#include <shogun/classifier/GaussianNaiveBayes.h>
#include <shogun/machine/Machine.h>
#include <shogun/features/Features.h>
#include <shogun/features/Labels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>

using namespace shogun;

CGaussianNaiveBayes::CGaussianNaiveBayes() :
    CMachine(), m_features(NULL), m_min_label(0),
    m_num_classes(0), m_dim(0), m_means(),
    m_variances(), m_label_prob(), m_rates()
{

}

CGaussianNaiveBayes::CGaussianNaiveBayes(CFeatures* train_examples, CLabels* train_labels) :
    CMachine(), m_features(NULL), m_min_label(0),
    m_num_classes(0), m_dim(0), m_means(),
    m_variances(), m_label_prob(), m_rates()
{
    ASSERT(train_examples->get_num_vectors() == train_labels->get_num_labels());
    set_labels(train_labels);
    if (!train_examples->has_property(FP_DOT))
        SG_ERROR("Specified features are not of type CDotFeatures\n");
    set_features((CDotFeatures*)train_examples);
}

CGaussianNaiveBayes::~CGaussianNaiveBayes()
{
    SG_UNREF(m_features);

    m_means.destroy_vector();
    m_rates.destroy_vector();
    m_variances.destroy_vector();
    m_label_prob.destroy_vector();
}

bool CGaussianNaiveBayes::train(CFeatures* data)
{
    // init features with data if necessary and assure type is correct
    if (data)
    {
        if (!data->has_property(FP_DOT))
            SG_ERROR("Specified features are not of type CDotFeatures\n");
        set_features((CDotFeatures*) data);
    }

    // get int labels to train_labels and check length equality
    ASSERT(labels);
    SGVector<int32_t> train_labels = labels->get_int_labels();
    ASSERT(m_features->get_num_vectors()==train_labels.vlen);

    // init min_label, max_label and loop variables
    int32_t min_label = train_labels.vector[0];
    int32_t max_label = train_labels.vector[0];
    int i,j;

    // find minimal and maximal label
    for (i=1; i<train_labels.vlen; i++)
    {
        min_label = CMath::min(min_label, train_labels.vector[i]);
        max_label = CMath::max(max_label, train_labels.vector[i]);
    }

    // subtract minimal label from all labels
    for (i=0; i<train_labels.vlen; i++)
        train_labels.vector[i] -= min_label;

    // get number of classes, minimal label and dimensionality
    m_num_classes = max_label-min_label+1;
    m_min_label = min_label;
    m_dim = m_features->get_dim_feature_space();

    // allocate memory for distributions' parameters and a priori probability
    m_means.vector = SG_MALLOC(float64_t, m_num_classes*m_dim);
    m_means.vlen = m_num_classes*m_dim;

    m_variances.vector = SG_MALLOC(float64_t, m_num_classes*m_dim);
    m_variances.vlen = m_num_classes*m_dim;

    m_label_prob.vector = SG_MALLOC(float64_t, m_num_classes);
    m_label_prob.vlen = m_num_classes;

    // allocate memory for label rates
    m_rates.vector = SG_MALLOC(float64_t, m_num_classes);
    m_rates.vlen = m_num_classes;

    // assure that memory is allocated
    ASSERT(m_means.vector);
    ASSERT(m_variances.vector);
    ASSERT(m_rates.vector);
    ASSERT(m_label_prob.vector);

    // fill the arrays with zeros before accumulating
    for (i=0; i<m_num_classes*m_dim; i++)
    {
        m_means.vector[i] = 0.0;
        m_variances.vector[i] = 0.0;
    }
    for (i=0; i<m_num_classes; i++)
    {
        m_label_prob.vector[i] = 0.0;
        m_rates.vector[i] = 0.0;
    }

    SGMatrix<float64_t> feature_matrix = m_features->get_computed_dot_feature_matrix();

    // accumulate per-class feature sums and class counts
    for (i=0; i<train_labels.vlen; i++)
    {
        for (j=0; j<m_dim; j++)
            m_means.vector[m_dim*train_labels.vector[i]+j] += feature_matrix.matrix[i*m_dim+j];

        m_label_prob.vector[train_labels.vector[i]] += 1.0;
    }

    // turn the per-class sums into means
    for (i=0; i<m_num_classes; i++)
    {
        for (j=0; j<m_dim; j++)
            m_means.vector[m_dim*i+j] /= m_label_prob.vector[i];
    }

    // accumulate squared residuals using the means computed above
    for (i=0; i<train_labels.vlen; i++)
    {
        for (j=0; j<m_dim; j++)
            m_variances.vector[m_dim*train_labels.vector[i]+j] +=
                CMath::sq(feature_matrix.matrix[i*m_dim+j]-m_means.vector[m_dim*train_labels.vector[i]+j]);
    }

    // turn the residuals into unbiased per-class variances (n-1 correction)
    for (i=0; i<m_num_classes; i++)
    {
        for (j=0; j<m_dim; j++)
            m_variances.vector[m_dim*i+j] /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;
    }

    // get a priori probabilities of labels: class counts normalised by the
    // total number of training examples (dividing by m_num_classes, as the
    // original code did, does not yield probabilities)
    for (i=0; i<m_num_classes; i++)
    {
        m_label_prob.vector[i] /= train_labels.vlen;
    }

    train_labels.free_vector();

    return true;
}

CLabels* CGaussianNaiveBayes::apply()
{
    // init number of vectors
    int32_t n = m_features->get_num_vectors();

    // init result labels
    CLabels* result = new CLabels(n);

    // classify each example of data
    for (int i=0; i<n; i++)
        result->set_label(i, apply(i));

    return result;
}

CLabels* CGaussianNaiveBayes::apply(CFeatures* data)
{
    // check data correctness
    if (!data)
        SG_ERROR("No features specified\n");
    if (!data->has_property(FP_DOT))
        SG_ERROR("Specified features are not of type CDotFeatures\n");

    // set features to classify
    set_features((CDotFeatures*)data);

    // classify using features
    return apply();
}

float64_t CGaussianNaiveBayes::apply(int32_t idx)
{
    // get [idx] feature vector
    SGVector<float64_t> feature_vector = m_features->get_computed_dot_feature_vector(idx);

    // init loop variables
    int i,k;

    // rate all labels
    for (i=0; i<m_num_classes; i++)
    {
        // set rate to 0.0 if a priori probability is 0.0 and continue
        if (m_label_prob.vector[i]==0.0)
        {
            m_rates.vector[i] = 0.0;
            continue;
        }
        else
            m_rates.vector[i] = m_label_prob.vector[i];

        // multiply in the conditional gaussian probability of each feature
        for (k=0; k<m_dim; k++)
            m_rates.vector[i] *= normal_exp(feature_vector.vector[k],i,k)/CMath::sqrt(m_variances.vector[i*m_dim+k]);
    }

    // the feature vector is no longer needed once the rates are computed
    feature_vector.free_vector();

    // find label with maximum rate
    int32_t max_label_idx = 0;

    for (i=0; i<m_num_classes; i++)
    {
        if (m_rates.vector[i]>m_rates.vector[max_label_idx])
            max_label_idx = i;
    }

    return max_label_idx+m_min_label;
}
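
Taken together, train() fits and apply() evaluates the standard Gaussian naive Bayes decision rule. In the notation of this file (class index i after subtracting m_min_label, feature index k, dimensionality d = m_dim), a vector x is assigned to

    \hat{y}(x) = \arg\max_{i} \; p(i) \prod_{k=1}^{d} \frac{1}{\sqrt{\sigma_{i,k}^2}} \exp\!\left(-\frac{(x_k - \mu_{i,k})^2}{2\,\sigma_{i,k}^2}\right)

where p(i) is m_label_prob, \mu_{i,k} is m_means and \sigma_{i,k}^2 is m_variances (estimated with the n-1 Bessel correction above). The constant factor (2\pi)^{-d/2} is identical for every class and is therefore dropped; judging by its use in the loop, the private helper normal_exp supplies the exponential factor while the division by \sqrt{\sigma_{i,k}^2} is done explicitly.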
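
For readers who want to try the class, the fragment below is a minimal usage sketch, not part of the Shogun sources. It assumes the Shogun 1.1.0 static API: init_shogun()/exit_shogun(), a CSimpleFeatures<float64_t> backed by a column-major SGMatrix (one example per column), and a CLabels built from an SGVector<float64_t>; the allocating SGMatrix/SGVector constructors and the CSimpleFeatures constructor taking an SGMatrix are assumptions that may need adjusting for other releases.

#include <shogun/base/init.h>
#include <shogun/features/SimpleFeatures.h>
#include <shogun/features/Labels.h>
#include <shogun/classifier/GaussianNaiveBayes.h>

using namespace shogun;

int main()
{
    init_shogun();

    // four 2-dimensional examples, one per column:
    // two clustered near (0,0), two near (10,10)
    SGMatrix<float64_t> data(2, 4);
    float64_t values[] = {0.0, 0.1,   0.2, 0.0,   10.0, 10.1,   9.9, 10.0};
    for (int32_t i=0; i<8; i++)
        data.matrix[i] = values[i];

    // integer class labels 1 and 2 (any range works; the minimum is
    // subtracted in train() and added back in apply())
    SGVector<float64_t> lab(4);
    lab.vector[0]=1; lab.vector[1]=1; lab.vector[2]=2; lab.vector[3]=2;

    CSimpleFeatures<float64_t>* features = new CSimpleFeatures<float64_t>(data);
    CLabels* labels = new CLabels(lab);

    CGaussianNaiveBayes* gnb = new CGaussianNaiveBayes(features, labels);
    gnb->train();

    // classify the training data itself
    CLabels* output = gnb->apply();
    for (int32_t i=0; i<output->get_num_labels(); i++)
        SG_SPRINT("example %d -> label %.0f\n", i, output->get_label(i));

    SG_UNREF(output);
    SG_UNREF(gnb);
    exit_shogun();
    return 0;
}

Because apply(int32_t) returns max_label_idx+m_min_label, the predictions here come out as 1 and 2, matching the original label range rather than the internal zero-based class indices.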