SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Gunnar Raetsch 00008 * Written (W) 1999-2009 Soeren Sonnenburg 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/preprocessor/PruneVarSubMean.h> 00013 #include <shogun/preprocessor/SimplePreprocessor.h> 00014 #include <shogun/features/Features.h> 00015 #include <shogun/features/SimpleFeatures.h> 00016 #include <shogun/io/SGIO.h> 00017 #include <shogun/mathematics/Math.h> 00018 00019 using namespace shogun; 00020 00021 CPruneVarSubMean::CPruneVarSubMean(bool divide) 00022 : CSimplePreprocessor<float64_t>(), idx(NULL), mean(NULL), 00023 std(NULL), num_idx(0), divide_by_std(divide), initialized(false) 00024 { 00025 } 00026 00027 CPruneVarSubMean::~CPruneVarSubMean() 00028 { 00029 cleanup(); 00030 } 00031 00033 bool CPruneVarSubMean::init(CFeatures* features) 00034 { 00035 if (!initialized) 00036 { 00037 ASSERT(features->get_feature_class()==C_SIMPLE); 00038 ASSERT(features->get_feature_type()==F_DREAL); 00039 00040 CSimpleFeatures<float64_t>* simple_features=(CSimpleFeatures<float64_t>*) features; 00041 int32_t num_examples = simple_features->get_num_vectors(); 00042 int32_t num_features = simple_features->get_num_features(); 00043 00044 SG_FREE(mean); 00045 SG_FREE(idx); 00046 SG_FREE(std); 00047 mean=NULL; 00048 idx=NULL; 00049 std=NULL; 00050 00051 mean=SG_MALLOC(float64_t, num_features); 00052 float64_t* var=SG_MALLOC(float64_t, num_features); 00053 int32_t i,j; 00054 00055 for (i=0; i<num_features; i++) 00056 { 00057 mean[i]=0; 00058 var[i]=0 ; 00059 } 00060 00061 SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix(); 00062 00063 // compute mean 00064 for (i=0; i<num_examples; i++) 00065 { 00066 for (j=0; j<num_features; j++) 00067 mean[j]+=feature_matrix.matrix[i*num_features+j]; 00068 } 00069 00070 for (j=0; j<num_features; j++) 00071 mean[j]/=num_examples; 00072 00073 // compute var 00074 for (i=0; i<num_examples; i++) 00075 { 00076 for (j=0; j<num_features; j++) 00077 var[j]+=CMath::sq(mean[j]-feature_matrix.matrix[i*num_features+j]); 00078 } 00079 00080 int32_t num_ok=0; 00081 int32_t* idx_ok=SG_MALLOC(int, num_features); 00082 00083 for (j=0; j<num_features; j++) 00084 { 00085 var[j]/=num_examples; 00086 00087 if (var[j]>=1e-14) 00088 { 00089 idx_ok[num_ok]=j; 00090 num_ok++ ; 00091 } 00092 } 00093 00094 SG_INFO( "Reducing number of features from %i to %i\n", num_features, num_ok) ; 00095 00096 SG_FREE(idx); 00097 idx=SG_MALLOC(int, num_ok); 00098 float64_t* new_mean=SG_MALLOC(float64_t, num_ok); 00099 std=SG_MALLOC(float64_t, num_ok); 00100 00101 for (j=0; j<num_ok; j++) 00102 { 00103 idx[j]=idx_ok[j] ; 00104 new_mean[j]=mean[idx_ok[j]]; 00105 std[j]=sqrt(var[idx_ok[j]]); 00106 } 00107 num_idx = num_ok ; 00108 SG_FREE(idx_ok); 00109 SG_FREE(mean); 00110 SG_FREE(var); 00111 mean = new_mean; 00112 00113 initialized = true; 00114 return true; 00115 } 00116 else 00117 return false; 00118 } 00119 00121 void CPruneVarSubMean::cleanup() 00122 { 00123 SG_FREE(idx); 00124 idx=NULL; 00125 SG_FREE(mean); 00126 mean=NULL; 00127 SG_FREE(std); 00128 std=NULL; 00129 } 00130 00134 SGMatrix<float64_t> CPruneVarSubMean::apply_to_feature_matrix(CFeatures* features) 00135 { 00136 ASSERT(initialized); 00137 00138 int32_t num_vectors=0; 00139 int32_t num_features=0; 00140 float64_t* m=((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors); 00141 00142 SG_INFO( "get Feature matrix: %ix%i\n", num_vectors, num_features); 00143 SG_INFO( "Preprocessing feature matrix\n"); 00144 for (int32_t vec=0; vec<num_vectors; vec++) 00145 { 00146 float64_t* v_src=&m[num_features*vec]; 00147 float64_t* v_dst=&m[num_idx*vec]; 00148 00149 if (divide_by_std) 00150 { 00151 for (int32_t feat=0; feat<num_idx; feat++) 00152 v_dst[feat]=(v_src[idx[feat]]-mean[feat])/std[feat]; 00153 } 00154 else 00155 { 00156 for (int32_t feat=0; feat<num_idx; feat++) 00157 v_dst[feat]=(v_src[idx[feat]]-mean[feat]); 00158 } 00159 } 00160 00161 ((CSimpleFeatures<float64_t>*) features)->set_num_features(num_idx); 00162 ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors); 00163 SG_INFO( "new Feature matrix: %ix%i\n", num_vectors, num_features); 00164 00165 return ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(); 00166 } 00167 00170 SGVector<float64_t> CPruneVarSubMean::apply_to_feature_vector(SGVector<float64_t> vector) 00171 { 00172 float64_t* ret=NULL; 00173 00174 if (initialized) 00175 { 00176 ret=SG_MALLOC(float64_t, num_idx); 00177 00178 if (divide_by_std) 00179 { 00180 for (int32_t i=0; i<num_idx; i++) 00181 ret[i]=(vector.vector[idx[i]]-mean[i])/std[i]; 00182 } 00183 else 00184 { 00185 for (int32_t i=0; i<num_idx; i++) 00186 ret[i]=(vector.vector[idx[i]]-mean[i]); 00187 } 00188 } 00189 else 00190 { 00191 ret=SG_MALLOC(float64_t, vector.vlen); 00192 for (int32_t i=0; i<vector.vlen; i++) 00193 ret[i]=vector.vector[i]; 00194 } 00195 00196 return SGVector<float64_t>(ret,num_idx); 00197 }