SHOGUN
v1.1.0
/*
   SVM with stochastic gradient
   Copyright (C) 2007- Leon Bottou

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
   $Id: svmsgd.cpp,v 1.13 2007/10/02 20:40:06 cvs Exp $

   Shogun adjustments (w) 2008-2009 Soeren Sonnenburg
*/

#include <shogun/classifier/svm/SVMSGD.h>
#include <shogun/base/Parameter.h>
#include <shogun/lib/Signal.h>
#include <shogun/loss/HingeLoss.h>

using namespace shogun;

CSVMSGD::CSVMSGD()
: CLinearMachine()
{
    init();
}

CSVMSGD::CSVMSGD(float64_t C)
: CLinearMachine()
{
    init();

    C1=C;
    C2=C;
}

CSVMSGD::CSVMSGD(float64_t C, CDotFeatures* traindat, CLabels* trainlab)
: CLinearMachine()
{
    init();
    C1=C;
    C2=C;

    set_features(traindat);
    set_labels(trainlab);
}

CSVMSGD::~CSVMSGD()
{
    SG_UNREF(loss);
}

void CSVMSGD::set_loss_function(CLossFunction* loss_func)
{
    if (loss)
        SG_UNREF(loss);
    loss=loss_func;
    SG_REF(loss);
}

bool CSVMSGD::train_machine(CFeatures* data)
{
    // allocate memory for w and initialize everything (w and bias) with 0
    ASSERT(labels);

    if (data)
    {
        if (!data->has_property(FP_DOT))
            SG_ERROR("Specified features are not of type CDotFeatures\n");
        set_features((CDotFeatures*) data);
    }

    ASSERT(features);
    ASSERT(labels->is_two_class_labeling());

    int32_t num_train_labels=labels->get_num_labels();
    w_dim=features->get_dim_feature_space();
    int32_t num_vec=features->get_num_vectors();

    ASSERT(num_vec==num_train_labels);
    ASSERT(num_vec>0);

    SG_FREE(w);
    w=SG_MALLOC(float64_t, w_dim);
    memset(w, 0, w_dim*sizeof(float64_t));
    bias=0;

    float64_t lambda= 1.0/(C1*num_vec);

    // Shift t in order to have a
    // reasonable initial learning rate.
    // This assumes |x| \approx 1.
    float64_t maxw = 1.0 / sqrt(lambda);
    float64_t typw = sqrt(maxw);
    float64_t eta0 = typw / CMath::max(1.0,-loss->first_derivative(-typw,1));
    t = 1 / (eta0 * lambda);

    SG_INFO("lambda=%f, epochs=%d, eta0=%f\n", lambda, epochs, eta0);

    //do the sgd
    calibrate();

    SG_INFO("Training on %d vectors\n", num_vec);
    CSignal::clear_cancel();

    ELossType loss_type = loss->get_loss_type();
    bool is_log_loss = false;
    if ((loss_type == L_LOGLOSS) || (loss_type == L_LOGLOSSMARGIN))
        is_log_loss = true;

    for (int32_t e=0; e<epochs && (!CSignal::cancel_computations()); e++)
    {
        count = skip;
        for (int32_t i=0; i<num_vec; i++)
        {
            float64_t eta = 1.0 / (lambda * t);
            float64_t y = labels->get_label(i);
            float64_t z = y * (features->dense_dot(i, w, w_dim) + bias);

            if (z < 1 || is_log_loss)
            {
                float64_t etd = -eta * loss->first_derivative(z,1);
                features->add_to_dense_vec(etd * y / wscale, i, w, w_dim);

                if (use_bias)
                {
                    if (use_regularized_bias)
                        bias *= 1 - eta * lambda * bscale;
                    bias += etd * y * bscale;
                }
            }

            if (--count <= 0)
            {
                float64_t r = 1 - eta * lambda * skip;
                if (r < 0.8)
                    r = pow(1 - eta * lambda, skip);
                CMath::scale_vector(r, w, w_dim);
                count = skip;
            }
            t++;
        }
    }

    float64_t wnorm = CMath::dot(w,w, w_dim);
    SG_INFO("Norm: %.6f, Bias: %.6f\n", wnorm, bias);

    return true;
}

void CSVMSGD::calibrate()
{
    ASSERT(features);
    int32_t num_vec=features->get_num_vectors();
    int32_t c_dim=features->get_dim_feature_space();

    ASSERT(num_vec>0);
    ASSERT(c_dim>0);

    float64_t* c=SG_MALLOC(float64_t, c_dim);
    memset(c, 0, c_dim*sizeof(float64_t));

    SG_INFO("Estimating sparsity and bscale num_vec=%d num_feat=%d.\n", num_vec, c_dim);

    // compute average gradient size
    int32_t n = 0;
    float64_t m = 0;
    float64_t r = 0;

    for (int32_t j=0; j<num_vec && m<=1000; j++, n++)
    {
        r += features->get_nnz_features_for_vector(j);
        features->add_to_dense_vec(1, j, c, c_dim, true);

        //waste cpu cycles for readability
        //(only changed dims need checking)
        m=CMath::max(c, c_dim);
    }

    // bias update scaling
    bscale = 0.5*m/n;

    // compute weight decay skip
    skip = (int32_t) ((16 * n * c_dim) / r);
    SG_INFO("using %d examples. skip=%d bscale=%.6f\n", n, skip, bscale);

    SG_FREE(c);
}

void CSVMSGD::init()
{
    t=1;
    C1=1;
    C2=1;
    wscale=1;
    bscale=1;
    epochs=5;
    skip=1000;
    count=1000;
    use_bias=true;

    use_regularized_bias=false;

    loss=new CHingeLoss();
    SG_REF(loss);

    m_parameters->add(&C1, "C1", "Cost constant 1.");
    m_parameters->add(&C2, "C2", "Cost constant 2.");
    m_parameters->add(&wscale, "wscale", "W scale");
    m_parameters->add(&bscale, "bscale", "b scale");
    m_parameters->add(&epochs, "epochs", "epochs");
    m_parameters->add(&skip, "skip", "skip");
    m_parameters->add(&count, "count", "count");
    m_parameters->add(&use_bias, "use_bias", "Indicates if bias is used.");
    m_parameters->add(&use_regularized_bias, "use_regularized_bias", "Indicates if bias is regularized.");
}
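
The inner loop of train_machine() above is Bottou's plain SGD update for the primal L2-regularized hinge-loss objective: the learning rate decays as eta = 1/(lambda*t) with t shifted so that the initial rate eta0 is reasonable, the L2 weight decay is applied only every `skip` iterations with a compensated factor, and by default the bias receives an unregularized update. The self-contained sketch below mirrors that loop on a tiny synthetic problem for illustration only; the toy data, variable names, and the simplification bscale = wscale = 1 are assumptions made here and are not part of the Shogun sources.

// Standalone illustration of the update in CSVMSGD::train_machine().
// Toy data and all names below are invented for this example.
#include <cmath>
#include <cstdio>

int main()
{
    // Tiny linearly separable 2-D problem.
    const int num_vec = 4, dim = 2;
    double X[num_vec][dim] = {{1.0, 0.0}, {0.8, 0.1}, {0.0, 1.0}, {0.1, 0.9}};
    double Y[num_vec] = {+1, +1, -1, -1};

    double C = 1.0;
    double lambda = 1.0 / (C * num_vec); // same mapping from C to lambda as above
    int epochs = 5, skip = 2, count = skip;

    // Shift t to get a reasonable initial learning rate (assumes |x| ~ 1).
    double maxw = 1.0 / std::sqrt(lambda);
    double typw = std::sqrt(maxw);
    double eta0 = typw;                  // hinge loss: max(1, -l'(-typw)) == 1
    double t = 1.0 / (eta0 * lambda);

    double w[dim] = {0.0, 0.0};
    double bias = 0.0;

    for (int e = 0; e < epochs; e++)
    {
        for (int i = 0; i < num_vec; i++)
        {
            double eta = 1.0 / (lambda * t);
            double z = Y[i] * (w[0] * X[i][0] + w[1] * X[i][1] + bias);

            if (z < 1)                   // hinge loss active, l'(z) = -1
            {
                double etd = eta;        // etd = -eta * l'(z)
                for (int d = 0; d < dim; d++)
                    w[d] += etd * Y[i] * X[i][d];
                bias += etd * Y[i];      // unregularized bias update (bscale = 1)
            }

            // Weight decay skip trick: shrink w only every `skip` steps,
            // using the accumulated factor.
            if (--count <= 0)
            {
                double r = 1 - eta * lambda * skip;
                if (r < 0.8)
                    r = std::pow(1 - eta * lambda, skip);
                for (int d = 0; d < dim; d++)
                    w[d] *= r;
                count = skip;
            }
            t++;
        }
    }

    std::printf("w = (%.4f, %.4f), bias = %.4f\n", w[0], w[1], bias);
    return 0;
}

Compared with the member-based implementation above, this sketch omits calibrate(): there `skip` and `bscale` are estimated from the data sparsity, whereas here they are fixed constants chosen for the toy example.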