SHOGUN
v1.1.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Parts of this code are copyright (c) 2009 Yahoo! Inc. 00008 * All rights reserved. The copyrights embodied in the content of 00009 * this file are licensed under the BSD (revised) open source license. 00010 * 00011 * Written (W) 2010 Soeren Sonnenburg 00012 * Copyright (C) 2010 Berlin Institute of Technology 00013 */ 00014 00015 #include <shogun/features/SparseFeatures.h> 00016 #include <shogun/io/File.h> 00017 #include <shogun/io/AsciiFile.h> 00018 #include <shogun/mathematics/Math.h> 00019 #include <ctype.h> 00020 #include <stdio.h> 00021 00022 using namespace shogun; 00023 00024 CAsciiFile::CAsciiFile() 00025 { 00026 SG_UNSTABLE("CAsciiFile::CAsciiFile()", "\n"); 00027 } 00028 00029 CAsciiFile::CAsciiFile(FILE* f, const char* name) : CFile(f, name) 00030 { 00031 } 00032 00033 CAsciiFile::CAsciiFile(char* fname, char rw, const char* name) : CFile(fname, rw, name) 00034 { 00035 } 00036 00037 CAsciiFile::~CAsciiFile() 00038 { 00039 } 00040 00041 #define GET_VECTOR(fname, mfname, sg_type) \ 00042 void CAsciiFile::fname(sg_type*& vec, int32_t& len) \ 00043 { \ 00044 vec=NULL; \ 00045 len=0; \ 00046 int32_t num_feat=0; \ 00047 int32_t num_vec=0; \ 00048 mfname(vec, num_feat, num_vec); \ 00049 if ((num_feat==1) || (num_vec==1)) \ 00050 { \ 00051 if (num_feat==1) \ 00052 len=num_vec; \ 00053 else \ 00054 len=num_feat; \ 00055 } \ 00056 else \ 00057 { \ 00058 SG_FREE(vec); \ 00059 vec=NULL; \ 00060 len=0; \ 00061 SG_ERROR("Could not read vector from" \ 00062 " file %s (shape %dx%d found but " \ 00063 "vector expected).\n", filename, \ 00064 num_vec, num_feat); \ 00065 } \ 00066 } 00067 00068 GET_VECTOR(get_vector, get_matrix, uint8_t) 00069 GET_VECTOR(get_vector, get_matrix, char) 00070 GET_VECTOR(get_vector, get_matrix, int32_t) 00071 GET_VECTOR(get_vector, get_matrix, float32_t) 00072 GET_VECTOR(get_vector, get_matrix, float64_t) 00073 GET_VECTOR(get_vector, get_matrix, int16_t) 00074 GET_VECTOR(get_vector, get_matrix, uint16_t) 00075 #undef GET_VECTOR 00076 00077 #define GET_MATRIX(fname, conv, sg_type) \ 00078 void CAsciiFile::fname(sg_type*& matrix, int32_t& num_feat, int32_t& num_vec) \ 00079 { \ 00080 struct stat stats; \ 00081 if (stat(filename, &stats)!=0) \ 00082 SG_ERROR("Could not get file statistics.\n"); \ 00083 \ 00084 char* data=SG_MALLOC(char, stats.st_size+1); \ 00085 memset(data, 0, sizeof(char)*(stats.st_size+1)); \ 00086 size_t nread=fread(data, sizeof(char), stats.st_size, file); \ 00087 if (nread<=0) \ 00088 SG_ERROR("Could not read data from %s.\n", filename); \ 00089 \ 00090 SG_DEBUG("data read from file:\n%s\n", data); \ 00091 \ 00092 /* determine num_feat and num_vec, populate dynamic array */ \ 00093 int32_t nf=0; \ 00094 num_feat=0; \ 00095 num_vec=0; \ 00096 char* ptr_item=NULL; \ 00097 char* ptr_data=data; \ 00098 DynArray<char*>* items=new DynArray<char*>(); \ 00099 \ 00100 while (*ptr_data) \ 00101 { \ 00102 if (*ptr_data=='\n') \ 00103 { \ 00104 if (ptr_item) \ 00105 nf++; \ 00106 \ 00107 if (num_feat!=0 && nf!=num_feat) \ 00108 SG_ERROR("Number of features mismatches (%d != %d) in vector" \ 00109 " %d in file %s.\n", num_feat, nf, num_vec, filename); \ 00110 \ 00111 append_item(items, ptr_data, ptr_item); \ 00112 num_feat=nf; \ 00113 num_vec++; \ 00114 nf=0; \ 00115 ptr_item=NULL; \ 00116 } \ 00117 else if (!isblank(*ptr_data) && !ptr_item) \ 00118 { \ 00119 ptr_item=ptr_data; \ 00120 } \ 00121 else if (isblank(*ptr_data) && ptr_item) \ 00122 { \ 00123 append_item(items, ptr_data, ptr_item); \ 00124 ptr_item=NULL; \ 00125 nf++; \ 00126 } \ 00127 \ 00128 ptr_data++; \ 00129 } \ 00130 \ 00131 SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec); \ 00132 SG_FREE(data); \ 00133 \ 00134 /* now copy data into matrix */ \ 00135 matrix=SG_MALLOC(sg_type, num_vec*num_feat); \ 00136 for (int32_t i=0; i<num_vec; i++) \ 00137 { \ 00138 for (int32_t j=0; j<num_feat; j++) \ 00139 { \ 00140 char* item=items->get_element(i*num_feat+j); \ 00141 matrix[i*num_feat+j]=conv(item); \ 00142 SG_FREE(item); \ 00143 } \ 00144 } \ 00145 delete items; \ 00146 } 00147 00148 GET_MATRIX(get_matrix, atoi, uint8_t) 00149 GET_MATRIX(get_int8_matrix, atoi, int8_t) 00150 GET_MATRIX(get_matrix, atoi, char) 00151 GET_MATRIX(get_matrix, atoi, int32_t) 00152 GET_MATRIX(get_uint_matrix, atoi, uint32_t) 00153 GET_MATRIX(get_long_matrix, atoll, int64_t) 00154 GET_MATRIX(get_ulong_matrix, atoll, uint64_t) 00155 GET_MATRIX(get_matrix, atof, float32_t) 00156 GET_MATRIX(get_matrix, atof, float64_t) 00157 GET_MATRIX(get_longreal_matrix, atof, floatmax_t) 00158 GET_MATRIX(get_matrix, atoi, int16_t) 00159 GET_MATRIX(get_matrix, atoi, uint16_t) 00160 #undef GET_MATRIX 00161 00162 #define GET_NDARRAY(fname, conv, sg_type) \ 00163 void CAsciiFile::fname(sg_type*& array, int32_t *& dims, int32_t & num_dims) \ 00164 { \ 00165 struct stat stats; \ 00166 if (stat(filename, &stats)!=0) \ 00167 SG_ERROR("Could not get file statistics.\n"); \ 00168 \ 00169 char* data=SG_MALLOC(char, stats.st_size+1); \ 00170 memset(data, 0, sizeof(char)*(stats.st_size+1)); \ 00171 size_t nread=fread(data, sizeof(char), stats.st_size, file); \ 00172 if (nread<=0) \ 00173 SG_ERROR("Could not read data from %s.\n", filename); \ 00174 \ 00175 SG_DEBUG("data read from file:\n%s\n", data); \ 00176 \ 00177 /* determine size of array */ \ 00178 int32_t length=0; \ 00179 int32_t counter=0; \ 00180 size_t total=0; \ 00181 num_dims = -1; \ 00182 char* ptr_item=NULL; \ 00183 char* ptr_data=data; \ 00184 DynArray<char*>* items=new DynArray<char*>(); \ 00185 \ 00186 /* read line with sizes of array*/ \ 00187 while(*ptr_data != '\n') \ 00188 { \ 00189 if(isblank(*ptr_data) && ptr_item) \ 00190 { \ 00191 append_item(items, ptr_data, ptr_item); \ 00192 num_dims++; \ 00193 ptr_item = NULL; \ 00194 } \ 00195 else if(!isblank(*ptr_data) && !ptr_item) \ 00196 ptr_item = ptr_data; \ 00197 \ 00198 ptr_data++; \ 00199 } \ 00200 ptr_item = NULL; \ 00201 ptr_data++; \ 00202 \ 00203 /* read array data*/ \ 00204 while(*ptr_data) \ 00205 { \ 00206 if (*ptr_data=='\n') \ 00207 { \ 00208 if (ptr_item) \ 00209 counter++; \ 00210 \ 00211 if (length!=0 && counter!=length) \ 00212 SG_ERROR("Invalid number of data (%d != %d) in line" \ 00213 " %d in file %s.\n", length, counter, total, filename); \ 00214 \ 00215 append_item(items, ptr_data, ptr_item); \ 00216 length=counter; \ 00217 total++; \ 00218 counter=0; \ 00219 ptr_item=NULL; \ 00220 } \ 00221 else if (!isblank(*ptr_data) && !ptr_item) \ 00222 { \ 00223 ptr_item=ptr_data; \ 00224 } \ 00225 else if (isblank(*ptr_data) && ptr_item) \ 00226 { \ 00227 append_item(items, ptr_data, ptr_item); \ 00228 ptr_item=NULL; \ 00229 counter++; \ 00230 } \ 00231 \ 00232 ptr_data++; \ 00233 } \ 00234 \ 00235 SG_DEBUG("num of data in line: %d, num of lines %d\n", counter, total); \ 00236 SG_FREE(data); \ 00237 \ 00238 /* determining sizes of dimensions*/ \ 00239 char * item; \ 00240 item=items->get_element(0); \ 00241 if(atoi(item) != num_dims) \ 00242 SG_ERROR("Invalid number of dimensions!\n"); \ 00243 SG_FREE(item); \ 00244 dims = SG_MALLOC(int32_t, num_dims); \ 00245 for(int32_t i =0;i < num_dims;i++) \ 00246 { \ 00247 item = items->get_element(i+1); \ 00248 dims[i] = atoi(item); \ 00249 SG_FREE(item); \ 00250 } \ 00251 if (dims[num_dims-1] != length) \ 00252 SG_ERROR("Invalid number of lines in file!\n"); \ 00253 \ 00254 /* converting array data */ \ 00255 total *= length; \ 00256 array=SG_MALLOC(sg_type, total); \ 00257 for (size_t i=0; i<total; i++) \ 00258 { \ 00259 item=items->get_element(i+(num_dims+1)); \ 00260 array[i]=conv(item); \ 00261 SG_FREE(item); \ 00262 } \ 00263 delete items; \ 00264 } 00265 00266 GET_NDARRAY(get_ndarray, atoi, uint8_t) 00267 GET_NDARRAY(get_int8_ndarray, atoi, int8_t) 00268 GET_NDARRAY(get_ndarray, atoi, char) 00269 GET_NDARRAY(get_ndarray, atoi, int32_t) 00270 GET_NDARRAY(get_uint_ndarray, atoi, uint32_t) 00271 GET_NDARRAY(get_long_ndarray, atoll, int64_t) 00272 GET_NDARRAY(get_ulong_ndarray, atoll, uint64_t) 00273 GET_NDARRAY(get_ndarray, atof, float32_t) 00274 GET_NDARRAY(get_ndarray, atof, float64_t) 00275 GET_NDARRAY(get_longreal_ndarray, atof, floatmax_t) 00276 GET_NDARRAY(get_ndarray, atoi, int16_t) 00277 GET_NDARRAY(get_ndarray, atoi, uint16_t) 00278 #undef GET_NDARRAY 00279 00280 #define GET_SPARSEMATRIX(fname, conv, sg_type) \ 00281 void CAsciiFile::fname(SGSparseVector<sg_type>*& matrix, int32_t& num_feat, int32_t& num_vec) \ 00282 { \ 00283 size_t blocksize=1024*1024; \ 00284 size_t required_blocksize=blocksize; \ 00285 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); \ 00286 \ 00287 if (file) \ 00288 { \ 00289 num_vec=0; \ 00290 num_feat=0; \ 00291 \ 00292 SG_INFO("counting line numbers in file %s\n", filename); \ 00293 size_t sz=blocksize; \ 00294 size_t block_offs=0; \ 00295 size_t old_block_offs=0; \ 00296 fseek(file, 0, SEEK_END); \ 00297 size_t fsize=ftell(file); \ 00298 rewind(file); \ 00299 \ 00300 while (sz == blocksize) \ 00301 { \ 00302 sz=fread(dummy, sizeof(uint8_t), blocksize, file); \ 00303 for (size_t i=0; i<sz; i++) \ 00304 { \ 00305 block_offs++; \ 00306 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \ 00307 { \ 00308 num_vec++; \ 00309 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); \ 00310 old_block_offs=block_offs; \ 00311 } \ 00312 } \ 00313 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); \ 00314 } \ 00315 \ 00316 SG_INFO("found %d feature vectors\n", num_vec); \ 00317 SG_FREE(dummy); \ 00318 blocksize=required_blocksize; \ 00319 dummy = SG_MALLOC(uint8_t, blocksize+1); /*allow setting of '\0' at EOL*/ \ 00320 matrix=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \ 00321 \ 00322 rewind(file); \ 00323 sz=blocksize; \ 00324 int32_t lines=0; \ 00325 while (sz == blocksize) \ 00326 { \ 00327 sz=fread(dummy, sizeof(uint8_t), blocksize, file); \ 00328 \ 00329 size_t old_sz=0; \ 00330 for (size_t i=0; i<sz; i++) \ 00331 { \ 00332 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) \ 00333 { \ 00334 size_t len=i-old_sz+1; \ 00335 uint8_t* data=&dummy[old_sz]; \ 00336 \ 00337 for (size_t j=0; j<len; j++) \ 00338 dummy[j]=data[j]; \ 00339 \ 00340 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file); \ 00341 i=0; \ 00342 old_sz=0; \ 00343 sz+=len; \ 00344 } \ 00345 \ 00346 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) \ 00347 { \ 00348 \ 00349 size_t len=i-old_sz; \ 00350 uint8_t* data=&dummy[old_sz]; \ 00351 \ 00352 int32_t dims=0; \ 00353 for (size_t j=0; j<len; j++) \ 00354 { \ 00355 if (data[j]==':') \ 00356 dims++; \ 00357 } \ 00358 \ 00359 if (dims<=0) \ 00360 { \ 00361 SG_ERROR("Error in line %d - number of" \ 00362 " dimensions is %d line is %d characters" \ 00363 " long\n line_content:'%.*s'\n", lines, \ 00364 dims, len, len, (const char*) data); \ 00365 } \ 00366 \ 00367 SGSparseVectorEntry<sg_type>* feat=SG_MALLOC(SGSparseVectorEntry<sg_type>, dims); \ 00368 \ 00369 /* skip label part */ \ 00370 size_t j=0; \ 00371 for (; j<len; j++) \ 00372 { \ 00373 if (data[j]==':') \ 00374 { \ 00375 j=-1; /* file without label*/ \ 00376 break; \ 00377 } \ 00378 \ 00379 if (data[j]==' ') \ 00380 { \ 00381 data[j]='\0'; \ 00382 \ 00383 /* skip label part */ \ 00384 break; \ 00385 } \ 00386 } \ 00387 \ 00388 int32_t d=0; \ 00389 j++; \ 00390 uint8_t* start=&data[j]; \ 00391 for (; j<len; j++) \ 00392 { \ 00393 if (data[j]==':') \ 00394 { \ 00395 data[j]='\0'; \ 00396 \ 00397 feat[d].feat_index=(int32_t) atoi((const char*) start)-1; \ 00398 num_feat=CMath::max(num_feat, feat[d].feat_index+1); \ 00399 \ 00400 j++; \ 00401 start=&data[j]; \ 00402 for (; j<len; j++) \ 00403 { \ 00404 if (data[j]==' ' || data[j]=='\n') \ 00405 { \ 00406 data[j]='\0'; \ 00407 feat[d].entry=(sg_type) conv((const char*) start); \ 00408 d++; \ 00409 break; \ 00410 } \ 00411 } \ 00412 \ 00413 if (j==len) \ 00414 { \ 00415 data[j]='\0'; \ 00416 feat[dims-1].entry=(sg_type) conv((const char*) start); \ 00417 } \ 00418 \ 00419 j++; \ 00420 start=&data[j]; \ 00421 } \ 00422 } \ 00423 \ 00424 matrix[lines].vec_index=lines; \ 00425 matrix[lines].num_feat_entries=dims; \ 00426 matrix[lines].features=feat; \ 00427 \ 00428 old_sz=i+1; \ 00429 lines++; \ 00430 SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t"); \ 00431 } \ 00432 } \ 00433 } \ 00434 \ 00435 SG_INFO("file successfully read\n"); \ 00436 } \ 00437 \ 00438 SG_FREE(dummy); \ 00439 } 00440 00441 GET_SPARSEMATRIX(get_sparse_matrix, atoi, bool) 00442 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint8_t) 00443 GET_SPARSEMATRIX(get_int8_sparsematrix, atoi, int8_t) 00444 GET_SPARSEMATRIX(get_sparse_matrix, atoi, char) 00445 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int32_t) 00446 GET_SPARSEMATRIX(get_uint_sparsematrix, atoi, uint32_t) 00447 GET_SPARSEMATRIX(get_long_sparsematrix, atoll, int64_t) 00448 GET_SPARSEMATRIX(get_ulong_sparsematrix, atoll, uint64_t) 00449 GET_SPARSEMATRIX(get_sparse_matrix, atof, float32_t) 00450 GET_SPARSEMATRIX(get_sparse_matrix, atof, float64_t) 00451 GET_SPARSEMATRIX(get_longreal_sparsematrix, atof, floatmax_t) 00452 GET_SPARSEMATRIX(get_sparse_matrix, atoi, int16_t) 00453 GET_SPARSEMATRIX(get_sparse_matrix, atoi, uint16_t) 00454 #undef GET_SPARSEMATRIX 00455 00456 00457 void CAsciiFile::get_string_list(SGString<uint8_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00458 { 00459 size_t blocksize=1024*1024; 00460 size_t required_blocksize=0; 00461 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); 00462 uint8_t* overflow=NULL; 00463 int32_t overflow_len=0; 00464 00465 if (file) 00466 { 00467 num_str=0; 00468 max_string_len=0; 00469 00470 SG_INFO("counting line numbers in file %s\n", filename); 00471 size_t sz=blocksize; 00472 size_t block_offs=0; 00473 size_t old_block_offs=0; 00474 fseek(file, 0, SEEK_END); 00475 size_t fsize=ftell(file); 00476 rewind(file); 00477 00478 while (sz == blocksize) 00479 { 00480 sz=fread(dummy, sizeof(uint8_t), blocksize, file); 00481 for (size_t i=0; i<sz; i++) 00482 { 00483 block_offs++; 00484 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00485 { 00486 num_str++; 00487 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs); 00488 old_block_offs=block_offs; 00489 } 00490 } 00491 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 00492 } 00493 00494 SG_INFO("found %d strings\n", num_str); 00495 SG_DEBUG("block_size=%d\n", required_blocksize); 00496 SG_FREE(dummy); 00497 blocksize=required_blocksize; 00498 dummy=SG_MALLOC(uint8_t, blocksize); 00499 overflow=SG_MALLOC(uint8_t, blocksize); 00500 strings=SG_MALLOC(SGString<uint8_t>, num_str); 00501 00502 rewind(file); 00503 sz=blocksize; 00504 int32_t lines=0; 00505 size_t old_sz=0; 00506 while (sz == blocksize) 00507 { 00508 sz=fread(dummy, sizeof(uint8_t), blocksize, file); 00509 00510 old_sz=0; 00511 for (size_t i=0; i<sz; i++) 00512 { 00513 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00514 { 00515 int32_t len=i-old_sz; 00516 max_string_len=CMath::max(max_string_len, len+overflow_len); 00517 00518 strings[lines].slen=len+overflow_len; 00519 strings[lines].string=SG_MALLOC(uint8_t, len+overflow_len); 00520 00521 for (int32_t j=0; j<overflow_len; j++) 00522 strings[lines].string[j]=overflow[j]; 00523 for (int32_t j=0; j<len; j++) 00524 strings[lines].string[j+overflow_len]=dummy[old_sz+j]; 00525 00526 // clear overflow 00527 overflow_len=0; 00528 00529 //CMath::display_vector(strings[lines].string, len); 00530 old_sz=i+1; 00531 lines++; 00532 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t"); 00533 } 00534 } 00535 00536 for (size_t i=old_sz; i<sz; i++) 00537 overflow[i-old_sz]=dummy[i]; 00538 00539 overflow_len=sz-old_sz; 00540 } 00541 SG_INFO("file successfully read\n"); 00542 SG_INFO("max_string_length=%d\n", max_string_len); 00543 SG_INFO("num_strings=%d\n", num_str); 00544 } 00545 00546 SG_FREE(dummy); 00547 SG_FREE(overflow); 00548 } 00549 00550 void CAsciiFile::get_int8_string_list(SGString<int8_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00551 { 00552 size_t blocksize=1024*1024; 00553 size_t required_blocksize=0; 00554 int8_t* dummy=SG_MALLOC(int8_t, blocksize); 00555 int8_t* overflow=NULL; 00556 int32_t overflow_len=0; 00557 00558 if (file) 00559 { 00560 num_str=0; 00561 max_string_len=0; 00562 00563 SG_INFO("counting line numbers in file %s\n", filename); 00564 size_t sz=blocksize; 00565 size_t block_offs=0; 00566 size_t old_block_offs=0; 00567 fseek(file, 0, SEEK_END); 00568 size_t fsize=ftell(file); 00569 rewind(file); 00570 00571 while (sz == blocksize) 00572 { 00573 sz=fread(dummy, sizeof(int8_t), blocksize, file); 00574 for (size_t i=0; i<sz; i++) 00575 { 00576 block_offs++; 00577 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00578 { 00579 num_str++; 00580 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs); 00581 old_block_offs=block_offs; 00582 } 00583 } 00584 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 00585 } 00586 00587 SG_INFO("found %d strings\n", num_str); 00588 SG_DEBUG("block_size=%d\n", required_blocksize); 00589 SG_FREE(dummy); 00590 blocksize=required_blocksize; 00591 dummy=SG_MALLOC(int8_t, blocksize); 00592 overflow=SG_MALLOC(int8_t, blocksize); 00593 strings=SG_MALLOC(SGString<int8_t>, num_str); 00594 00595 rewind(file); 00596 sz=blocksize; 00597 int32_t lines=0; 00598 size_t old_sz=0; 00599 while (sz == blocksize) 00600 { 00601 sz=fread(dummy, sizeof(int8_t), blocksize, file); 00602 00603 old_sz=0; 00604 for (size_t i=0; i<sz; i++) 00605 { 00606 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00607 { 00608 int32_t len=i-old_sz; 00609 max_string_len=CMath::max(max_string_len, len+overflow_len); 00610 00611 strings[lines].slen=len+overflow_len; 00612 strings[lines].string=SG_MALLOC(int8_t, len+overflow_len); 00613 00614 for (int32_t j=0; j<overflow_len; j++) 00615 strings[lines].string[j]=overflow[j]; 00616 for (int32_t j=0; j<len; j++) 00617 strings[lines].string[j+overflow_len]=dummy[old_sz+j]; 00618 00619 // clear overflow 00620 overflow_len=0; 00621 00622 //CMath::display_vector(strings[lines].string, len); 00623 old_sz=i+1; 00624 lines++; 00625 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t"); 00626 } 00627 } 00628 00629 for (size_t i=old_sz; i<sz; i++) 00630 overflow[i-old_sz]=dummy[i]; 00631 00632 overflow_len=sz-old_sz; 00633 } 00634 SG_INFO("file successfully read\n"); 00635 SG_INFO("max_string_length=%d\n", max_string_len); 00636 SG_INFO("num_strings=%d\n", num_str); 00637 } 00638 00639 SG_FREE(dummy); 00640 SG_FREE(overflow); 00641 } 00642 00643 void CAsciiFile::get_string_list(SGString<char>*& strings, int32_t& num_str, int32_t& max_string_len) 00644 { 00645 size_t blocksize=1024*1024; 00646 size_t required_blocksize=0; 00647 char* dummy=SG_MALLOC(char, blocksize); 00648 char* overflow=NULL; 00649 int32_t overflow_len=0; 00650 00651 if (file) 00652 { 00653 num_str=0; 00654 max_string_len=0; 00655 00656 SG_INFO("counting line numbers in file %s\n", filename); 00657 size_t sz=blocksize; 00658 size_t block_offs=0; 00659 size_t old_block_offs=0; 00660 fseek(file, 0, SEEK_END); 00661 size_t fsize=ftell(file); 00662 rewind(file); 00663 00664 while (sz == blocksize) 00665 { 00666 sz=fread(dummy, sizeof(char), blocksize, file); 00667 for (size_t i=0; i<sz; i++) 00668 { 00669 block_offs++; 00670 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00671 { 00672 num_str++; 00673 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs); 00674 old_block_offs=block_offs; 00675 } 00676 } 00677 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 00678 } 00679 00680 SG_INFO("found %d strings\n", num_str); 00681 SG_DEBUG("block_size=%d\n", required_blocksize); 00682 SG_FREE(dummy); 00683 blocksize=required_blocksize; 00684 dummy=SG_MALLOC(char, blocksize); 00685 overflow=SG_MALLOC(char, blocksize); 00686 strings=SG_MALLOC(SGString<char>, num_str); 00687 00688 rewind(file); 00689 sz=blocksize; 00690 int32_t lines=0; 00691 size_t old_sz=0; 00692 while (sz == blocksize) 00693 { 00694 sz=fread(dummy, sizeof(char), blocksize, file); 00695 00696 old_sz=0; 00697 for (size_t i=0; i<sz; i++) 00698 { 00699 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00700 { 00701 int32_t len=i-old_sz; 00702 max_string_len=CMath::max(max_string_len, len+overflow_len); 00703 00704 strings[lines].slen=len+overflow_len; 00705 strings[lines].string=SG_MALLOC(char, len+overflow_len); 00706 00707 for (int32_t j=0; j<overflow_len; j++) 00708 strings[lines].string[j]=overflow[j]; 00709 for (int32_t j=0; j<len; j++) 00710 strings[lines].string[j+overflow_len]=dummy[old_sz+j]; 00711 00712 // clear overflow 00713 overflow_len=0; 00714 00715 //CMath::display_vector(strings[lines].string, len); 00716 old_sz=i+1; 00717 lines++; 00718 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t"); 00719 } 00720 } 00721 00722 for (size_t i=old_sz; i<sz; i++) 00723 overflow[i-old_sz]=dummy[i]; 00724 00725 overflow_len=sz-old_sz; 00726 } 00727 SG_INFO("file successfully read\n"); 00728 SG_INFO("max_string_length=%d\n", max_string_len); 00729 SG_INFO("num_strings=%d\n", num_str); 00730 } 00731 00732 SG_FREE(dummy); 00733 SG_FREE(overflow); 00734 } 00735 00736 void CAsciiFile::get_string_list(SGString<int32_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00737 { 00738 strings=NULL; 00739 num_str=0; 00740 max_string_len=0; 00741 } 00742 00743 void CAsciiFile::get_uint_string_list(SGString<uint32_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00744 { 00745 strings=NULL; 00746 num_str=0; 00747 max_string_len=0; 00748 } 00749 00750 void CAsciiFile::get_string_list(SGString<int16_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00751 { 00752 strings=NULL; 00753 num_str=0; 00754 max_string_len=0; 00755 } 00756 00757 void CAsciiFile::get_string_list(SGString<uint16_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00758 { 00759 strings=NULL; 00760 num_str=0; 00761 max_string_len=0; 00762 } 00763 00764 void CAsciiFile::get_long_string_list(SGString<int64_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00765 { 00766 strings=NULL; 00767 num_str=0; 00768 max_string_len=0; 00769 } 00770 00771 void CAsciiFile::get_ulong_string_list(SGString<uint64_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00772 { 00773 strings=NULL; 00774 num_str=0; 00775 max_string_len=0; 00776 } 00777 00778 void CAsciiFile::get_string_list(SGString<float32_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00779 { 00780 strings=NULL; 00781 num_str=0; 00782 max_string_len=0; 00783 } 00784 00785 void CAsciiFile::get_string_list(SGString<float64_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00786 { 00787 strings=NULL; 00788 num_str=0; 00789 max_string_len=0; 00790 } 00791 00792 void CAsciiFile::get_longreal_string_list(SGString<floatmax_t>*& strings, int32_t& num_str, int32_t& max_string_len) 00793 { 00794 strings=NULL; 00795 num_str=0; 00796 max_string_len=0; 00797 } 00798 00799 00802 #define SET_VECTOR(fname, mfname, sg_type) \ 00803 void CAsciiFile::fname(const sg_type* vec, int32_t len) \ 00804 { \ 00805 mfname(vec, len, 1); \ 00806 } 00807 SET_VECTOR(set_vector, set_matrix, uint8_t) 00808 SET_VECTOR(set_vector, set_matrix, char) 00809 SET_VECTOR(set_vector, set_matrix, int32_t) 00810 SET_VECTOR(set_vector, set_matrix, float32_t) 00811 SET_VECTOR(set_vector, set_matrix, float64_t) 00812 SET_VECTOR(set_vector, set_matrix, int16_t) 00813 SET_VECTOR(set_vector, set_matrix, uint16_t) 00814 #undef SET_VECTOR 00815 00816 #define SET_MATRIX(fname, sg_type, fprt_type, type_str) \ 00817 void CAsciiFile::fname(const sg_type* matrix, int32_t num_feat, int32_t num_vec) \ 00818 { \ 00819 if (!(file && matrix)) \ 00820 SG_ERROR("File or matrix invalid.\n"); \ 00821 \ 00822 for (int32_t i=0; i<num_vec; i++) \ 00823 { \ 00824 for (int32_t j=0; j<num_feat; j++) \ 00825 { \ 00826 sg_type v=matrix[num_feat*i+j]; \ 00827 if (j==num_feat-1) \ 00828 fprintf(file, type_str "\n", (fprt_type) v); \ 00829 else \ 00830 fprintf(file, type_str " ", (fprt_type) v); \ 00831 } \ 00832 } \ 00833 } 00834 SET_MATRIX(set_matrix, char, char, "%c") 00835 SET_MATRIX(set_matrix, uint8_t, uint8_t, "%u") 00836 SET_MATRIX(set_int8_matrix, int8_t, int8_t, "%d") 00837 SET_MATRIX(set_matrix, int32_t, int32_t, "%i") 00838 SET_MATRIX(set_uint_matrix, uint32_t, uint32_t, "%u") 00839 SET_MATRIX(set_long_matrix, int64_t, long long int, "%lli") 00840 SET_MATRIX(set_ulong_matrix, uint64_t, long long unsigned int, "%llu") 00841 SET_MATRIX(set_matrix, int16_t, int16_t, "%i") 00842 SET_MATRIX(set_matrix, uint16_t, uint16_t, "%u") 00843 SET_MATRIX(set_matrix, float32_t, float32_t, "%f") 00844 SET_MATRIX(set_matrix, float64_t, float64_t, "%f") 00845 SET_MATRIX(set_longreal_matrix, floatmax_t, floatmax_t, "%Lf") 00846 #undef SET_MATRIX 00847 00848 #define SET_NDARRAY(fname, sg_type, fprt_type, type_str) \ 00849 void CAsciiFile::fname(const sg_type* array, int32_t * dims, int32_t num_dims) \ 00850 { \ 00851 if (!(file && array)) \ 00852 SG_ERROR("File or data invalid.\n"); \ 00853 \ 00854 size_t total = 1; \ 00855 for(int i = 0;i < num_dims;i++) \ 00856 total *= dims[i]; \ 00857 int32_t block_size = dims[num_dims-1]; \ 00858 \ 00859 fprintf(file,"%d ",num_dims); \ 00860 for(int i = 0;i < num_dims;i++) \ 00861 fprintf(file,"%d ",dims[i]); \ 00862 fprintf(file,"\n"); \ 00863 \ 00864 for (size_t i=0; i < total; i++) \ 00865 { \ 00866 sg_type v= array[i]; \ 00867 if ( ((i+1) % block_size) == 0) \ 00868 fprintf(file, type_str "\n", (fprt_type) v); \ 00869 else \ 00870 fprintf(file, type_str " ", (fprt_type) v); \ 00871 } \ 00872 } 00873 00874 SET_NDARRAY(set_ndarray, char, char, "%c") 00875 SET_NDARRAY(set_ndarray, uint8_t, uint8_t, "%u") 00876 SET_NDARRAY(set_int8_ndarray, int8_t, int8_t, "%d") 00877 SET_NDARRAY(set_ndarray, int32_t, int32_t, "%i") 00878 SET_NDARRAY(set_uint_ndarray, uint32_t, uint32_t, "%u") 00879 SET_NDARRAY(set_long_ndarray, int64_t, long long int, "%lli") 00880 SET_NDARRAY(set_ulong_ndarray, uint64_t, long long unsigned int, "%llu") 00881 SET_NDARRAY(set_ndarray, int16_t, int16_t, "%i") 00882 SET_NDARRAY(set_ndarray, uint16_t, uint16_t, "%u") 00883 SET_NDARRAY(set_ndarray, float32_t, float32_t, "%f") 00884 SET_NDARRAY(set_ndarray, float64_t, float64_t, "%f") 00885 SET_NDARRAY(set_longreal_ndarray, floatmax_t, floatmax_t, "%Lf") 00886 #undef SET_NDARRAY 00887 00888 #define SET_SPARSEMATRIX(fname, sg_type, fprt_type, type_str) \ 00889 void CAsciiFile::fname(const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \ 00890 { \ 00891 if (!(file && matrix)) \ 00892 SG_ERROR("File or matrix invalid.\n"); \ 00893 \ 00894 for (int32_t i=0; i<num_vec; i++) \ 00895 { \ 00896 SGSparseVectorEntry<sg_type>* vec = matrix[i].features; \ 00897 int32_t len=matrix[i].num_feat_entries; \ 00898 \ 00899 for (int32_t j=0; j<len; j++) \ 00900 { \ 00901 if (j<len-1) \ 00902 { \ 00903 fprintf(file, "%d:" type_str " ", \ 00904 (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \ 00905 } \ 00906 else \ 00907 { \ 00908 fprintf(file, "%d:" type_str "\n", \ 00909 (int32_t) vec[j].feat_index+1, (fprt_type) vec[j].entry); \ 00910 } \ 00911 } \ 00912 } \ 00913 } 00914 SET_SPARSEMATRIX(set_sparse_matrix, bool, uint8_t, "%u") 00915 SET_SPARSEMATRIX(set_sparse_matrix, char, char, "%c") 00916 SET_SPARSEMATRIX(set_sparse_matrix, uint8_t, uint8_t, "%u") 00917 SET_SPARSEMATRIX(set_int8_sparsematrix, int8_t, int8_t, "%d") 00918 SET_SPARSEMATRIX(set_sparse_matrix, int32_t, int32_t, "%i") 00919 SET_SPARSEMATRIX(set_uint_sparsematrix, uint32_t, uint32_t, "%u") 00920 SET_SPARSEMATRIX(set_long_sparsematrix, int64_t, long long int, "%lli") 00921 SET_SPARSEMATRIX(set_ulong_sparsematrix, uint64_t, long long unsigned int, "%llu") 00922 SET_SPARSEMATRIX(set_sparse_matrix, int16_t, int16_t, "%i") 00923 SET_SPARSEMATRIX(set_sparse_matrix, uint16_t, uint16_t, "%u") 00924 SET_SPARSEMATRIX(set_sparse_matrix, float32_t, float32_t, "%f") 00925 SET_SPARSEMATRIX(set_sparse_matrix, float64_t, float64_t, "%f") 00926 SET_SPARSEMATRIX(set_longreal_sparsematrix, floatmax_t, floatmax_t, "%Lf") 00927 #undef SET_SPARSEMATRIX 00928 00929 void CAsciiFile::set_string_list(const SGString<uint8_t>* strings, int32_t num_str) 00930 { 00931 if (!(file && strings)) 00932 SG_ERROR("File or strings invalid.\n"); 00933 00934 for (int32_t i=0; i<num_str; i++) 00935 { 00936 int32_t len = strings[i].slen; 00937 fwrite(strings[i].string, sizeof(uint8_t), len, file); 00938 fprintf(file, "\n"); 00939 } 00940 } 00941 00942 void CAsciiFile::set_int8_string_list(const SGString<int8_t>* strings, int32_t num_str) 00943 { 00944 if (!(file && strings)) 00945 SG_ERROR("File or strings invalid.\n"); 00946 00947 for (int32_t i=0; i<num_str; i++) 00948 { 00949 int32_t len = strings[i].slen; 00950 fwrite(strings[i].string, sizeof(int8_t), len, file); 00951 fprintf(file, "\n"); 00952 } 00953 } 00954 00955 void CAsciiFile::set_string_list(const SGString<char>* strings, int32_t num_str) 00956 { 00957 if (!(file && strings)) 00958 SG_ERROR("File or strings invalid.\n"); 00959 00960 for (int32_t i=0; i<num_str; i++) 00961 { 00962 int32_t len = strings[i].slen; 00963 fwrite(strings[i].string, sizeof(char), len, file); 00964 fprintf(file, "\n"); 00965 } 00966 } 00967 00968 void CAsciiFile::set_string_list(const SGString<int32_t>* strings, int32_t num_str) 00969 { 00970 } 00971 00972 void CAsciiFile::set_uint_string_list(const SGString<uint32_t>* strings, int32_t num_str) 00973 { 00974 } 00975 00976 void CAsciiFile::set_string_list(const SGString<int16_t>* strings, int32_t num_str) 00977 { 00978 } 00979 00980 void CAsciiFile::set_string_list(const SGString<uint16_t>* strings, int32_t num_str) 00981 { 00982 } 00983 00984 void CAsciiFile::set_long_string_list(const SGString<int64_t>* strings, int32_t num_str) 00985 { 00986 } 00987 00988 void CAsciiFile::set_ulong_string_list(const SGString<uint64_t>* strings, int32_t num_str) 00989 { 00990 } 00991 00992 void CAsciiFile::set_string_list(const SGString<float32_t>* strings, int32_t num_str) 00993 { 00994 } 00995 00996 void CAsciiFile::set_string_list(const SGString<float64_t>* strings, int32_t num_str) 00997 { 00998 } 00999 01000 void CAsciiFile::set_longreal_string_list(const SGString<floatmax_t>* strings, int32_t num_str) 01001 { 01002 } 01003 01004 template <class T> void CAsciiFile::append_item( 01005 DynArray<T>* items, char* ptr_data, char* ptr_item) 01006 { 01007 size_t len=(ptr_data-ptr_item)/sizeof(char); 01008 char* item=SG_MALLOC(char, len+1); 01009 memset(item, 0, sizeof(char)*(len+1)); 01010 item=strncpy(item, ptr_item, len); 01011 01012 SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item); 01013 items->append_element(item); 01014 } 01015 01016 #ifdef __MACH__ 01017 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream) 01018 { 01019 int32_t total_bytes_read=0; 01020 int32_t default_size=10; 01021 01022 if ((lineptr == NULL) || (n == NULL) || (stream == NULL)) 01023 return -1; 01024 01025 if ((*lineptr == NULL) && (*n == 0)) 01026 { 01027 *lineptr=SG_MALLOC(char, default_size); 01028 *n=default_size; 01029 } 01030 01031 int32_t bytes_read, pos=-1; 01032 int32_t threshold_size=100000; 01033 01034 while (1) 01035 { 01036 // We need some limit in case file does not contain '\n' 01037 if (*n > threshold_size) 01038 return -1; 01039 01040 // Read from file and append to buffer 01041 bytes_read=fread(*lineptr+total_bytes_read, sizeof(char), *n-total_bytes_read, stream); 01042 01043 for (int i=0; i<bytes_read; i++) 01044 { 01045 if ((*lineptr)[total_bytes_read+i] == delimiter) 01046 { 01047 pos=i; 01048 break; 01049 } 01050 } 01051 01052 if (pos==-1) 01053 { 01054 if (feof(stream)) 01055 return -1; 01056 total_bytes_read+=bytes_read; 01057 *lineptr=SG_REALLOC(char, *lineptr, (*n)*2); 01058 *n=(*n)*2; 01059 // A better reallocated size should be used 01060 } 01061 else 01062 { 01063 total_bytes_read+=pos+1; 01064 (*lineptr)[total_bytes_read]='\0'; 01065 // Seek back to position after \n 01066 fseek(stream, (bytes_read-pos-1) * -1, SEEK_CUR); 01067 return total_bytes_read; 01068 } 01069 } 01070 } 01071 01072 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream) 01073 { 01074 return getdelim(lineptr, n, '\n', stream); 01075 } 01076 01077 #else 01078 ssize_t CAsciiFile::getdelim(char **lineptr, size_t *n, char delimiter, FILE *stream) 01079 { 01080 return ::getdelim(lineptr, n, delimiter, stream); 01081 } 01082 01083 ssize_t CAsciiFile::getline(char **lineptr, size_t *n, FILE *stream) 01084 { 01085 return ::getline(lineptr, n, stream); 01086 } 01087 #endif 01088 01089 void CAsciiFile::tokenize(char delim, substring s, v_array<substring>& ret) 01090 { 01091 ret.erase(); 01092 char *last = s.start; 01093 for (; s.start != s.end; s.start++) 01094 { 01095 if (*s.start == delim) 01096 { 01097 if (s.start != last) 01098 { 01099 substring temp = {last,s.start}; 01100 ret.push(temp); 01101 } 01102 last = s.start+1; 01103 } 01104 } 01105 if (s.start != last) 01106 { 01107 substring final = {last, s.start}; 01108 ret.push(final); 01109 } 01110 }