presage
0.8.7
|
00001 00002 /****************************************************** 00003 * Presage, an extensible predictive text entry system 00004 * --------------------------------------------------- 00005 * 00006 * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk> 00007 00008 This program is free software; you can redistribute it and/or modify 00009 it under the terms of the GNU General Public License as published by 00010 the Free Software Foundation; either version 2 of the License, or 00011 (at your option) any later version. 00012 00013 This program is distributed in the hope that it will be useful, 00014 but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00016 GNU General Public License for more details. 00017 00018 You should have received a copy of the GNU General Public License along 00019 with this program; if not, write to the Free Software Foundation, Inc., 00020 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 00021 * 00022 **********(*)*/ 00023 00024 00025 #include "config.h" 00026 00027 #include <iostream> 00028 #include <fstream> 00029 #include <vector> 00030 #include <list> 00031 #include <string> 00032 #include <map> 00033 00034 #ifdef HAVE_UNISTD_H 00035 # include <unistd.h> 00036 #endif 00037 00038 #ifdef HAVE_STDLIB_H 00039 # include <stdlib.h> 00040 #endif 00041 00042 #include <getopt.h> 00043 #include <assert.h> 00044 00045 #include "core/tokenizer/forwardTokenizer.h" 00046 #include "core/iso8859_1.h" 00047 #include "core/progress.h" 00048 00049 #include "../lib/predictors/dbconnector/sqliteDatabaseConnector.h" 00050 00051 const std::string PROGRAM_NAME = "text2ngram"; 00052 00053 typedef std::list<std::string> NgramList; 00054 00055 void usage(); 00056 void version(); 00057 00058 int main(int argc, char* argv[]) 00059 { 00060 int next_option; 00061 00062 // Setup some defaults 00063 // - default to generating 1-gram counts 00064 int ngrams = 1; 00065 00066 // - default output to stdout (empty string signifies stdout) 00067 std::string output; 00068 00069 const std::string TABBED_SEPARATED_VALUES = "tsv"; 00070 const std::string SQLITE = "sqlite"; 00071 // - default format is tabbed separated values 00072 std::string format = TABBED_SEPARATED_VALUES; 00073 00074 // - default to case sensitive 00075 bool lowercase = false; 00076 00077 // - default to no append 00078 bool append = false; 00079 00080 00081 // getopt structures 00082 const char * const short_options = "n:o:f:alhv"; 00083 const struct option long_options[] = 00084 { 00085 { "ngrams", required_argument, 0, 'n' }, 00086 { "output", required_argument, 0, 'o' }, 00087 { "format", required_argument, 0, 'f' }, 00088 { "append", no_argument, 0, 'a' }, 00089 { "lowercase", no_argument, 0, 'l' }, 00090 { "help", no_argument, 0, 'h' }, 00091 { "version", no_argument, 0, 'v' }, 00092 { 0, 0, 0, 0 } 00093 }; 00094 00095 do { 00096 next_option = getopt_long(argc, 00097 argv, 00098 short_options, 00099 long_options, 00100 NULL); 00101 00102 switch (next_option) { 00103 case 'n': // --ngrams or -n option 00104 if (atoi(optarg) > 0) { 00105 ngrams = atoi(optarg); 00106 } else { 00107 usage(); 00108 } 00109 break; 00110 case 'o': // --output or -o option 00111 output = optarg; 00112 break; 00113 case 'f': // --format or -f option 00114 if (optarg == SQLITE 00115 || optarg == TABBED_SEPARATED_VALUES) { 00116 format = optarg; 00117 } else { 00118 std::cerr << "Unknown format " << optarg << std::endl << std::endl; 00119 usage(); 00120 return -1; 00121 } 00122 break; 00123 case 'a': // --append or -a option 00124 // append mode 00125 append = true; 00126 break; 00127 case 'l': // --lowercase or -l option 00128 lowercase = true; 00129 break; 00130 case 'h': // --help or -h option 00131 usage(); 00132 exit (0); 00133 break; 00134 case 'v': // --version or -v option 00135 version(); 00136 exit (0); 00137 break; 00138 case '?': // unknown option 00139 usage(); 00140 exit (0); 00141 break; 00142 case -1: 00143 break; 00144 default: 00145 std::cerr << "Error: unhandled option." << std::endl; 00146 exit(0); 00147 } 00148 00149 } while (next_option != -1); 00150 00151 00152 if ((argc - optind < 1)) { 00153 usage(); 00154 return -1; 00155 } 00156 00157 00158 // ngramMap stores <token,count> pairs 00159 std::map<NgramList, int> ngramMap; 00160 00161 for (int i = optind; i < argc; i++) { 00162 // do the actual processing file by file 00163 std::string token; 00164 NgramList ngram; 00165 00166 // points to output file 00167 // print out file information 00168 std::cout << "Parsing " << argv[i] << "..." 00169 << std::endl; 00170 00171 ProgressBar<char> progressBar; 00172 00173 // create tokenizer object and open input file stream 00174 std::ifstream infile(argv[i]); 00175 ForwardTokenizer tokenizer(infile, 00176 " \f\n\r\t\v", 00177 "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<"); 00178 tokenizer.lowercaseMode(lowercase); 00179 00180 // take care of first N-1 tokens 00181 for (int i = 0; (i < ngrams - 1 && tokenizer.hasMoreTokens()); i++) { 00182 ngram.push_back(tokenizer.nextToken()); 00183 } 00184 00185 while (tokenizer.hasMoreTokens()) { 00186 // extract token from input stream 00187 token = tokenizer.nextToken(); 00188 00189 // update ngram with new token 00190 ngram.push_back(token); 00191 00192 // update map with new token occurrence 00193 ngramMap[ngram] = ngramMap[ngram] + 1; 00194 00195 // update progress bar 00196 //progressBar(tokenizer.progress()); 00197 progressBar.update(tokenizer.progress()); 00198 00199 // remove front token from ngram 00200 ngram.pop_front(); 00201 } 00202 00203 infile.close(); 00204 } 00205 00206 00207 std::cout << "Writing out to " << format << " format file " 00208 << output << "..." << std::endl; 00209 if (format == TABBED_SEPARATED_VALUES) { 00210 // output to tabbed separated values text file 00211 // 00212 00213 std::ofstream *outstream = 0; 00214 std::ostream *prev_outstream = 0; 00215 00216 if (output.c_str()) { 00217 // tie outstream to file 00218 outstream = new std::ofstream (output.c_str(), std::ios::out); 00219 assert(outstream); 00220 prev_outstream = std::cout.tie (outstream); 00221 } 00222 00223 // write results to output stream 00224 ProgressBar<char> progressBar; 00225 long total = ngramMap.size(); 00226 long count = 0; 00227 std::map<NgramList, int>::const_iterator it; 00228 for (it = ngramMap.begin(); it != ngramMap.end(); it++) { 00229 for (NgramList::const_iterator ngram_it = it->first.begin(); 00230 ngram_it != it->first.end(); 00231 ngram_it++) { 00232 std::cout << *ngram_it << '\t'; 00233 } 00234 std::cout << it->second << std::endl; 00235 progressBar.update(static_cast<double>(count++)/total); 00236 } 00237 00238 if (output.c_str()) { 00239 std::cout.tie (prev_outstream); 00240 outstream->close (); 00241 delete outstream; 00242 } 00243 00244 } else if (format == SQLITE) { 00245 // output to SQLITE 00246 // 00247 00248 SqliteDatabaseConnector sqliteDbCntr(output); 00249 sqliteDbCntr.beginTransaction(); 00250 sqliteDbCntr.createNgramTable(ngrams); 00251 00252 // write results to output stream 00253 ProgressBar<char> progressBar; 00254 long total = ngramMap.size(); 00255 long count = 0; 00256 std::map<NgramList, int>::const_iterator it; 00257 for (it = ngramMap.begin(); it != ngramMap.end(); it++) { 00258 00259 // convert from NgramList to Ngram 00260 Ngram ngram; 00261 for (NgramList::const_iterator jt = it->first.begin(); 00262 jt != it->first.end(); 00263 jt++) { 00264 ngram.push_back(*jt); 00265 } 00266 00267 if (append) { 00268 // need to check whether ngram is already in database. 00269 // when appending to existing database 00270 int count = sqliteDbCntr.getNgramCount(ngram); 00271 if (count > 0) { 00272 // ngram already in database, update count 00273 sqliteDbCntr.updateNgram(ngram, count + it->second); 00274 } else { 00275 // ngram not in database, insert it 00276 sqliteDbCntr.insertNgram(ngram, it->second); 00277 } 00278 } else { 00279 // insert ngram 00280 sqliteDbCntr.insertNgram(ngram, it->second); 00281 } 00282 00283 progressBar.update(static_cast<double>(count++)/total); 00284 } 00285 sqliteDbCntr.endTransaction(); 00286 } else { 00287 abort(); 00288 } 00289 00290 00291 std::cout << std::endl; 00292 00293 return 0; 00294 } 00295 00296 00297 void version() 00298 { 00299 std::cout 00300 << PROGRAM_NAME << " (" << PACKAGE << ") version " << VERSION << std::endl 00301 << "Copyright (C) Matteo Vescovi" << std::endl 00302 << "This is free software; see the source for copying conditions. There is NO" << std::endl 00303 << "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." << std::endl 00304 << std::endl; 00305 } 00306 00307 00308 void usage() 00309 { 00310 std::cout 00311 << "Usage: " << PROGRAM_NAME << " [OPTION]... infiles..." << std::endl 00312 << std::endl 00313 << " --output, -o O " << "Output file name O" << std::endl 00314 << " --ngrams, -n N " << "Specify ngram cardinality N" << std::endl 00315 << " --format, -f F " << "Output file format F: sqlite, tsv (tabbed separated values)" << std::endl 00316 << " --lowercase, -l " << "Enable lowercase conversion mode" << std::endl 00317 << " --append, -a " << "Open output file in append mode" << std::endl 00318 << " --help, -h " << "Display this information" << std::endl 00319 << " --version, -v " << "Show version information" << std::endl 00320 << std::endl 00321 << PROGRAM_NAME << " is free software distributed under the GPL." << std::endl 00322 << "Send bug reports to " << PACKAGE_BUGREPORT << std::endl 00323 << "Copyright (C) Matteo Vescovi" << std::endl; 00324 }