presage  0.8.7
text2ngram.cpp
Go to the documentation of this file.
00001 
00002 /******************************************************
00003  *  Presage, an extensible predictive text entry system
00004  *  ---------------------------------------------------
00005  *
00006  *  Copyright (C) 2008  Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
00007 
00008     This program is free software; you can redistribute it and/or modify
00009     it under the terms of the GNU General Public License as published by
00010     the Free Software Foundation; either version 2 of the License, or
00011     (at your option) any later version.
00012 
00013     This program is distributed in the hope that it will be useful,
00014     but WITHOUT ANY WARRANTY; without even the implied warranty of
00015     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016     GNU General Public License for more details.
00017 
00018     You should have received a copy of the GNU General Public License along
00019     with this program; if not, write to the Free Software Foundation, Inc.,
00020     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00021                                                                              *
00022                                                                 **********(*)*/
00023 
00024 
00025 #include "config.h"
00026 
00027 #include <iostream>
00028 #include <fstream>
00029 #include <vector>
00030 #include <list>
00031 #include <string>
00032 #include <map>
00033 
00034 #ifdef HAVE_UNISTD_H
00035 # include <unistd.h>
00036 #endif
00037 
00038 #ifdef HAVE_STDLIB_H
00039 # include <stdlib.h>
00040 #endif
00041 
00042 #include <getopt.h>
00043 #include <assert.h>
00044 
00045 #include "core/tokenizer/forwardTokenizer.h"
00046 #include "core/iso8859_1.h"
00047 #include "core/progress.h"
00048 
00049 #include "../lib/predictors/dbconnector/sqliteDatabaseConnector.h"
00050 
// Name this tool reports in its usage and version output.
const std::string PROGRAM_NAME("text2ngram");

// An n-gram, held as an ordered sequence of tokens.
typedef std::list<std::string> NgramList;

// Command line banners; both are defined after main().
void version();
void usage();
00057 
00058 int main(int argc, char* argv[])
00059 {
00060     int next_option;
00061 
00062     // Setup some defaults
00063     //  - default to generating 1-gram counts
00064     int ngrams = 1;
00065 
00066     //  - default output to stdout (empty string signifies stdout)
00067     std::string output;
00068 
00069     const std::string TABBED_SEPARATED_VALUES = "tsv";
00070     const std::string SQLITE = "sqlite";
00071     //  - default format is tabbed separated values 
00072     std::string format = TABBED_SEPARATED_VALUES;
00073 
00074     //  - default to case sensitive
00075     bool lowercase = false;
00076 
00077     //  - default to no append
00078     bool append    = false;
00079 
00080         
00081     // getopt structures
00082     const char * const  short_options  = "n:o:f:alhv";
00083     const struct option long_options[] =
00084         {
00085             { "ngrams",    required_argument, 0, 'n' },
00086             { "output",    required_argument, 0, 'o' },
00087             { "format",    required_argument, 0, 'f' },
00088             { "append",    no_argument,       0, 'a' },
00089             { "lowercase", no_argument,       0, 'l' },
00090             { "help",      no_argument,       0, 'h' },
00091             { "version",   no_argument,       0, 'v' },
00092             { 0,           0,                 0, 0   }
00093         };
00094 
00095     do {
00096         next_option = getopt_long(argc,
00097                                   argv, 
00098                                   short_options,
00099                                   long_options,
00100                                   NULL);
00101                 
00102         switch (next_option) {
00103         case 'n': // --ngrams or -n option
00104             if (atoi(optarg) > 0) {
00105                 ngrams = atoi(optarg);
00106             } else {
00107                 usage();
00108             }
00109             break;
00110         case 'o': // --output or -o option
00111             output = optarg;
00112             break;
00113         case 'f': // --format or -f option
00114             if (optarg == SQLITE
00115                 || optarg == TABBED_SEPARATED_VALUES) {
00116                 format = optarg;
00117             } else {
00118                 std::cerr << "Unknown format " << optarg << std::endl << std::endl;
00119                 usage();
00120                 return -1;
00121             }
00122             break;
00123         case 'a': // --append or -a option
00124             // append mode
00125             append = true;
00126             break;
00127         case 'l': // --lowercase or -l option
00128             lowercase = true;
00129             break;
00130         case 'h': // --help or -h option
00131             usage();
00132             exit (0);
00133             break;
00134         case 'v': // --version or -v option
00135             version();
00136             exit (0);
00137             break;
00138         case '?': // unknown option
00139             usage();
00140             exit (0);
00141             break;
00142         case -1:
00143             break;
00144         default:
00145             std::cerr << "Error: unhandled option." << std::endl;
00146             exit(0);
00147         }
00148 
00149     } while (next_option != -1);
00150 
00151 
00152     if ((argc - optind < 1)) {
00153         usage();
00154         return -1;
00155     }
00156         
00157 
00158     // ngramMap stores <token,count> pairs
00159     std::map<NgramList, int> ngramMap;
00160 
00161     for (int i = optind; i < argc; i++) {
00162         // do the actual processing file by file
00163         std::string token;
00164         NgramList ngram;
00165 
00166         // points to output file
00167         // print out file information
00168         std::cout << "Parsing " << argv[i] << "..."
00169                   << std::endl;
00170 
00171         ProgressBar<char> progressBar;
00172 
00173         // create tokenizer object and open input file stream
00174         std::ifstream infile(argv[i]);
00175         ForwardTokenizer tokenizer(infile,
00176                                    " \f\n\r\t\v",
00177                                    "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<");
00178         tokenizer.lowercaseMode(lowercase);
00179 
00180         // take care of first N-1 tokens
00181         for (int i = 0; (i < ngrams - 1 && tokenizer.hasMoreTokens()); i++) {
00182             ngram.push_back(tokenizer.nextToken());
00183         }
00184 
00185         while (tokenizer.hasMoreTokens()) {
00186             // extract token from input stream
00187             token = tokenizer.nextToken();
00188 
00189             // update ngram with new token
00190             ngram.push_back(token);
00191                         
00192             // update map with new token occurrence
00193             ngramMap[ngram] = ngramMap[ngram] + 1;
00194                     
00195             // update progress bar
00196             //progressBar(tokenizer.progress());
00197             progressBar.update(tokenizer.progress());
00198 
00199             // remove front token from ngram
00200             ngram.pop_front();
00201         }
00202         
00203         infile.close();
00204     }
00205 
00206 
00207     std::cout << "Writing out to " << format << " format file "
00208               << output << "..." << std::endl;
00209     if (format == TABBED_SEPARATED_VALUES) {
00210         // output to tabbed separated values text file
00211         //
00212 
00213         std::ofstream *outstream = 0;
00214         std::ostream  *prev_outstream = 0;
00215 
00216         if (output.c_str()) {
00217             // tie outstream to file
00218             outstream = new std::ofstream (output.c_str(), std::ios::out);
00219             assert(outstream);
00220             prev_outstream = std::cout.tie (outstream);
00221         }
00222 
00223         // write results to output stream
00224         ProgressBar<char> progressBar;
00225         long total = ngramMap.size();
00226         long count = 0;
00227         std::map<NgramList, int>::const_iterator it;
00228         for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
00229             for (NgramList::const_iterator ngram_it = it->first.begin();
00230                  ngram_it != it->first.end();
00231                  ngram_it++) {
00232                 std::cout << *ngram_it << '\t';
00233             }
00234             std::cout << it->second << std::endl;
00235             progressBar.update(static_cast<double>(count++)/total);
00236         }
00237 
00238         if (output.c_str()) {
00239             std::cout.tie (prev_outstream);
00240             outstream->close ();
00241             delete outstream;
00242         }
00243 
00244     } else if (format == SQLITE) {
00245         // output to SQLITE
00246         // 
00247 
00248         SqliteDatabaseConnector sqliteDbCntr(output);
00249         sqliteDbCntr.beginTransaction();
00250         sqliteDbCntr.createNgramTable(ngrams);
00251 
00252         // write results to output stream
00253         ProgressBar<char> progressBar;
00254         long total = ngramMap.size();
00255         long count = 0;
00256         std::map<NgramList, int>::const_iterator it;
00257         for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
00258 
00259             // convert from NgramList to Ngram
00260             Ngram ngram;
00261             for (NgramList::const_iterator jt = it->first.begin();
00262                  jt != it->first.end();
00263                  jt++) {
00264                 ngram.push_back(*jt);
00265             }
00266 
00267             if (append) {
00268                 // need to check whether ngram is already in database.
00269                 // when appending to existing database
00270                 int count = sqliteDbCntr.getNgramCount(ngram);
00271                 if (count > 0) {
00272                     // ngram already in database, update count
00273                     sqliteDbCntr.updateNgram(ngram, count + it->second);
00274                 } else {
00275                     // ngram not in database, insert it
00276                     sqliteDbCntr.insertNgram(ngram, it->second);
00277                 }
00278             } else {
00279                 // insert ngram
00280                 sqliteDbCntr.insertNgram(ngram, it->second);
00281             }
00282 
00283             progressBar.update(static_cast<double>(count++)/total);
00284         }
00285         sqliteDbCntr.endTransaction();
00286     } else {
00287         abort();
00288     }
00289 
00290 
00291     std::cout << std::endl;
00292 
00293     return 0;
00294 }
00295 
00296 
00297 void version()
00298 {
00299     std::cout
00300         << PROGRAM_NAME << " (" << PACKAGE << ") version " << VERSION << std::endl
00301         << "Copyright (C) Matteo Vescovi" << std::endl
00302         << "This is free software; see the source for copying conditions.  There is NO" << std::endl
00303         << "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." << std::endl
00304         << std::endl;
00305 }
00306 
00307 
00308 void usage()
00309 {
00310     std::cout 
00311         << "Usage: " << PROGRAM_NAME << " [OPTION]... infiles..." << std::endl
00312         << std::endl
00313         << "  --output, -o O  " << "Output file name O" << std::endl
00314         << "  --ngrams, -n N  " << "Specify ngram cardinality N" << std::endl
00315         << "  --format, -f F  " << "Output file format F: sqlite, tsv (tabbed separated values)" << std::endl
00316         << "  --lowercase, -l " << "Enable lowercase conversion mode" << std::endl
00317         << "  --append, -a    " << "Open output file in append mode" << std::endl
00318         << "  --help, -h      " << "Display this information" << std::endl
00319         << "  --version, -v   " << "Show version information" << std::endl
00320         << std::endl
00321         << PROGRAM_NAME << " is free software distributed under the GPL." << std::endl
00322         << "Send bug reports to " << PACKAGE_BUGREPORT << std::endl
00323         << "Copyright (C) Matteo Vescovi" << std::endl;
00324 }