presage  0.8.7
reverseTokenizer.cpp
Go to the documentation of this file.
00001 
00002 /******************************************************
00003  *  Presage, an extensible predictive text entry system
00004  *  ---------------------------------------------------
00005  *
00006  *  Copyright (C) 2008  Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
00007 
00008     This program is free software; you can redistribute it and/or modify
00009     it under the terms of the GNU General Public License as published by
00010     the Free Software Foundation; either version 2 of the License, or
00011     (at your option) any later version.
00012 
00013     This program is distributed in the hope that it will be useful,
00014     but WITHOUT ANY WARRANTY; without even the implied warranty of
00015     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016     GNU General Public License for more details.
00017 
00018     You should have received a copy of the GNU General Public License along
00019     with this program; if not, write to the Free Software Foundation, Inc.,
00020     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00021                                                                              *
00022                                                                 **********(*)*/
00023 
00024 
00025 #include "reverseTokenizer.h"
00026 
00027 ReverseTokenizer::ReverseTokenizer(std::istream& stream,
00028                                    const std::string   blanks,
00029                                    const std::string   separs)
00030     : Tokenizer(stream, blanks, separs)
00031 {
00032     offset = offend;
00033     //assert( stream.good());
00034     //assert(!stream.fail());
00035     //assert(!stream.bad() );
00036     //assert(!stream.eof() );
00037     // stream clearing needed because offset is positioned at end
00038     stream.clear();
00039 
00040     //std::cerr << "ReverseTokenizer::ReverseTokenizer() offbeg: " << offbeg 
00041     //          << " offset: " << offset << " offend: " << offend << std::endl;
00042 }
00043 
00044 ReverseTokenizer::~ReverseTokenizer()
00045 {}
00046 
00047 int ReverseTokenizer::countTokens()
00048 {
00049     StreamGuard guard(stream, offset);
00050 
00051     // store current seek pointer position
00052     std::streamoff curroff = offset;
00053 
00054     // position get pointer at end of stream
00055     offset = offend;
00056 
00057     int count = 0;
00058     while (hasMoreTokens()) {
00059         nextToken();
00060         count++;
00061     }
00062 
00063     // reposition seek get pointer to original position
00064     offset = curroff;
00065 
00066     return count;
00067 }
00068 
00069 bool ReverseTokenizer::hasMoreTokens() const
00070 {
00071     //std::cerr << "ReverseTokenizer::hasMoreTokens() offbeg: " << offbeg 
00072     //          << " offset: " << offset << " offend: " << offend << std::endl;
00073     if (offbeg < offset) {
00074         return true;
00075     } else {
00076         return false;
00077     }
00078 }
00079     
00080 std::string ReverseTokenizer::nextToken()
00081 {
00082     StreamGuard guard(stream, offset);
00083 
00084     int current;
00085     std::string str;
00086 
00087     if (stream.good()) {
00088         while (offbeg < offset
00089                && str.empty()) {
00090             stream.seekg(offset - 1);
00091             current = stream.peek();
00092 
00093             if (offset == offend &&
00094                 (isSeparator(current) || isBlankspace(current))) {
00095                 offset--;
00096                 return str;
00097             }
00098 
00099             while ((isBlankspace(current) || isSeparator(current))
00100                    && offbeg < offset ) {
00101                 offset--;
00102                 stream.seekg(offset - 1);
00103                 current = stream.peek();
00104             }
00105 
00106             while (!isBlankspace(current)
00107                    && !isSeparator(current)
00108                    && offbeg < offset) {
00109                 
00110                 if( lowercaseMode() ) {
00111                     current = tolower( current );
00112                 }
00113 
00114                 // since the token is read backwards, the string
00115                 // needs to be reversed by inserting the char at
00116                 // the front
00117                 str.insert(str.begin(), current);
00118 
00119                 offset--;
00120                 stream.seekg(offset - 1);
00121                 current = stream.peek();
00122             }
00123         }
00124     }
00125 
00126 //    if (stream.good()) {
00127 //      do {
00128 //          do {
00129 //              current = stream.peek();
00130 //              offset--;
00131 //              stream.seekg(offset);
00132 //
00133 //              // handle case where last character is a separator by
00134 //              // returning an empty token
00135 //                if (offset == offend - 2
00136 //                    && isSeparator(current)) {
00137 //                    return "";
00138 //                }
00139 //                
00140 //              //std::cerr << "[DEBUG] read: "
00141 //                //  << static_cast<char>(current)
00142 //                //  << std::endl;
00143 //
00144 //              if (!isBlankspace(current)
00145 //                    && !isSeparator(current)
00146 //                    && offset >= offbeg - 1) {
00147 //                    
00148 //                  if( lowercaseMode() ) {
00149 //                      current = tolower( current );
00150 //                  }
00151 //
00152 //                    // since the token is read backwards, the string
00153 //                    // needs to be reversed by inserting the char at
00154 //                    // the front
00155 //                  str.insert(str.begin(), current);
00156 //
00157 //                  //std::cerr << "[DEBUG] pushed: "
00158 //                    //  << static_cast<char>(current)
00159 //                    //  << std::endl;
00160 //                    //std::cerr << "[DEBUG] partial string: "
00161 //                    //          << str << std::endl;
00162 //              }
00163 //          } while (!isBlankspace(current)
00164 //                     && !isSeparator(current)
00165 //                     && (offset >= offbeg));
00166 //      } while (str.empty() && (offset >= offbeg));
00167 //    }
00168 
00169     //std::cerr << "[DEBUG] token: " << str << std::endl;
00170 
00171     return str;
00172 }
00173 
00174 double ReverseTokenizer::progress() const
00175 {
00176     return static_cast<double>(offend - offset) / (offend - offbeg);
00177 }