presage  0.8.7
tokenizer.h
Go to the documentation of this file.
00001 
00002 /******************************************************
00003  *  Presage, an extensible predictive text entry system
00004  *  ---------------------------------------------------
00005  *
00006  *  Copyright (C) 2008  Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
00007 
00008     This program is free software; you can redistribute it and/or modify
00009     it under the terms of the GNU General Public License as published by
00010     the Free Software Foundation; either version 2 of the License, or
00011     (at your option) any later version.
00012 
00013     This program is distributed in the hope that it will be useful,
00014     but WITHOUT ANY WARRANTY; without even the implied warranty of
00015     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016     GNU General Public License for more details.
00017 
00018     You should have received a copy of the GNU General Public License along
00019     with this program; if not, write to the Free Software Foundation, Inc.,
00020     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00021                                                                              *
00022                                                                 **********(*)*/
00023 
00024 
00025 #ifndef PRESAGE_TOKENIZER
00026 #define PRESAGE_TOKENIZER
00027 
00028 #ifdef HAVE_CONFIG_H
00029 #include "config.h"
00030 #endif
00031 
00032 #include <iostream>
00033 #include <istream>
00034 #include <string>
00035 #include <assert.h>
00036 
/**
 * Abstract base class for tokenizers that read from a std::istream.
 *
 * The class keeps a reference to the stream (member `stream`), not a copy,
 * so the stream object must outlive the Tokenizer. It also stores the sets
 * of blankspace and separator characters and a lowercase-mode flag; the
 * accessors for these are declared here and defined outside this header.
 *
 * Concrete tokenizers must implement countTokens(), hasMoreTokens(),
 * nextToken() and progress().
 */
class Tokenizer {
public:
    /**
     * @param stream      input stream to tokenize (held by reference)
     * @param blankspaces characters to treat as blankspace
     * @param separators  characters to treat as separators
     *
     * Defined outside this header.
     */
    Tokenizer(std::istream& stream,
              const std::string   blankspaces,
              const std::string   separators  );
    virtual ~Tokenizer();

    /** Token count; semantics are defined by the implementing subclass. */
    virtual int countTokens() = 0;

    /** Whether another token can be obtained via nextToken() (subclass-defined). */
    virtual bool hasMoreTokens() const = 0;

    /** Returns the next token (subclass-defined). */
    virtual std::string nextToken() = 0;

    /**
     * Progress indicator (subclass-defined).
     * NOTE(review): range/units are not visible in this header -- confirm
     * against the implementations.
     */
    virtual double progress() const = 0;


    /** Sets the blankspace character set. */
    void blankspaceChars(const std::string);
    /** Returns the blankspace character set. */
    std::string blankspaceChars() const;

    /** Sets the separator character set. */
    void separatorChars(const std::string);
    /** Returns the separator character set. */
    std::string separatorChars() const;

    /** Enables or disables lowercase mode. */
    void lowercaseMode(const bool);
    /** Returns the current lowercase mode. */
    bool lowercaseMode() const;

    /**
     * Returns the characters of the stream in the offset range
     * [offbeg, offend) as a string.
     *
     * The get position held by the stream on entry is restored before
     * returning. Error bits on the stream are cleared while reading, so
     * the stream's state flags are NOT preserved by this call.
     *
     * If the stream holds fewer characters than the range implies, the
     * result stops at the end of the data instead of being padded with
     * the EOF sentinel.
     *
     * @return the text between offbeg and offend
     */
    std::string streamToString() const {
        const std::streamoff entryPosition = stream.tellg();
        std::string text;
        for (std::streamoff pos = offbeg; pos < offend; ++pos) {
            // seekg() is ignored while failbit/badbit is set, so drop any
            // error bits before every reposition.
            stream.clear();
            stream.seekg(pos);
            const std::istream::int_type ch = stream.peek();
            if (ch == std::istream::traits_type::eof()) {
                // Ran out of data before offend: stop here and make sure
                // the restoring seek below is not ignored.
                stream.clear();
                break;
            }
            text.push_back(std::istream::traits_type::to_char_type(ch));
        }
        stream.seekg(entryPosition);
        return text;
    }

protected:
    /**
     * Scope guard that repositions a stream for the duration of a scope.
     *
     * On construction it records the stream's state flags and get
     * position, then seeks to the given offset. On destruction it puts
     * the get position and the state flags back to the recorded values,
     * even if the guarded code drove the stream into EOF or failure.
     */
    class StreamGuard {
    public:
        /**
         * @param so stream to guard (held by reference)
         * @param of offset to seek to while the guard is alive
         */
        StreamGuard(std::istream& so, std::streamoff& of)
            : guardedStream(so) {
            currstate = guardedStream.rdstate();
            curroff   = guardedStream.tellg();
            guardedStream.seekg   (of       );
        }
        ~StreamGuard() {
            // Error bits picked up inside the guarded scope would turn
            // the seek below into a no-op; clear them first.
            guardedStream.clear();
            // tellg() yields -1 when the position could not be recorded;
            // seeking there would only raise failbit again.
            if (curroff != std::streamoff(-1)) {
                guardedStream.seekg(curroff);
            }
            // clear(state) assigns the flags; setstate() could only add
            // bits and therefore never removed eofbit/failbit.
            guardedStream.clear(currstate);
        }

    private:
        std::istream&     guardedStream;  ///< stream being guarded
        std::ios::iostate currstate;      ///< state flags at construction
        std::streamoff    curroff;        ///< get position at construction
    };

    std::istream&     stream;  ///< stream being tokenized (not owned)
    std::ios::iostate sstate;  ///< stream state storage; not used in this header
    std::streamoff    offbeg;  ///< first offset read by streamToString()
    std::streamoff    offend;  ///< one-past-last offset read by streamToString()
    std::streamoff    offset;  ///< presumably the current read offset -- maintained outside this header

    /** Tests a character against the blankspace set (defined outside this header). */
    bool isBlankspace(const int character) const;
    /** Tests a character against the separator set (defined outside this header). */
    bool isSeparator (const int character) const;

private:
    std::string blankspaces;  ///< blankspace character set
    std::string separators;   ///< separator character set

    bool lowercase;           ///< lowercase-mode flag
};
00159 
00160 #endif // PRESAGE_TOKENIZER