presage
0.8.7
|
00001 00002 /****************************************************** 00003 * Presage, an extensible predictive text entry system 00004 * --------------------------------------------------- 00005 * 00006 * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk> 00007 00008 This program is free software; you can redistribute it and/or modify 00009 it under the terms of the GNU General Public License as published by 00010 the Free Software Foundation; either version 2 of the License, or 00011 (at your option) any later version. 00012 00013 This program is distributed in the hope that it will be useful, 00014 but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00016 GNU General Public License for more details. 00017 00018 You should have received a copy of the GNU General Public License along 00019 with this program; if not, write to the Free Software Foundation, Inc., 00020 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 00021 * 00022 **********(*)*/ 00023 00024 00025 #include "forwardTokenizer.h" 00026 00027 ForwardTokenizer::ForwardTokenizer(std::istream& stream, 00028 const std::string blankspaces, 00029 const std::string separators) 00030 : Tokenizer(stream, blankspaces, separators) 00031 { 00032 //std::cerr << "ForwardTokenizer::ForwardTokenizer()" << std::endl; 00033 offset = offbeg; 00034 } 00035 00036 ForwardTokenizer::~ForwardTokenizer() 00037 {} 00038 00039 int ForwardTokenizer::countTokens() 00040 { 00041 StreamGuard guard(stream, offset); 00042 00043 // store current seek pointer position 00044 std::streamoff curroff = offset; 00045 00046 // position get pointer at beginning of stream 00047 offset = offbeg; 00048 00049 int count = 0; 00050 while (hasMoreTokens()) { 00051 count++; 00052 nextToken(); 00053 } 00054 00055 // reposition seek get pointer to original position 00056 offset = curroff; 00057 00058 return count; 00059 } 00060 00061 bool ForwardTokenizer::hasMoreTokens() const 00062 { 00063 //StreamGuard guard(stream, offset); 00064 00065 if (offset >= offend) { 00066 return false; 00067 } else { 00068 return true; 00069 } 00070 } 00071 00072 std::string ForwardTokenizer::nextToken() 00073 { 00074 StreamGuard guard(stream, offset); 00075 00076 int current; 00077 std::string str; 00078 00079 if (stream.good()) { // good() if bad,fail and eof bit are not set 00080 current = stream.peek(); 00081 if (offset < offend) { 00082 00083 while (isBlankspace(current) 00084 || isSeparator(current)) { 00085 offset++; 00086 stream.seekg(offset); 00087 current = stream.peek(); 00088 } 00089 00090 while (!isBlankspace(current) 00091 && !isSeparator(current) 00092 && offset < offend) { 00093 00094 //std::cerr << "[DEBUG] read: " 00095 // << static_cast<char>(current) 00096 // << std::endl; 00097 00098 if( lowercaseMode() ) { 00099 current = tolower( current ); 00100 } 00101 00102 str.push_back(current); 00103 00104 //std::cerr << "[DEBUG] pushed: " 00105 // << static_cast<char>(current) 00106 // << std::endl; 00107 00108 offset++; 00109 stream.seekg(offset); 00110 current = stream.peek(); 00111 } 00112 } 00113 00114 // do { 00115 // do { 00116 // current = stream.peek(); 00117 // offset++; 00118 // stream.seekg(offset); 00119 // 00120 // //std::cerr << "[DEBUG] read: " 00121 // // << static_cast<char>(current) 00122 // // << std::endl; 00123 // 00124 // if ( !isBlankspace(current) 00125 // && !isSeparator(current) 00126 // && offset <= offend) { 00127 // 00128 // if( lowercaseMode() ) { 00129 // current = tolower( current ); 00130 // } 00131 // 00132 // str.push_back(current); 00133 // 00134 // //std::cerr << "[DEBUG] pushed: " 00135 // // << static_cast<char>(current) 00136 // // << std::endl; 00137 // } 00138 // } while ( !isBlankspace(current) 00139 // && !isSeparator(current) 00140 // && offset < offend); 00141 // } while (str.empty() && (offset < offend)); 00142 } else { 00143 std::cerr << "stream is NOT good!" << std::endl; 00144 } 00145 00146 //std::cerr << "[DEBUG] token: " << str << std::endl; 00147 00148 return str; 00149 } 00150 00151 double ForwardTokenizer::progress() const 00152 { 00153 return static_cast<double>(offset) / offend; 00154 } 00155