presage
0.8.7
|
00001 00002 /****************************************************** 00003 * Presage, an extensible predictive text entry system 00004 * --------------------------------------------------- 00005 * 00006 * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk> 00007 00008 This program is free software; you can redistribute it and/or modify 00009 it under the terms of the GNU General Public License as published by 00010 the Free Software Foundation; either version 2 of the License, or 00011 (at your option) any later version. 00012 00013 This program is distributed in the hope that it will be useful, 00014 but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00016 GNU General Public License for more details. 00017 00018 You should have received a copy of the GNU General Public License along 00019 with this program; if not, write to the Free Software Foundation, Inc., 00020 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 00021 * 00022 **********(*)*/ 00023 00024 00025 #include "contextChangeDetector.h" 00026 #include "../tokenizer/reverseTokenizer.h" 00027 00028 #include <iostream> 00029 #include <sstream> 00030 #include <stdlib.h> // for atoi() 00031 #include <assert.h> 00032 00033 const std::string::size_type ContextChangeDetector::DEFAULT_SLIDING_WINDOW_SIZE = 80; 00034 00035 ContextChangeDetector::ContextChangeDetector(const std::string wChars, 00036 const std::string tChars, 00037 const std::string bChars, 00038 const std::string cChars) 00039 : wordChars (wChars), 00040 separatorChars (tChars), 00041 blankspaceChars(bChars), 00042 controlChars (cChars) 00043 { 00044 //std::cerr << "ContextChangeDetector::ContextChangeDetector()" << std::endl 00045 // << "wordChars: " << wordChars << std::endl; 00046 } 00047 00048 ContextChangeDetector::~ContextChangeDetector() 00049 { 00050 00051 } 00052 00053 void ContextChangeDetector::set_sliding_window_size(const std::string& str) 00054 { 00055 if(!str.empty()) { 00056 SLIDING_WINDOW_SIZE = atoi(str.c_str()); 00057 } else { 00058 SLIDING_WINDOW_SIZE = DEFAULT_SLIDING_WINDOW_SIZE; 00059 } 00060 } 00061 00062 void ContextChangeDetector::update_sliding_window(const std::string& str) 00063 { 00064 if (str.size() <= SLIDING_WINDOW_SIZE) { 00065 // past stream fits in sliding window 00066 sliding_window = str; 00067 } else { 00068 // trim past stream down to sliding window 00069 sliding_window = str.substr(str.size() - SLIDING_WINDOW_SIZE); 00070 assert(sliding_window.size() == SLIDING_WINDOW_SIZE); 00071 } 00072 } 00073 00074 bool ContextChangeDetector::context_change(const std::string& past_stream) const 00075 { 00076 // Here's how this is going to be implemented... We'll keep a 00077 // sliding window on the last few chars seen by presage; the 00078 // buffer holding them is the sliding window. We'll search for the 00079 // last occurence of sliding_window in past_stream, if any. 00080 00081 // If sliding_window is not found in past_stream, then it is not 00082 // possible to relate the current context to the previously seen 00083 // context, hence we assume a context change has occured. 00084 00085 // If sliding_window is found, then we need to examine the chars 00086 // following the sliding window in the past stream. We call this 00087 // the remainder. If there are any non-word chars in the 00088 // remainder, then a context change has occurred. Else, no context 00089 // change occured. 00090 00091 // The sliding window is never implicitly updated as part of 00092 // invoking this method. 00093 00094 return context_change_helper(sliding_window, past_stream); 00095 } 00096 00097 00098 bool ContextChangeDetector::context_change_helper(const std::string& prev_context, const std::string& curr_context) const 00099 { 00100 bool result = false; 00101 00102 if (prev_context.empty()) { 00103 if (curr_context.empty()) { 00104 // both contexts are empty, nothing has happened, no 00105 // change happened 00106 result = false; 00107 } else { 00108 // current context changed, previous context is empty, 00109 // first change happened 00110 result = true; // REVISIT: this should really be true, 00111 // but setting it to true screws up 00112 // learning 00113 } 00114 } else { 00115 // find position of previous context in current context 00116 // i.e. find index pointing to last char of last occurence of 00117 // prev_context in curr_context 00118 std::string::size_type ctx_idx = curr_context.rfind(prev_context); 00119 00120 if (ctx_idx == std::string::npos) { 00121 // prev_context could not be found in curr_context, a lot 00122 // changed 00123 result = true; 00124 } else { 00125 // found prev_context, examine remainder string. 00126 // remainder string is substr(ctx_idx + 00127 // prev_context.size()); i.e. substring given by index 00128 // returned by rfind (which points at beginning of 00129 // prev_context string found in curr_context) plus size of 00130 // prev_context: this index points at end of prev_context 00131 // substring found in curr_context 00132 00133 std::string remainder = curr_context.substr(ctx_idx + prev_context.size()); 00134 00135 std::string::size_type idx = remainder.find_last_of(wordChars); 00136 if (idx == std::string::npos) { 00137 if (remainder.empty()) { 00138 result = false; 00139 } else { 00140 char last_char = curr_context[ctx_idx + prev_context.size() - 1]; 00141 idx = wordChars.find(last_char); 00142 if (idx == std::string::npos) { 00143 result = false; 00144 } else { 00145 result = true; 00146 } 00147 } 00148 } else { 00149 if (idx == remainder.size() - 1) { 00150 result = false; 00151 } else { 00152 result = true; 00153 } 00154 } 00155 00156 00157 /* 00158 * alternate implementation of the logic in the enclosing else 00159 * block. This uses tokenizers, which is not desirable as it makes 00160 * tokenizer a dependency of context change detector. 00161 00162 std::string remainder = curr_context.substr(loc + prev_context.size()); 00163 00164 std::stringstream curr_strstream(curr_context); 00165 std::stringstream prev_strstream(prev_context); 00166 00167 ReverseTokenizer curr_tokenizer(curr_strstream, blankspaceChars, separatorChars); 00168 ReverseTokenizer prev_tokenizer(prev_strstream, blankspaceChars, separatorChars); 00169 00170 std::string prev_token = prev_tokenizer.nextToken(); 00171 std::string curr_token = curr_tokenizer.nextToken(); 00172 00173 if (curr_token.empty()) { 00174 if (prev_token.empty()) { 00175 result = false; 00176 00177 loc = curr_context.find_first_of(wordChars, loc); 00178 if (loc == std::string::npos) { 00179 result = false; 00180 } else { 00181 result = true; 00182 } 00183 00184 } else { 00185 result = true; 00186 } 00187 00188 } else { 00189 loc = curr_token.find(prev_token); 00190 if (loc == std::string::npos) { 00191 result = true; 00192 } else { 00193 result = false; 00194 } 00195 } 00196 */ 00197 00198 } 00199 } 00200 00201 return result; 00202 } 00203 00204 std::string ContextChangeDetector::change(const std::string& past_stream) const 00205 { 00206 const std::string& prev_context = sliding_window; // let's rename these 00207 const std::string& curr_context = past_stream; // for clarity's sake 00208 00209 std::string result; 00210 00211 if (sliding_window.empty()) { 00212 result = past_stream; 00213 } else { 00214 // find position of previous context in current context 00215 // i.e. find index pointing to last char of last occurence of 00216 // prev_context in curr_context 00217 std::string::size_type ctx_idx = curr_context.rfind(prev_context); 00218 00219 if (ctx_idx == std::string::npos) { 00220 // prev_context could not be found in curr_context, a lot 00221 // changed 00222 result = past_stream; 00223 } else { 00224 // found prev_context, examine remainder string. 00225 // remainder string is substr(ctx_idx + 00226 // prev_context.size()); i.e. substring given by index 00227 // returned by rfind (which points at beginning of 00228 // prev_context string found in curr_context) plus size of 00229 // prev_context: this index points at end of prev_context 00230 // substring found in curr_context 00231 00232 result = curr_context.substr(ctx_idx + prev_context.size()); 00233 } 00234 } 00235 00236 return result; 00237 } 00238 00239 std::string ContextChangeDetector::get_sliding_window() const 00240 { 00241 return sliding_window; 00242 }