presage  0.8.7
contextChangeDetector.cpp
Go to the documentation of this file.
00001 
00002 /******************************************************
00003  *  Presage, an extensible predictive text entry system
00004  *  ---------------------------------------------------
00005  *
00006  *  Copyright (C) 2008  Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
00007 
00008     This program is free software; you can redistribute it and/or modify
00009     it under the terms of the GNU General Public License as published by
00010     the Free Software Foundation; either version 2 of the License, or
00011     (at your option) any later version.
00012 
00013     This program is distributed in the hope that it will be useful,
00014     but WITHOUT ANY WARRANTY; without even the implied warranty of
00015     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016     GNU General Public License for more details.
00017 
00018     You should have received a copy of the GNU General Public License along
00019     with this program; if not, write to the Free Software Foundation, Inc.,
00020     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00021                                                                              *
00022                                                                 **********(*)*/
00023 
00024 
00025 #include "contextChangeDetector.h"
00026 #include "../tokenizer/reverseTokenizer.h"
00027 
00028 #include <iostream>
00029 #include <sstream>
00030 #include <stdlib.h>  // for atoi()
00031 #include <assert.h>
00032 
00033 const std::string::size_type ContextChangeDetector::DEFAULT_SLIDING_WINDOW_SIZE = 80;
00034 
00035 ContextChangeDetector::ContextChangeDetector(const std::string wChars,
00036                                              const std::string tChars,
00037                                              const std::string bChars,
00038                                              const std::string cChars)
00039     : wordChars      (wChars),
00040       separatorChars (tChars),
00041       blankspaceChars(bChars),
00042       controlChars   (cChars)
00043 {
00044     //std::cerr << "ContextChangeDetector::ContextChangeDetector()" << std::endl
00045     //          << "wordChars: " << wordChars << std::endl;
00046 }
00047 
00048 ContextChangeDetector::~ContextChangeDetector()
00049 {
00050 
00051 }
00052 
00053 void ContextChangeDetector::set_sliding_window_size(const std::string& str)
00054 {
00055     if(!str.empty()) {
00056         SLIDING_WINDOW_SIZE = atoi(str.c_str());
00057     } else {
00058         SLIDING_WINDOW_SIZE = DEFAULT_SLIDING_WINDOW_SIZE;
00059     }
00060 }
00061 
00062 void ContextChangeDetector::update_sliding_window(const std::string& str)
00063 {
00064     if (str.size() <= SLIDING_WINDOW_SIZE) {
00065         // past stream fits in sliding window
00066         sliding_window = str;
00067     } else {
00068         // trim past stream down to sliding window
00069         sliding_window = str.substr(str.size() - SLIDING_WINDOW_SIZE);
00070         assert(sliding_window.size() == SLIDING_WINDOW_SIZE);
00071     }
00072 }
00073 
00074 bool ContextChangeDetector::context_change(const std::string& past_stream) const
00075 {
00076     // Here's how this is going to be implemented...  We'll keep a
00077     // sliding window on the last few chars seen by presage; the
00078     // buffer holding them is the sliding window. We'll search for the
00079     // last occurence of sliding_window in past_stream, if any.
00080 
00081     // If sliding_window is not found in past_stream, then it is not
00082     // possible to relate the current context to the previously seen
00083     // context, hence we assume a context change has occured.
00084 
00085     // If sliding_window is found, then we need to examine the chars
00086     // following the sliding window in the past stream. We call this
00087     // the remainder. If there are any non-word chars in the
00088     // remainder, then a context change has occurred. Else, no context
00089     // change occured.
00090 
00091     // The sliding window is never implicitly updated as part of
00092     // invoking this method.
00093 
00094     return context_change_helper(sliding_window, past_stream);
00095 }
00096 
00097 
00098 bool ContextChangeDetector::context_change_helper(const std::string& prev_context, const std::string& curr_context) const
00099 {
00100     bool result = false;
00101     
00102     if (prev_context.empty()) {
00103         if (curr_context.empty()) {
00104             // both contexts are empty, nothing has happened, no
00105             // change happened
00106             result = false;
00107         } else {
00108             // current context changed, previous context is empty,
00109             // first change happened
00110             result = true;  // REVISIT: this should really be true,
00111                             // but setting it to true screws up
00112                             // learning
00113         }
00114     } else {
00115         // find position of previous context in current context
00116         // i.e. find index pointing to last char of last occurence of
00117         // prev_context in curr_context
00118         std::string::size_type ctx_idx = curr_context.rfind(prev_context);
00119         
00120         if (ctx_idx == std::string::npos) {
00121             // prev_context could not be found in curr_context, a lot
00122             // changed
00123             result = true;
00124         } else {
00125             // found prev_context, examine remainder string.
00126             // remainder string is substr(ctx_idx +
00127             // prev_context.size()); i.e. substring given by index
00128             // returned by rfind (which points at beginning of
00129             // prev_context string found in curr_context) plus size of
00130             // prev_context: this index points at end of prev_context
00131             // substring found in curr_context
00132 
00133             std::string remainder = curr_context.substr(ctx_idx + prev_context.size());
00134 
00135             std::string::size_type idx = remainder.find_last_of(wordChars);
00136             if (idx == std::string::npos) {
00137                 if (remainder.empty()) {
00138                     result = false;
00139                 } else {
00140                     char last_char = curr_context[ctx_idx + prev_context.size() - 1];
00141                     idx = wordChars.find(last_char);
00142                     if (idx == std::string::npos) {
00143                         result = false;
00144                     } else {
00145                         result = true;
00146                     }
00147                 }
00148             } else {
00149                 if (idx == remainder.size() - 1) {
00150                     result = false;
00151                 } else {
00152                     result = true;
00153                 }
00154             }
00155 
00156 
00157 /*
00158  * alternate implementation of the logic in the enclosing else
00159  * block. This uses tokenizers, which is not desirable as it makes
00160  * tokenizer a dependency of context change detector.
00161 
00162             std::string remainder = curr_context.substr(loc + prev_context.size());
00163 
00164             std::stringstream curr_strstream(curr_context);
00165             std::stringstream prev_strstream(prev_context);
00166 
00167             ReverseTokenizer curr_tokenizer(curr_strstream, blankspaceChars, separatorChars);
00168             ReverseTokenizer prev_tokenizer(prev_strstream, blankspaceChars, separatorChars);
00169 
00170             std::string prev_token = prev_tokenizer.nextToken();
00171             std::string curr_token = curr_tokenizer.nextToken();
00172 
00173             if (curr_token.empty()) {
00174                 if (prev_token.empty()) {
00175                     result = false;
00176                     
00177                     loc = curr_context.find_first_of(wordChars, loc);
00178                     if (loc == std::string::npos) {
00179                         result = false;
00180                     } else {
00181                         result = true;
00182                     }
00183 
00184                 } else {
00185                     result = true;
00186                 }
00187 
00188             } else {
00189                 loc = curr_token.find(prev_token);
00190                 if (loc == std::string::npos) {
00191                     result = true;
00192                 } else {
00193                     result = false;
00194                 }
00195             }
00196 */
00197 
00198         }
00199     }
00200 
00201     return result;
00202 }
00203 
00204 std::string ContextChangeDetector::change(const std::string& past_stream) const
00205 {
00206     const std::string& prev_context = sliding_window;  // let's rename these
00207     const std::string& curr_context = past_stream;     // for clarity's sake
00208 
00209     std::string result;
00210 
00211     if (sliding_window.empty()) {
00212         result = past_stream;
00213     } else {
00214         // find position of previous context in current context
00215         // i.e. find index pointing to last char of last occurence of
00216         // prev_context in curr_context
00217         std::string::size_type ctx_idx = curr_context.rfind(prev_context);
00218         
00219         if (ctx_idx == std::string::npos) {
00220             // prev_context could not be found in curr_context, a lot
00221             // changed
00222             result = past_stream;
00223         } else {
00224             // found prev_context, examine remainder string.
00225             // remainder string is substr(ctx_idx +
00226             // prev_context.size()); i.e. substring given by index
00227             // returned by rfind (which points at beginning of
00228             // prev_context string found in curr_context) plus size of
00229             // prev_context: this index points at end of prev_context
00230             // substring found in curr_context
00231 
00232             result = curr_context.substr(ctx_idx + prev_context.size());
00233         }
00234     }
00235 
00236     return result;
00237 }
00238 
00239 std::string ContextChangeDetector::get_sliding_window() const
00240 {
00241     return sliding_window;
00242 }