35 "SmoothedNgramPredictor, a linear interpolating n-gram predictor",
36 "SmoothedNgramPredictor, long description." ),
39 learn_mode_set (false),
// set_deltas (excerpt): parse the whitespace-separated interpolation weights.
std::stringstream ss_deltas(value);
double delta = 0;
while (ss_deltas >> delta) {
    logger << DEBUG << "Pushing delta: " << delta << endl;
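For illustration, here is a minimal, self-contained sketch of the same parsing step with a made-up configuration value (the string and variable names below are hypothetical, not taken from the predictor): three whitespace-separated weights yield a cardinality of 3, i.e. a trigram model.

#include <cassert>
#include <sstream>
#include <vector>

int main()
{
    // hypothetical deltas configuration value: unigram, bigram and trigram weights
    std::stringstream ss_deltas("0.05 0.25 0.70");
    std::vector<double> deltas;
    double delta = 0;
    while (ss_deltas >> delta) {
        deltas.push_back(delta);
    }
    assert(deltas.size() == 3);   // the predictor's cardinality would be 3
    return 0;
}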
// ngram_to_string: join the n-gram's tokens with a leading and trailing separator.
const char separator[] = "|";
std::string result = separator;

for (Ngram::const_iterator it = ngram.begin(); it != ngram.end(); it++) {
    result += *it + separator;
}
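A small usage note (assuming, as the iterator loop suggests, that Ngram behaves like a vector of std::string): every token is followed by the separator, and the result also starts with one.

// hypothetical input/output for ngram_to_string:
//   ngram  = { "the", "quick", "brown" }
//   result = "|the|quick|brown|"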
// count (excerpt): build the n-gram from the last ngram_size tokens, shifted
// back by offset, and return its database count (for ngram_size == 0, the sum
// of all unigram counts is returned instead).
unsigned int result = 0;

assert(ngram_size >= 0);

if (ngram_size > 0) {
    Ngram ngram(ngram_size);
    copy(tokens.end() - ngram_size + offset, tokens.end() + offset, ngram.begin());
    result = db->getNgramCount(ngram);
} else {
    result = db->getUnigramCountsSum();
    logger << DEBUG << "unigram counts sum: " << result << endl;
}
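The offset argument shifts the window of trailing tokens that forms the n-gram; a worked illustration with hypothetical tokens, matching the call patterns used by predict() further below:

// tokens = { "the", "quick", "brown" }   (candidate word in the last position)
// count(tokens,  0, 3) -> count of { "the", "quick", "brown" }   (numerator n-gram)
// count(tokens,  0, 2) -> count of { "quick", "brown" }
// count(tokens, -1, 2) -> count of { "the", "quick" }            (denominator, shifted back one token)
// count(tokens, -1, 0) -> sum of all unigram counts (the ngram_size == 0 case)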
// predict (excerpt): gather up to max_partial_prediction_size prefix completion
// candidates, starting from the highest n-gram order and backing off to lower
// orders until enough candidates have been collected.
std::vector<std::string> prefixCompletionCandidates;
for (size_t k = cardinality; (k > 0 && prefixCompletionCandidates.size() < max_partial_prediction_size); k--) {
    logger << DEBUG << "Building partial prefix completion table of cardinality: " << k << endl;

    // build the prefix n-gram from the last k context tokens
    Ngram prefix_ngram(k);
    copy(tokens.end() - k, tokens.end(), prefix_ngram.begin());

    logger << DEBUG << "prefix_ngram: ";
    for (size_t r = 0; r < prefix_ngram.size(); r++) {
        logger << DEBUG << prefix_ngram[r] << ' ';
    }
    // query the database for n-grams that complete the prefix
    NgramTable partial;
    partial = db->getNgramLikeTable(prefix_ngram, max_partial_prediction_size - prefixCompletionCandidates.size());
    logger << DEBUG << "partial prefixCompletionCandidates" << endl
           << DEBUG << "----------------------------------" << endl;
    for (size_t j = 0; j < partial.size(); j++) {
        for (size_t m = 0; m < partial[j].size(); m++) {
            logger << DEBUG << partial[j][m] << " ";
        }
    }

    logger << DEBUG << "Partial prefix completion table contains " << partial.size() << " potential completions." << endl;
    // scan the returned rows and keep previously unseen candidate tokens until
    // the requested number of candidates has been gathered
    std::vector<Ngram>::const_iterator it = partial.begin();
    while (it != partial.end() && prefixCompletionCandidates.size() < max_partial_prediction_size) {
        // the candidate token sits just before the last field of each row
        std::string candidate = *(it->end() - 2);
        if (find(prefixCompletionCandidates.begin(),
                 prefixCompletionCandidates.end(),
                 candidate) == prefixCompletionCandidates.end()) {
            prefixCompletionCandidates.push_back(candidate);
        }
        it++;
    }
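The *(it->end() - 2) above implies that each row returned by getNgramLikeTable carries one trailing non-token field after the completed word, presumably the row's count; a hypothetical row, for illustration only:

// prefix_ngram     = { "the", "qu" }              (completing the prefix "qu")
// one returned row = { "the", "quick", "137" }    (count assumed to be stored last)
// candidate        = *(row.end() - 2) == "quick"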
logger << DEBUG << "prefixCompletionCandidates" << endl
       << DEBUG << "--------------------------" << endl;
for (size_t j = 0; j < prefixCompletionCandidates.size(); j++) {
    logger << DEBUG << prefixCompletionCandidates[j] << endl;
}
// predict (excerpt): compute each candidate's probability by linearly
// interpolating relative n-gram frequencies of increasing order.
for (size_t j = 0; (j < prefixCompletionCandidates.size() && j < max_partial_prediction_size); j++) {
    // place the candidate in the last position of the context tokens
    tokens[cardinality - 1] = prefixCompletionCandidates[j];

    logger << DEBUG << "------------------" << endl;

    double probability = 0;
    for (size_t k = 0; k < cardinality; k++) {
        double numerator = count(tokens, 0, k + 1);
        // for k == 0 the denominator is the cached sum of all unigram counts
        double denominator = (k == 0 ? unigrams_counts_sum : count(tokens, -1, k));
        double frequency = ((denominator > 0) ? (numerator / denominator) : 0);
        probability += deltas[k] * frequency;

        logger << DEBUG << "numerator: " << numerator << endl;
        logger << DEBUG << "denominator: " << denominator << endl;
        logger << DEBUG << "frequency: " << frequency << endl;

        assert(numerator <= denominator);
        assert(frequency <= 1);
    }

    logger << DEBUG << "probability: " << probability << endl;

    if (probability > 0) {
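Taken together, the inner loop computes a simple linear interpolation: probability(w | context) = sum over k of deltas[k] * C(k+1-gram ending in w) / C(k-gram context). A self-contained sketch of that arithmetic with hard-coded, hypothetical counts (none of the numbers or tokens below come from the predictor):

#include <cassert>
#include <cstdio>
#include <vector>

int main()
{
    // hypothetical interpolation weights for unigram, bigram, trigram orders
    std::vector<double> deltas;
    deltas.push_back(0.05);
    deltas.push_back(0.25);
    deltas.push_back(0.70);

    // hypothetical counts for the candidate "brown" after "the quick":
    //   k == 0:  C("brown")            / sum of all unigram counts
    //   k == 1:  C("quick brown")      / C("quick")
    //   k == 2:  C("the quick brown")  / C("the quick")
    double numerator[]   = { 120.0, 30.0, 12.0 };
    double denominator[] = { 100000.0, 200.0, 40.0 };

    double probability = 0;
    for (std::size_t k = 0; k < deltas.size(); k++) {
        double frequency = (denominator[k] > 0) ? numerator[k] / denominator[k] : 0;
        assert(frequency <= 1);
        probability += deltas[k] * frequency;
    }

    // 0.05 * 0.0012 + 0.25 * 0.15 + 0.70 * 0.30 = 0.24756
    std::printf("interpolated probability: %f\n", probability);
    return 0;
}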
// learn (excerpt): build an in-memory map from every n-gram of each order
// (1 .. cardinality) found in the change to the number of times it occurs.
std::map<std::list<std::string>, int> ngramMap;

for (size_t curr_cardinality = 1; curr_cardinality <= cardinality; curr_cardinality++) {
    int change_idx = 0;
    int change_size = change.size();

    std::list<std::string> ngram_list;

    // seed the sliding window with the first curr_cardinality - 1 tokens
    for (int i = 0; (i < curr_cardinality - 1 && change_idx < change_size); i++) {
        ngram_list.push_back(change[change_idx]);
        change_idx++;
    }

    // slide the window over the remaining tokens, counting each n-gram
    while (change_idx < change_size) {
        ngram_list.push_back(change[change_idx++]);
        ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
        ngram_list.pop_front();
    }
}
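A minimal, self-contained sketch of the same sliding-window counting for a single cardinality (the change tokens are made up; only the shape of the loop mirrors the fragment above):

#include <cassert>
#include <list>
#include <map>
#include <string>
#include <vector>

int main()
{
    // hypothetical change: three newly entered tokens
    std::vector<std::string> change;
    change.push_back("foo");
    change.push_back("bar");
    change.push_back("foobar");

    std::map<std::list<std::string>, int> ngramMap;
    const std::size_t curr_cardinality = 2;   // count bigrams only, for brevity

    std::size_t change_idx = 0;
    std::list<std::string> ngram_list;

    // seed the window with the first curr_cardinality - 1 tokens
    while (ngram_list.size() < curr_cardinality - 1 && change_idx < change.size()) {
        ngram_list.push_back(change[change_idx++]);
    }

    // slide the window over the rest, counting each complete n-gram
    while (change_idx < change.size()) {
        ngram_list.push_back(change[change_idx++]);
        ngramMap[ngram_list] += 1;
        ngram_list.pop_front();
    }

    // ngramMap now holds { "foo", "bar" } -> 1 and { "bar", "foobar" } -> 1
    assert(ngramMap.size() == 2);
    return 0;
}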
// learn (excerpt): when the change is short, extra tokens from the previous
// context (obtained via ContextTracker::getExtraTokenToLearn inside a loop
// elided here) are prepended so that n-grams spanning the boundary are counted.
if (change.size() > 0 && /* further condition elided in this excerpt */) {
    std::list<std::string> ngram_list(change.begin(), change.begin() + 1);

    logger << DEBUG << "Adding extra token: " << extra_token << endl;

    if (extra_token.empty()) {
        break;   // no more context history to draw from
    }
    ngram_list.push_front(extra_token);

    ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
// learn (excerpt): write the accumulated n-gram counts to the database.
std::map<std::list<std::string>, int>::const_iterator it;
for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
    // convert the list-based key into a vector-based Ngram
    Ngram ngram((it->first).begin(), (it->first).end());
    // existing n-grams get their count updated, new ones are inserted (elided)
}

db->endTransaction();
logger << INFO << "Committed learning update to database" << endl;
// learn (excerpt): on failure, the whole learning update is rolled back.
db->rollbackTransaction();
logger << ERROR << "Rolling back learning update : " << ex.what() << endl;
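For orientation, a sketch of the transaction shape these learn() fragments appear to come from; the exact exception type and the update-versus-insert branch are assumptions pieced together from the referenced database methods, not quoted from the source:

// try {
//     db->beginTransaction();
//     // for every (ngram, n) pair accumulated in ngramMap:
//     //     if db->getNgramCount(ngram) > 0, update the stored count
//     //     (db->updateNgram) and re-check consistency; otherwise insert it
//     //     (db->insertNgram)
//     db->endTransaction();
// } catch (/* database exception, type assumed */) {
//     db->rollbackTransaction();
//     throw;
// }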
// check_learn_consistency (excerpt): an n-gram's count must never exceed the
// count of the shorter n-gram obtained by dropping its last token; when the
// invariant is violated, the shorter n-gram's count is incremented.
size_t size = ngram.size();
for (size_t i = 0; i < size; i++) {
    if (count(ngram, -i, size - i) > count(ngram, -(i + 1), size - (i + 1))) {
        logger << INFO << "consistency adjustment needed!" << endl;

        int offset = -(i + 1);
        int sub_ngram_size = size - (i + 1);

        logger << DEBUG << "i: " << i << " | offset: " << offset << " | sub_ngram_size: " << sub_ngram_size << endl;

        Ngram sub_ngram(sub_ngram_size);
        copy(ngram.end() - sub_ngram_size + offset, ngram.end() + offset, sub_ngram.begin());

        logger << "ngram to be count adjusted is: ";
        for (size_t r = 0; r < sub_ngram.size(); r++) {
            logger << sub_ngram[r] << ' ';
        }

        db->incrementNgramCount(sub_ngram);
        logger << DEBUG << "consistency adjusted" << endl;
    }
}
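A worked illustration of the invariant being restored (tokens and counts are hypothetical):

// Suppose learning has just raised C({"the","quick","brown"}) to 5 while
// C({"the","quick"}) is still 4. The comparison above sees 5 > 4, builds
// sub_ngram = {"the","quick"} and calls db->incrementNgramCount(sub_ngram),
// so every n-gram count is again bounded by the count of the shorter n-gram
// it extends.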
References:

ContextTracker: tracks user interaction and context.
std::string getExtraTokenToLearn(const int index, const std::vector< std::string > &change) const
std::string getToken(const int) const
virtual void endTransaction() const
virtual void beginTransaction() const
virtual void rollbackTransaction() const
NgramTable getNgramLikeTable(const Ngram ngram, int limit=-1) const
NgramTable getNgramLikeTableFiltered(const Ngram ngram, const char **filter, int limit=-1) const
int incrementNgramCount(const Ngram ngram) const
void insertNgram(const Ngram ngram, const int count) const
int getUnigramCountsSum() const
int getNgramCount(const Ngram ngram) const
void updateNgram(const Ngram ngram, const int count) const
void dispatch(const Observable *var)
void map(Observable *var, const mbr_func_ptr_t &ptr)
virtual std::string get_name() const =0
virtual std::string get_value() const =0
void addSuggestion(Suggestion)
ContextTracker * contextTracker
const std::string PREDICTORS
virtual void set_logger(const std::string &level)
virtual const char * what() const
void check_learn_consistency(const Ngram &name) const
Dispatcher< SmoothedNgramPredictor > dispatcher
std::vector< double > deltas
void set_database_logger_level(const std::string &level)
virtual void learn(const std::vector< std::string > &change)
unsigned int count(const std::vector< std::string > &tokens, int offset, int ngram_size) const
Builds the required n-gram and returns its count.
virtual void update(const Observable *variable)
void set_dbfilename(const std::string &filename)
void set_learn(const std::string &learn_mode)
SmoothedNgramPredictor(Configuration *, ContextTracker *, const char *)
virtual Prediction predict(const size_t size, const char **filter) const
Generate prediction.
~SmoothedNgramPredictor()
void set_deltas(const std::string &deltas)
std::string DATABASE_LOGGER
void init_database_connector_if_ready()
static double toDouble(const std::string)
static bool isYes(const char *)
std::vector< Ngram > NgramTable
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)
static std::string ngram_to_string(const Ngram &ngram)