presage  0.9.1
text2ngram.cpp
Go to the documentation of this file.
1 
2 /******************************************************
3  * Presage, an extensible predictive text entry system
4  * ---------------------------------------------------
5  *
6  * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
7 
8  This program is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation; either version 2 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License along
19  with this program; if not, write to the Free Software Foundation, Inc.,
20  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  **********(*)*/
23 
24 
25 #include "config.h"
26 
27 #include <iostream>
28 #include <fstream>
29 #include <vector>
30 #include <list>
31 #include <string>
32 #include <map>
33 
34 #ifdef HAVE_UNISTD_H
35 # include <unistd.h>
36 #endif
37 
38 #ifdef HAVE_STDLIB_H
39 # include <stdlib.h>
40 #endif
41 
42 #include <getopt.h>
43 #include <assert.h>
44 
46 #include "core/iso8859_1.h"
47 #include "core/progress.h"
48 
49 #include "../lib/predictors/dbconnector/sqliteDatabaseConnector.h"
50 
51 const std::string PROGRAM_NAME = "text2ngram"; // program name printed by usage() and version()
52 
53 typedef std::list<std::string> NgramList; // ordered tokens of one n-gram; key type of the count map built in main()
54 
55 void usage(); // prints the command-line help text to std::cout
56 void version(); // prints program/package name, version and the no-warranty notice to std::cout
57 
58 int main(int argc, char* argv[])
59 {
60  int next_option;
61 
62  // Setup some defaults
63  // - default to generating 1-gram counts
64  int ngrams = 1;
65 
66  // - default output to stdout (empty string signifies stdout)
67  std::string output;
68 
69  const std::string TABBED_SEPARATED_VALUES = "tsv";
70  const std::string SQLITE = "sqlite";
71  // - default format is tabbed separated values
72  std::string format = TABBED_SEPARATED_VALUES;
73 
74  // - default to case sensitive
75  bool lowercase = false;
76 
77  // - default to no append
78  bool append = false;
79 
80 
81  // getopt structures
82  const char * const short_options = "n:o:f:alhv";
83  const struct option long_options[] =
84  {
85  { "ngrams", required_argument, 0, 'n' },
86  { "output", required_argument, 0, 'o' },
87  { "format", required_argument, 0, 'f' },
88  { "append", no_argument, 0, 'a' },
89  { "lowercase", no_argument, 0, 'l' },
90  { "help", no_argument, 0, 'h' },
91  { "version", no_argument, 0, 'v' },
92  { 0, 0, 0, 0 }
93  };
94 
95  do {
96  next_option = getopt_long(argc,
97  argv,
98  short_options,
99  long_options,
100  NULL);
101 
102  switch (next_option) {
103  case 'n': // --ngrams or -n option
104  if (atoi(optarg) > 0) {
105  ngrams = atoi(optarg);
106  } else {
107  usage();
108  }
109  break;
110  case 'o': // --output or -o option
111  output = optarg;
112  break;
113  case 'f': // --format or -f option
114  if (optarg == SQLITE
115  || optarg == TABBED_SEPARATED_VALUES) {
116  format = optarg;
117  } else {
118  std::cerr << "Unknown format " << optarg << std::endl << std::endl;
119  usage();
120  return -1;
121  }
122  break;
123  case 'a': // --append or -a option
124  // append mode
125  append = true;
126  break;
127  case 'l': // --lowercase or -l option
128  lowercase = true;
129  break;
130  case 'h': // --help or -h option
131  usage();
132  exit (0);
133  break;
134  case 'v': // --version or -v option
135  version();
136  exit (0);
137  break;
138  case '?': // unknown option
139  usage();
140  exit (0);
141  break;
142  case -1:
143  break;
144  default:
145  std::cerr << "Error: unhandled option." << std::endl;
146  exit(0);
147  }
148 
149  } while (next_option != -1);
150 
151 
152  if ((argc - optind < 1)) {
153  usage();
154  return -1;
155  }
156 
157 
158  // ngramMap stores <token,count> pairs
159  std::map<NgramList, int> ngramMap;
160 
161  for (int i = optind; i < argc; i++) {
162  // do the actual processing file by file
163  std::string token;
164  NgramList ngram;
165 
166  // points to output file
167  // print out file information
168  std::cout << "Parsing " << argv[i] << "..."
169  << std::endl;
170 
171  ProgressBar<char> progressBar;
172 
173  // create tokenizer object and open input file stream
174  std::ifstream infile(argv[i]);
175  ForwardTokenizer tokenizer(infile,
176  " \f\n\r\t\v",
177  "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<");
178  tokenizer.lowercaseMode(lowercase);
179 
180  // take care of first N-1 tokens
181  for (int i = 0; (i < ngrams - 1 && tokenizer.hasMoreTokens()); i++) {
182  ngram.push_back(tokenizer.nextToken());
183  }
184 
185  while (tokenizer.hasMoreTokens()) {
186  // extract token from input stream
187  token = tokenizer.nextToken();
188 
189  // update ngram with new token
190  ngram.push_back(token);
191 
192  // update map with new token occurrence
193  ngramMap[ngram] = ngramMap[ngram] + 1;
194 
195  // update progress bar
196  //progressBar(tokenizer.progress());
197  progressBar.update(tokenizer.progress());
198 
199  // remove front token from ngram
200  ngram.pop_front();
201  }
202 
203  infile.close();
204  }
205 
206 
207  std::cout << "Writing out to " << format << " format file "
208  << output << "..." << std::endl;
209  if (format == TABBED_SEPARATED_VALUES) {
210  // output to tabbed separated values text file
211  //
212 
213  std::ofstream *outstream = 0;
214  std::ostream *prev_outstream = 0;
215 
216  if (output.c_str()) {
217  // tie outstream to file
218  outstream = new std::ofstream (output.c_str(), std::ios::out);
219  assert(outstream);
220  prev_outstream = std::cout.tie (outstream);
221  }
222 
223  // write results to output stream
224  ProgressBar<char> progressBar;
225  long total = ngramMap.size();
226  long count = 0;
227  std::map<NgramList, int>::const_iterator it;
228  for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
229  for (NgramList::const_iterator ngram_it = it->first.begin();
230  ngram_it != it->first.end();
231  ngram_it++) {
232  std::cout << *ngram_it << '\t';
233  }
234  std::cout << it->second << std::endl;
235  progressBar.update(static_cast<double>(count++)/total);
236  }
237 
238  if (output.c_str()) {
239  std::cout.tie (prev_outstream);
240  outstream->close ();
241  delete outstream;
242  }
243 
244  } else if (format == SQLITE) {
245  // output to SQLITE
246  //
247 
248  SqliteDatabaseConnector sqliteDbCntr(output, ngrams, true);
249  sqliteDbCntr.beginTransaction();
250  sqliteDbCntr.createNgramTable(ngrams);
251 
252  // write results to output stream
253  ProgressBar<char> progressBar;
254  long total = ngramMap.size();
255  long count = 0;
256  std::map<NgramList, int>::const_iterator it;
257  for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
258 
259  // convert from NgramList to Ngram
260  Ngram ngram;
261  for (NgramList::const_iterator jt = it->first.begin();
262  jt != it->first.end();
263  jt++) {
264  ngram.push_back(*jt);
265  }
266 
267  if (append) {
268  // need to check whether ngram is already in database.
269  // when appending to existing database
270  int count = sqliteDbCntr.getNgramCount(ngram);
271  if (count > 0) {
272  // ngram already in database, update count
273  sqliteDbCntr.updateNgram(ngram, count + it->second);
274  } else {
275  // ngram not in database, insert it
276  sqliteDbCntr.insertNgram(ngram, it->second);
277  }
278  } else {
279  // insert ngram
280  sqliteDbCntr.insertNgram(ngram, it->second);
281  }
282 
283  progressBar.update(static_cast<double>(count++)/total);
284  }
285  sqliteDbCntr.endTransaction();
286  } else {
287  abort();
288  }
289 
290 
291  std::cout << std::endl;
292 
293  return 0;
294 }
295 
296 
297 void version()
298 {
299  std::cout
300  << PROGRAM_NAME << " (" << PACKAGE << ") version " << VERSION << std::endl
301  << "Copyright (C) Matteo Vescovi" << std::endl
302  << "This is free software; see the source for copying conditions. There is NO" << std::endl
303  << "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." << std::endl
304  << std::endl;
305 }
306 
307 
308 void usage()
309 {
310  std::cout
311  << "Usage: " << PROGRAM_NAME << " [OPTION]... infiles..." << std::endl
312  << std::endl
313  << " --output, -o O " << "Output file name O" << std::endl
314  << " --ngrams, -n N " << "Specify ngram cardinality N" << std::endl
315  << " --format, -f F " << "Output file format F: sqlite, tsv (tabbed separated values)" << std::endl
316  << " --lowercase, -l " << "Enable lowercase conversion mode" << std::endl
317  << " --append, -a " << "Open output file in append mode" << std::endl
318  << " --help, -h " << "Display this information" << std::endl
319  << " --version, -v " << "Show version information" << std::endl
320  << std::endl
321  << PROGRAM_NAME << " is free software distributed under the GPL." << std::endl
322  << "Send bug reports to " << PACKAGE_BUGREPORT << std::endl
323  << "Copyright (C) Matteo Vescovi" << std::endl;
324 }
virtual void endTransaction() const
void createNgramTable(const size_t cardinality) const
virtual void beginTransaction() const
void insertNgram(const Ngram ngram, const int count) const
int getNgramCount(const Ngram ngram) const
void updateNgram(const Ngram ngram, const int count) const
virtual bool hasMoreTokens() const
virtual std::string nextToken()
virtual double progress() const
Definition: ngram.h:33
void update(const double percentage)
Definition: progress.h:54
void lowercaseMode(const bool)
Definition: tokenizer.cpp:81
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)
Definition: logger.h:278
int main(int argc, char *argv[])
Definition: text2ngram.cpp:58
std::list< std::string > NgramList
Definition: text2ngram.cpp:53
void usage()
Definition: text2ngram.cpp:308
const std::string PROGRAM_NAME
Definition: text2ngram.cpp:51
void version()
Definition: text2ngram.cpp:297