49 #include "../lib/predictors/dbconnector/sqliteDatabaseConnector.h"
58 int main(
int argc,
char* argv[])
69 const std::string TABBED_SEPARATED_VALUES =
"tsv";
70 const std::string SQLITE =
"sqlite";
72 std::string format = TABBED_SEPARATED_VALUES;
75 bool lowercase =
false;
82 const char *
const short_options =
"n:o:f:alhv";
83 const struct option long_options[] =
85 {
"ngrams", required_argument, 0,
'n' },
86 {
"output", required_argument, 0,
'o' },
87 {
"format", required_argument, 0,
'f' },
88 {
"append", no_argument, 0,
'a' },
89 {
"lowercase", no_argument, 0,
'l' },
90 {
"help", no_argument, 0,
'h' },
91 {
"version", no_argument, 0,
'v' },
96 next_option = getopt_long(argc,
102 switch (next_option) {
104 if (atoi(optarg) > 0) {
105 ngrams = atoi(optarg);
115 || optarg == TABBED_SEPARATED_VALUES) {
145 std::cerr <<
"Error: unhandled option." <<
std::endl;
149 }
while (next_option != -1);
152 if ((argc - optind < 1)) {
159 std::map<NgramList, int> ngramMap;
161 for (
int i = optind; i < argc; i++) {
168 std::cout <<
"Parsing " << argv[i] <<
"..."
174 std::ifstream infile(argv[i]);
177 "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<");
181 for (
int i = 0; (i < ngrams - 1 && tokenizer.
hasMoreTokens()); i++) {
190 ngram.push_back(token);
193 ngramMap[ngram] = ngramMap[ngram] + 1;
207 std::cout <<
"Writing out to " << format <<
" format file "
209 if (format == TABBED_SEPARATED_VALUES) {
213 std::ofstream *outstream = 0;
214 std::ostream *prev_outstream = 0;
216 if (output.c_str()) {
218 outstream =
new std::ofstream (output.c_str(), std::ios::out);
220 prev_outstream = std::cout.tie (outstream);
225 long total = ngramMap.size();
227 std::map<NgramList, int>::const_iterator it;
228 for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
229 for (NgramList::const_iterator ngram_it = it->first.begin();
230 ngram_it != it->first.end();
232 std::cout << *ngram_it <<
'\t';
235 progressBar.
update(
static_cast<double>(count++)/total);
238 if (output.c_str()) {
239 std::cout.tie (prev_outstream);
244 }
else if (format == SQLITE) {
254 long total = ngramMap.size();
256 std::map<NgramList, int>::const_iterator it;
257 for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
261 for (NgramList::const_iterator jt = it->first.begin();
262 jt != it->first.end();
264 ngram.push_back(*jt);
273 sqliteDbCntr.
updateNgram(ngram, count + it->second);
283 progressBar.
update(
static_cast<double>(count++)/total);
301 <<
"Copyright (C) Matteo Vescovi" <<
std::endl
302 <<
"This is free software; see the source for copying conditions. There is NO" <<
std::endl
303 <<
"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." <<
std::endl
313 <<
" --output, -o O " <<
"Output file name O" <<
std::endl
314 <<
" --ngrams, -n N " <<
"Specify ngram cardinality N" <<
std::endl
315 <<
" --format, -f F " <<
"Output file format F: sqlite, tsv (tabbed separated values)" <<
std::endl
316 <<
" --lowercase, -l " <<
"Enable lowercase conversion mode" <<
std::endl
317 <<
" --append, -a " <<
"Open output file in append mode" <<
std::endl
318 <<
" --help, -h " <<
"Display this information" <<
std::endl
319 <<
" --version, -v " <<
"Show version information" <<
std::endl
322 <<
"Send bug reports to " << PACKAGE_BUGREPORT <<
std::endl
323 <<
"Copyright (C) Matteo Vescovi" <<
std::endl;
virtual void endTransaction() const
void createNgramTable(const size_t cardinality) const
virtual void beginTransaction() const
void insertNgram(const Ngram ngram, const int count) const
int getNgramCount(const Ngram ngram) const
void updateNgram(const Ngram ngram, const int count) const
virtual bool hasMoreTokens() const
virtual std::string nextToken()
virtual double progress() const
void update(const double percentage)
void lowercaseMode(const bool)
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)
int main(int argc, char *argv[])
std::list< std::string > NgramList
const std::string PROGRAM_NAME