2 * Copyright (C) 2004-2006 Felipe Sánchez-Martínez
3 * Copyright (C) 2006 Universitat d'Alacant
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 * Main program of the TL-driven method used to train a HMM-based
22 * part-of-speech tagger. (source file)
24 * For a deeper description on how the method works read the paper:
26 * Exploring the use of target-language information to train the
27 * part-of-speech tagger of machine translation systems. By Felipe
28 * Sánchez-Martínez, Juan Antonio Pérez-Ortiz and Mikel L. Forcada.
29 * In Lecture Notes in Computer Science 3230 (Advances in Natural
30 * Language Processing, Proceedings of EsTAL - España for Natural
31 * Language Processing), p. 137-148, October 20-22, 2004, Alicante,
33 * © Springer-Verlag Berlin Heidelberg 2004
34 * http://www.dlsi.ua.es/~fsanchez/pub/pdf/sanchez04b.pdf
36 * @author Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es
39 #include "HMM_TL_driven_trainer.H"
41 #include "PathsPruner.H"
42 #include "configure.H"
44 #include <apertium/tagger_utils.h>
45 #include <apertium/tagger_word.h>
46 #include <apertium/utf_converter.h>
// Initial value of main()'s `mode` variable; main() reports an error path
// when `mode` still equals this value once option parsing has finished.
52 #define MODE_UNKNOWN 0
// Emits the usage synopsis followed by a description of every command-line
// option accepted by this program.
//
// name: text inserted at the very start of the synopsis line, in front of
//       the option list (presumably the program name, i.e. argv[0] --
//       TODO confirm against the call sites).
//
// NOTE(review): the line naming the output stream that this insertion chain
// starts from is not part of this excerpt (presumably cerr, as used by the
// rest of the file -- confirm), and neither is the closing brace.
57 void help(char *name) {
// Synopsis: mandatory options first, optional groups in brackets.
59 <<name<<" --tsxfile tsxfile --train <n> [--prune <m> <l> <p> <c> --initprob init.prob]"
60 <<" --file file --tscript tscript --lscript lscript [--trules transferrules]"
61 <<" [--supforms superficialforms] [--genpaths pathsfile]"
62 <<" [--translations transfile [--likelihoods likefiles]]"
63 <<" [--save <n>] [--norules] [--debug]\n\n"
// Per-option descriptions; each entry lists the long and short spelling.
65 <<" --tsxfile|-x: Specify the tagger specification file in XML format\n"
66 <<" --train|-t: Train the HMM-based part-of-speech tagger using\n"
67 <<" information from the target language.\n"
68 <<" Up to <n> words are processed from the training corpus\n"
69 <<" --file|-f: Used in conjunction with --train to specify the files\n"
70 <<" the method will work with\n"
72 <<" file.dic: Full expanded dictionary\n"
73 <<" file.crp: Training text corpus\n"
74 <<" Output files generated:\n"
75 <<" file.prob: HMM parameters\n"
76 <<" --tscript|-r: Specify the full path to the translation script\n"
77 <<" --lscript|-l: Specify the full path to the likelihood-evaluation\n"
79 <<" --trules|-u: Specify the file with the transfer rules used when translating\n"
80 <<" (see xtract_transfer_rules.sh)\n"
81 <<" --save|-s: Specify after how many words the HMM parameters must\n"
82 <<" be calculated and stored (optional)\n"
83 <<" --norules|-n: Forbidden and enforce rules will not be used to discard\n"
84 <<" disambiguation paths during training (by default those \n"
85 <<" rules are used)\n"
// Batch-training options (stages 1 to 3).
86 <<" --genpaths|-g: Specify a file in which all disambiguations paths\n"
87 <<" for each segment are written This cause translations not\n"
88 <<" to be performed, for batch training, 1st stage\n"
89 <<" --translations|-a: Specify a file from which all translations of\n"
90 <<" each segment are read. Used for batch training, 2nd stage\n"
91 <<" --likelihoods|-e: Specify a file from which the likelihood of each\n"
92 <<" translation is read. Used for batch training, 2nd/3rd stage\n"
93 <<" --supforms|-p: Specify a set of superficial forms (separated by '|') that\n"
94 <<" will be tested during the source-language text segmentation \n"
95 <<" to prevent the method from segmenting at those superficial forms\n"
// --prune consumes four values: <m> <l> <p> <c>.
96 <<" --prune|-k: Tell the algorithm that a disambiguation path pruning must be\n"
97 <<" performed. Meaning of the arguments to --prune:\n"
98 <<" <m> mode of prunning:\n"
99 <<" 1: Consider only those disambiguation paths whose a priori\n"
100 <<" likelihood is within the <p> mass probability of all the\n"
101 <<" disambiguation paths\n"
102 <<" <l> latency: after how many words should the parameters used to\n"
103 <<" discard disambiguation paths be updated with new ones\n"
104 <<" if -1, no updated will be performed\n"
105 <<" <p> mass of probability: used as a threshold to discard \n"
106 <<" disambiguation paths. It only has sense if pruning mode is 1\n"
107 <<" Range of possible values: 0 < p <= 1.0\n"
108 <<" <c> mixing constant function: constant to be used when mixing\n"
109 <<" new parameters (weigh of the new model). Range: c > 0 \n"
110 <<" --initprob|-b: Specify the file (.prob) with the initial parameters to be used\n"
111 <<" when pruning techniques are used\n"
112 <<" --debug|-d: Print debug information while operating\n"
113 <<" --help|-h: Prints this help message\n"
114 <<" --version|-v: Print version and license information and exits\n\n";
// Program entry point for the TL-driven HMM tagger trainer.
//
// Visible flow: echo the command line, parse the options with getopt_long,
// validate the option combinations required by --train / --prune, load the
// transfer rules and superficial forms, open the dictionary, corpus and
// parameter files, run either plain or pruning-based training, and finally
// write the HMM parameters.
//
// argc/argv: the usual command-line arguments; see help() for their meaning.
//
// NOTE(review): this excerpt omits many original lines (declarations, case
// labels, braces, the end of the function). Only lines whose full statement
// is visible were changed; nothing was invented for the gaps.
117 int main(int argc, char *argv[]) {
118 int mode=MODE_UNKNOWN;
122 TaggerWord::show_ingnored_string=false;
123 TransferRules transfer_rules;
// setlocale() returns a null pointer when the requested locale cannot be
// installed; streaming a null char* is invalid and would leave cerr in a
// failed state, silencing every later diagnostic. Print a placeholder instead.
const char *locale_name=setlocale(LC_ALL,"");
125 cerr<<"LOCALE: "<<(locale_name ? locale_name : "(could not be set)")<<"\n";
134 wstring supforms=L"";
136 bool use_tags_rules=true;
137 int prune_m=-1; //If greater than 0, path pruning is requested (set by --prune)
141 double mixing_c=-1.0;
148 int save_after_nwords=0;
153 //cerr<<PACKAGE_STRING<<"\n";
// Echo the full command line so that logs record how the run was started.
154 cerr<<"Command line: ";
155 for(int i=0; i<argc; i++)
// Long options; the last field of each entry is the short-option character
// handled by the option switch.
160 static struct option long_options[] =
162 {"tsxfile", required_argument, 0, 'x'},
163 {"train", required_argument, 0, 't'},
164 {"file", required_argument, 0, 'f'},
165 {"tscript", required_argument, 0, 'r'},
166 {"lscript", required_argument, 0, 'l'},
167 {"trules", required_argument, 0, 'u'},
168 {"supforms", required_argument, 0, 'p'},
169 {"prune", required_argument, 0, 'k'},
170 {"initprob", required_argument, 0, 'b'},
171 {"genpaths", required_argument, 0, 'g'},
172 {"translations",required_argument, 0, 'a'},
173 {"likelihoods", required_argument, 0, 'e'},
174 {"save", required_argument, 0, 's'},
175 {"norules", no_argument, 0, 'n'},
176 {"debug", no_argument, 0, 'd'},
177 {"help", no_argument, 0, 'h'},
178 {"version", no_argument, 0, 'v'},
// The short-option string must list every character used in long_options;
// "p:" was missing, so the documented short form -p was rejected as unknown.
182 c=getopt_long(argc, argv, "x:t:f:r:l:u:p:k:b:g:a:e:s:ndhv",long_options, &option_index);
// --version: package string followed by the license notice.
188 cerr<<PACKAGE_STRING<<"\n";
190 <<" Copyright (C) 2004-2006 Felipe Sánchez Martínez\n"
191 <<" 2006 Universitat d'Alacant\n\n"
192 <<" This program is free software; you can redistribute it and/or\n"
193 <<" modify it under the terms of the GNU General Public License as\n"
194 <<" published by the Free Software Foundation; either version 2 of the\n"
195 <<" License, or (at your option) any later version.\n"
196 <<" This program is distributed in the hope that it will be useful, but\n"
197 <<" WITHOUT ANY WARRANTY; without even the implied warranty of\n"
198 <<" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n"
199 <<" General Public License for more details.\n"
201 <<" You should have received a copy of the GNU General Public License\n"
202 <<" along with this program; if not, write to the Free Software\n"
203 <<" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA\n"
204 <<" 02111-1307, USA.\n";
// --prune <m> <l> <p> <c>: getopt only delivers <m> in optarg; the other
// three values are taken by hand from argv, advancing optind past them.
// Because of the mode test, --train has to precede --prune on the command line.
211 if (mode==MODE_TRAIN) {
// Make sure <l>, <p> and <c> are really there before indexing argv;
// otherwise atoi/atof would be handed argv[argc] (a null pointer) or worse.
if (optind+3>argc) {
cerr<<"Error: --prune requires four arguments: <m> <l> <p> <c>\n";
help(argv[0]);
exit(EXIT_FAILURE);
}
212 prune_m=atoi(optarg);
213 prune_l=atoi(argv[optind++]);
214 prune_p=atof(argv[optind++]);
215 mixing_c=atof(argv[optind++]);
217 cerr<<"Error: Unknown pruning mode "<<prune_m<<"\n";
// Probability mass must lie in (0, 1].
221 if ((prune_p<=0) || (prune_p>1.0)) {
222 cerr<<"Error: mass of probability parameter given to --prune option must be greater than 0.0 and less or equal to 1.0\n";
227 cerr<<"Error: Mixing constant c given to --prune must be greater than 0.0\n";
232 cerr<<"Error: --prune argument can only be used in conjunction with --train\n";
// --train <n>: atoi yields 0 for non-numeric text, which the check rejects too.
241 corpus_length = atoi(optarg);
242 if(corpus_length<=0) {
243 cerr<<"Error: mandatory --train argument <n> must be a positive integer\n";
262 save_after_nwords=atoi(optarg);
274 use_tags_rules=false;
// --supforms arrives as UTF-8 bytes and is kept as a wide string.
277 supforms=UtfConverter::fromUtf8(optarg);
295 cerr<<"Error: No tagger specification file was given, Use --tsxfile argument to provide that file\n";
300 if (mode==MODE_UNKNOWN) {
// Build the trainer and publish the parsed settings to the helper classes.
305 HMM_TL_driven_trainer hmm_trainer(tsxfile, &transfer_rules);
306 hmm_trainer.set_use_tags_rules(use_tags_rules);
308 Utils::translation_script=tscript;
309 Utils::likelihood_script=lscript;
311 PathsPruner::mode=prune_m;
312 PathsPruner::latency=prune_l;
313 PathsPruner::probmass=prune_p;
// Cross-option validation for training mode.
315 if (mode==MODE_TRAIN) {
317 cerr<<"Error: When using --train a file must be provided through the --file option\n";
322 cerr<<"Error: When using --train a translation script must be given through the --tscript option\n";
327 cerr<<"Error: When using --train a likelihood-evaluation script must be provided through the --lscript option\n";
// Pruning needs an initial model to score the disambiguation paths.
331 if ((prune_m>0) && (initprob=="")) {
332 cerr<<"Error: When using --train <n> --prune <k> initial parameters file must be provided through the --initprob option\n";
339 cerr<<"Reading transfer rules from file '"<<trules<<"' "<<flush;
340 transfer_rules.read_rules_from_file(trules);
343 transfer_rules.compile_regular_expressions();
346 cerr<<"Reading superficial forms to take into account when segmenting source-language text ... ";
347 transfer_rules.set_superficial_forms(supforms);
351 FILE *fdic, *fcrp, *fprob;
357 if (mode==MODE_TRAIN) {
// Initial parameters (--initprob) are read through fprob.
// NOTE(review): fprob is assigned again further down for the output file;
// confirm the read handle is closed on one of the lines missing from this
// excerpt, otherwise it leaks.
359 fprob = fopen(initprob.c_str(), "r");
360 if (!fprob) tagger_utils::file_name_error(initprob);
361 cerr<<"Reading apertium-tagger parameters from file '"<<initprob<<"'\n";
362 hmm_trainer.read_parameters(fprob);
366 cerr<<"Error: --genpaths cannot be used when --prune.\n";
367 cerr<<"First use --genpaths without --prune (normal training) and then\n";
368 cerr<<"use --translations and --likelihoods in conjunction (or not) with --prune\n";
// Input dictionary: <file>.dic
373 fdic = fopen((filename+".dic").c_str(), "r");
374 if (!fdic) tagger_utils::file_name_error(filename+".dic");
375 hmm_trainer.read_dictionary(fdic);
// Training corpus: <file>.crp
379 fcrp = fopen((filename+".crp").c_str(), "r");
380 if (!fcrp) tagger_utils::file_name_error(filename+".crp");
// Output parameters: <file>.prob (opened for writing, truncating any old file)
382 fprob = fopen((filename+".prob").c_str(), "w");
383 if (!fprob) tagger_utils::file_name_error(filename+".prob");
// Optional batch-training streams: paths (output), translations and
// likelihoods (input).
386 fpaths.open(pathsfile.c_str(), ios::out | ios::trunc);
387 if(fpaths.fail()) tagger_utils::file_name_error(pathsfile);
391 ftrans.open(transfile.c_str(), ios::in);
392 if (ftrans.fail()) tagger_utils::file_name_error(transfile);
396 flike.open(likefile.c_str(), ios::in);
397 if(flike.fail()) tagger_utils::file_name_error(likefile);
// Run the training itself: plain training or the pruning variant (the
// condition choosing between the two calls is on lines not shown here).
401 if (mode==MODE_TRAIN) {
403 hmm_trainer.train(fcrp, corpus_length, save_after_nwords, filename, fpaths, ftrans, flike);
405 hmm_trainer.train_pruning(fcrp, corpus_length, save_after_nwords, filename, mixing_c, ftrans, flike);
// Persist the final HMM parameters to <file>.prob.
409 if (mode==MODE_TRAIN) {
412 hmm_trainer.write_parameters(fprob);