2 * Copyright (C) 2004-2006 Felipe Sánchez-Martínez
3 * Copyright (C) 2006 Universitat d'Alacant
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 * Implementation of the method that trains a HMM-based part-of-speech
22 * tagger using information from the target language (TL) by means of
23 * the apertium MT system. (header file)
25 * For a deeper description on how the method works read the paper:
27 * Exploring the use of target-language information to train the
28 * part-of-speech tagger of machine translation systems. By Felipe
29 * Sánchez-Martínez, Juan Antonio Pérez-Ortiz and Mikel L. Forcada.
30 * In Lecture Notes in Computer Science 3230 (Advances in Natural
31 * Language Processing, Proceedings of EsTAL - España for Natural
32 * Language Processing), p. 137-148, October 20-22, 2004, Alicante,
34 * © Springer-Verlag Berlin Heidelberg 2004
35 * http://www.dlsi.ua.es/~fsanchez/pub/pdf/sanchez04b.pdf
37 * @author Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es
40 #ifndef __HMM_TL_DRIVEN_TRAINER_H
41 #define __HMM_TL_DRIVEN_TRAINER_H
50 #include <apertium/Collection.H>
51 #include <apertium/ConstantManager.H>
52 #include <apertium/MorphoStream.H>
53 #include <apertium/TaggerData.H>
54 #include <apertium/TaggerUtils.H>
55 #include <apertium/TaggerWord.H>
56 #include <apertium/HMM.H>
57 #include <apertium/TTag.H>
60 #include "Translations.H"
61 #include "TransferRules.H"
// Numeric limits named for the per-segment number of paths and of
// translations. Where and how they are enforced is not visible in this
// header -- presumably in the trainer implementation; TODO confirm.
63 #define MAX_PATHS_PER_SEGMENT 25000000
64 #define MAX_TRANSLATIONS_PER_SEGMENT 100000 //NEW
72 class HMM_TL_driven_trainer {
75 TaggerData tagger_data;
77 //TaggerData of an initial model, used when mixing parameters
78 TaggerData tagger_data_initial_model;
84 map<int, map<int, bool> > allowed_bigrams; //Matrix (NxN) with all the
85 //bigrams, this matrix is
86 //initialized after calling
87 //init_allowed_bigrams.
89 TransferRules* transfer_rules;
91 void init_allowed_bigrams();
93 void update_counts(Segment* seg, vector<Translations*> &trans,
94 map<int, map<int, double> > &tags_pair,
95 map<int, map<int, double> > &emis, TTag last_tag_prev_segment,
96 map<int, double> &tags_count,
97 map<int, double> &ambclass_count,
98 map<int, double> &tags_count_for_emis);
100 bool is_feasible_path(const TTag& last_etq_ant, const vector<TTag>& etqpart);
102 void calculate_parameters(map<int, map<int, double> > &tags_pairs, map<int, map<int, double> > &emis);
104 void mix_parameters(double words_processed, double corpus_length, double mixing_c);
110 HMM_TL_driven_trainer(string tsxfile, TransferRules* tr);
114 ~HMM_TL_driven_trainer();
116 /** Set the flag use_tags_rules to a desired value. If false, the
117 * information that comes from forbidden and enforce rules will not be
120 void set_use_tags_rules(bool b);
122 void read_dictionary(FILE *fdic);
124 void read_parameters(FILE *in);
126 void write_parameters(FILE *out);
128 void train(FILE *is, int corpus_length, int save_after_nwords, string filename, ofstream& fpaths, ifstream& ftrans, ifstream& flike);
130 void train_pruning(FILE *is, int corpus_length, int save_after_nwords, string filename, double mixing_c, ifstream& ftrans, ifstream& flike);