2 * Copyright (C) 2004-2006 Felipe Sánchez-Martínez
3 * Copyright (C) 2006 Universitat d'Alacant
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 * Class Segment. It defines a text segment (source file)
23 * @author Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es
// Definition of the static member Segment::tag_index, a map from tag names
// to TTag values. In this file it is looked up with the key "TAG_kEOF"
// (see get_path and new_segment).
30 map<string, TTag> Segment::tag_index;
// NOTE(review): presumably returns npaths, the number of disambiguation
// paths that new_segment stores in the segment -- confirm against the body.
37 Segment::get_number_paths() {
// Receives a tag-name -> TTag map by value.
// NOTE(review): presumably stores it in the static Segment::tag_index that
// get_path and new_segment read -- confirm against the body.
42 Segment::set_tag_index(map<string, TTag> ti) {
// Decodes the path number `path` into one concrete tag per word of this
// segment. The tag chosen for each word is appended to etqpart, and the
// lexical form of each word for its chosen tag is appended to s.
// NOTE(review): s is presumably the string this method returns -- confirm.
47 Segment::get_path(vector <TTag>& etqpart, int path) {
// One iteration per word; contador_caminos[i] holds the candidate tags of word i.
51 for(size_t i=0; i<contador_caminos.size(); i++) {
// nfijo_caminos[i] is built in new_segment by multiplying the candidate-tag
// counts of the words that come after word i. Dividing by it and taking the
// remainder modulo this word's own tag count selects the position of the
// tag that word i takes in this path.
52 int tag_position=((int)(path/nfijo_caminos[i]))%contador_caminos[i].size();
// Lexical form of word i for the selected tag; the TAG_kEOF tag is passed
// along as the second argument.
53 s+=vwords[i].get_lexical_form(contador_caminos[i][tag_position], tag_index["TAG_kEOF"]);
// True for every word except the last one, and only when get_plus_cut()
// is false for that word.
// NOTE(review): presumably guards the insertion of a separator between
// lexical forms -- confirm.
54 if ((!vwords[i].get_plus_cut()) && (i<(contador_caminos.size()-1)))
// Record the tag selected for word i.
56 etqpart.push_back(contador_caminos[i][tag_position]);
// Builds the next Segment from the word stream `ms`.
//
// Words are read from `ms` into a function-local static buffer. The segment
// runs from index_start up to the first position that `tr` reports as a
// segmentation point. For every word in the segment its candidate tags are
// stored in contador_caminos, and the product of the candidate counts is
// stored as the number of paths (npaths). Words before and after the segment
// are copied to vwords_before / vwords_after.
//
// State (wordsbuffer, index_start, first_call, end_of_corpus_reached) is kept
// in static locals, so each call continues where the previous one stopped.
// NOTE(review): with this static state the function is presumably neither
// reentrant nor usable on two streams at once -- confirm.
// NOTE(review): the Segment is allocated with new; presumably the caller
// owns and deletes it -- confirm.
63 Segment::new_segment(MorphoStream &ms, TransferRules* tr, TaggerData &td) {
64 TaggerWord *word=NULL;
66 set<TTag>::iterator itag;
// Static state shared by all calls. index_start is the buffer position where
// the next segment begins.
69 static int index_start=1;
70 static deque<TaggerWord> wordsbuffer;
71 static bool first_call=true;
72 static bool end_of_corpus_reached=false;
// A word tagged TAG_SENT is appended to the buffer.
// NOTE(review): presumably done only on the first call, which would explain
// why index_start begins at 1 -- confirm.
77 eosword.add_tag(td.getTagIndex()["TAG_SENT"], "", td.getPreferRules());
78 wordsbuffer.push_back(eosword);
80 //Fill the buffer of words
81 while (wordsbuffer.size()<TAGGER_WORD_BUFFER_SIZE) {
82 word=ms.get_next_word();
// NOTE(review): presumably reached when get_next_word() yields no word -- confirm.
85 end_of_corpus_reached=true;
// The word is copied into the buffer.
89 wordsbuffer.push_back(*word);
// Diagnostic dump of the buffer contents to stderr.
// NOTE(review): presumably debug-only output -- confirm whether it is compiled in.
97 cerr<<"BUFFER (begining): ";
98 for (int i=0; i<wordsbuffer.size(); i++) {
99 cerr<<"["<<wordsbuffer[i].get_superficial_form()<<"] ";
102 cerr<<"Buffer size (begining): "<<wordsbuffer.size()<<"\n";
103 cerr<<"Index start (begining): "<<index_start<<"\n";
106 Segment* seg=new Segment();
// Running product of the number of candidate tags of each word in the segment.
107 int number_of_paths=1;
// Stays at -1 when no segmentation point is found in the buffer.
109 int segmentation_point=-1;
110 int advance; //Number of words that can be skipped when looking for a segmentation point
// Scan forward from index_start, asking the transfer rules whether position i
// is a segmentation point.
111 for(size_t i=index_start; i<wordsbuffer.size(); i++) {
112 if (tr->is_segmentation_point(tag_index["TAG_kEOF"], wordsbuffer, i, advance)) {
113 segmentation_point=i;
// No segmentation point inside the buffer although the corpus has not ended:
// report that the buffer is too short.
120 if ((segmentation_point==-1) && (!end_of_corpus_reached)) {
121 cerr<<"Error: No segmentation point was found.\n";
122 cerr<<"Try making the buffer longer, current maximum size is "<<TAGGER_WORD_BUFFER_SIZE<<"\n";
123 cerr<<"See Segment.H, TAGGER_WORD_BUFFER_SIZE constant\n";
127 //cerr<<"Segmentation point: "<<segmentation_point<<"\n";
129 //The segment to return is from index_start to segmentation_point
130 for(int i=index_start; i<=segmentation_point; i++) {
131 tags=wordsbuffer[i].get_tags();
// Start a new list of candidate tags for this word.
132 seg->contador_caminos.push_back(auxvec);
// The word's own tags become its candidates.
134 number_of_paths*=tags.size();
135 for(itag=tags.begin(); itag!=tags.end(); itag++)
136 seg->contador_caminos.back().push_back(*itag);
138 //seg->contador_caminos.back().push_back(-1); //Unknown word
// Alternative candidate set: the open-class tags from the tagger data.
// NOTE(review): presumably used when the word has no tags of its own
// (unknown word) -- confirm.
140 tags=td.getOpenClass();
141 number_of_paths*=tags.size();
143 for(itag=tags.begin(); itag!=tags.end(); itag++)
144 seg->contador_caminos.back().push_back(*itag);
// The word itself is copied into the segment.
146 seg->vwords.push_back(wordsbuffer[i]);
149 //Calculate which words can be removed from the buffer, we need some
150 //words before the segment being returned, more concretely, from the
151 //last non-ambiguous word until the first word of the segment being
// Walk backwards from the word just before the segment, looking for a word
// with exactly one tag.
153 int preserve_word_from=-1;
154 for (int i=(index_start-1); i>=0; i--) {
155 if (wordsbuffer[i].get_tags().size()==1) {
156 preserve_word_from=i;
161 //cerr<<"Preserve words from index: "<<preserve_word_from<<"\n";
// Drop the words in front of preserve_word_from; segmentation_point is
// shifted by one for every word removed so it keeps pointing at the same word.
// NOTE(review): index_start presumably needs the same shift -- confirm.
163 for(int i=0; i<preserve_word_from; i++) {
164 wordsbuffer.pop_front();
165 segmentation_point--;
// Diagnostic dump of the buffer after removing words.
170 cerr<<"BUFFER (after removing words): ";
171 for (int i=0; i<wordsbuffer.size(); i++) {
172 cerr<<"["<<wordsbuffer[i].get_superficial_form()<<"] ";
175 cerr<<"Buffer size (after removing words): "<<wordsbuffer.size()<<"\n";
176 cerr<<"Index start (after removing words): "<<index_start<<"\n";
177 cerr<<"Segmention point (after removing words): "<<segmentation_point<<"\n";
// Refill the buffer up to TAGGER_WORD_BUFFER_SIZE, unless the corpus has
// already ended.
181 if (!end_of_corpus_reached) {
182 while (wordsbuffer.size()<TAGGER_WORD_BUFFER_SIZE) {
183 word=ms.get_next_word();
186 end_of_corpus_reached=true;
190 wordsbuffer.push_back(*word);
// Diagnostic dump of the buffer after the refill.
196 cerr<<"BUFFER (after refill): ";
197 for (int i=0; i<wordsbuffer.size(); i++) {
198 cerr<<"["<<wordsbuffer[i].get_superficial_form()<<"] ";
201 cerr<<"Buffer size (after refill): "<<wordsbuffer.size()<<"\n";
202 cerr<<"Index start (after refill): "<<index_start<<"\n";
203 cerr<<"Segmention point (after refill): "<<segmentation_point<<"\n";
206 //Now we retrieve words before and after this segment, for the
207 //calculation of the alphas and betas in the pruning method
// Words before the segment: everything in the buffer ahead of index_start.
208 for (int i=0; i<index_start; i++)
209 seg->vwords_before.push_back(wordsbuffer[i]);
// Words after the segment: copied starting right after the segmentation
// point, checking each one for having exactly one tag.
// NOTE(review): the copy presumably stops at the first such word -- confirm.
211 bool found_forward=false;
212 for(size_t i=segmentation_point+1; i<wordsbuffer.size(); i++) {
213 seg->vwords_after.push_back(wordsbuffer[i]);
214 if (wordsbuffer[i].get_tags().size()==1) {
// No single-tag word was found after the segment. If the corpus has not
// ended, the buffer is reported as too short.
220 if (!found_forward) {
221 if (!end_of_corpus_reached) {
222 cerr<<"Error: No unambiguous word was found when looking fordward.\n";
223 cerr<<"Try making the buffer longer, current maximum size is "<<TAGGER_WORD_BUFFER_SIZE<<"\n";
224 cerr<<"See Segment.H, TAGGER_WORD_BUFFER_SIZE constant\n";
// A word tagged TAG_SENT is appended to the words after the segment.
// NOTE(review): presumably only when the corpus has ended -- confirm.
228 eosword.add_tag(td.getTagIndex()["TAG_SENT"], "", td.getPreferRules());
229 seg->vwords_after.push_back(eosword);
233 index_start=segmentation_point+1; //For the next search
235 //We have the total number of disambiguation paths for this segment.
236 //Now we initialize the structure used to retrieve all the paths in
237 //an efficient way. (nfijo_caminos = number of times a tag has to be
238 //used before moving on to the next one)
// For word i, fijo accumulates the product of the candidate-tag counts of
// all the words after it; get_path divides the path number by this value.
239 for(size_t i=0; i<seg->contador_caminos.size(); i++) {
241 for(size_t j=i+1; j<seg->contador_caminos.size(); j++) {
242 fijo*=seg->contador_caminos[j].size();
244 seg->nfijo_caminos.push_back(fijo);
// An empty segment means there is nothing left to return.
247 if(seg->vwords.size()==0) //That's all folks
250 seg->npaths=number_of_paths;