apertium-tagger-training-tools/src/Segment.C

   1 /*
   2  * Copyright (C) 2004-2006 Felipe Sánchez-Martínez
   3  * Copyright (C) 2006 Universitat d'Alacant
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 /**
   21  * Class Segment. It defines a text segment (source file)
  22  *
  23  *  @author   Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es
  24  */
  25
  26 #include "Segment.H"
  27
  28 #include <deque>
  29
  30 map<string, TTag> Segment::tag_index;
  31
  32 Segment::Segment() {
  33   npaths=0;
  34 }
  35
  36 int
  37 Segment::get_number_paths() {
  38   return npaths;
  39 }
  40
  41 void
  42 Segment::set_tag_index(map<string, TTag> ti) {
  43   tag_index=ti;
  44 }
  45
  46 string
  47 Segment::get_path(vector <TTag>& etqpart, int path) {
  48   etqpart.clear();
  49   string s="";
  50
  51   for(size_t i=0; i<contador_caminos.size(); i++) {
  52     int tag_position=((int)(path/nfijo_caminos[i]))%contador_caminos[i].size();
  53     s+=vwords[i].get_lexical_form(contador_caminos[i][tag_position], tag_index["TAG_kEOF"]);
  54     if ((!vwords[i].get_plus_cut()) && (i<(contador_caminos.size()-1)))
  55       s+=" ";
  56     etqpart.push_back(contador_caminos[i][tag_position]);
  57   }
  58
  59   return s;
  60 }
  61
  62 Segment*
  63 Segment::new_segment(MorphoStream &ms, TransferRules* tr,  TaggerData &td) {
  64   TaggerWord *word=NULL;
  65   set<TTag> tags;
  66   set<TTag>::iterator itag;
  67   vector<TTag> auxvec;
  68
  69   static int index_start=1;
  70   static deque<TaggerWord> wordsbuffer;
  71   static bool first_call=true;
  72   static bool end_of_corpus_reached=false;
  73
  74
  75   if (first_call) {
  76     TaggerWord eosword;
  77     eosword.add_tag(td.getTagIndex()["TAG_SENT"], "", td.getPreferRules());
  78     wordsbuffer.push_back(eosword);
  79
  80     //Fill the buffer of words
  81     while (wordsbuffer.size()<TAGGER_WORD_BUFFER_SIZE) {
  82       word=ms.get_next_word();
  83
  84       if(word==NULL) {
  85         end_of_corpus_reached=true;
  86         break;
  87       }
  88
  89       wordsbuffer.push_back(*word);
  90       delete word;
  91     }
  92
  93     first_call=false;
  94   }
  95
  96   /*
  97     cerr<<"BUFFER (begining): ";
  98     for (int i=0; i<wordsbuffer.size(); i++) {
  99     cerr<<"["<<wordsbuffer[i].get_superficial_form()<<"] ";
 100     }
 101     cerr<<"\n";
 102     cerr<<"Buffer size (begining): "<<wordsbuffer.size()<<"\n";
 103     cerr<<"Index start (begining): "<<index_start<<"\n";
 104   */
 105
 106   Segment* seg=new Segment();
 107   int number_of_paths=1;
 108
 109   int segmentation_point=-1;
 110   int advance; //Number of word that can be skipped when looking for a segmentation point
 111   for(size_t i=index_start; i<wordsbuffer.size(); i++) {
 112     if (tr->is_segmentation_point(tag_index["TAG_kEOF"], wordsbuffer, i, advance)) {
 113       segmentation_point=i;
 114       break;
 115     } else{
 116       i+=advance;
 117     }
 118   }
 119
 120   if ((segmentation_point==-1) && (!end_of_corpus_reached)) {
 121     cerr<<"Error: No segmentation point was found.\n";
 122     cerr<<"Try making the buffer longer, current maximum size is "<<TAGGER_WORD_BUFFER_SIZE<<"\n";
 123     cerr<<"See Segment.H, TAGGER_WORD_BUFFER_SIZE constant\n";
 124     exit(EXIT_FAILURE);
 125   }
 126
 127   //cerr<<"Segmentation point: "<<segmentation_point<<"\n";
 128
 129   //The segment to return is from index_start to segmentation_point
 130   for(int i=index_start; i<=segmentation_point; i++) {
 131     tags=wordsbuffer[i].get_tags();
 132     seg->contador_caminos.push_back(auxvec);
 133     if (tags.size()>0) {
 134       number_of_paths*=tags.size();
 135       for(itag=tags.begin(); itag!=tags.end(); itag++)
 136         seg->contador_caminos.back().push_back(*itag);
 137     } else {
 138       //seg->contador_caminos.back().push_back(-1); //Palabra desconocida
 139
 140       tags=td.getOpenClass();
 141       number_of_paths*=tags.size();
 142
 143       for(itag=tags.begin(); itag!=tags.end(); itag++)
 144         seg->contador_caminos.back().push_back(*itag);
 145     }
 146     seg->vwords.push_back(wordsbuffer[i]);
 147   }
 148
 149   //Calculate which words can be removed from the buffer, we need some
 150   //words before the segment being return, more concretely, from the
 151   //last non-ambiguous word until the first word of the segment being
 152   //returned
 153   int preserve_word_from=-1;
 154   for (int i=(index_start-1); i>=0; i--) {
 155     if (wordsbuffer[i].get_tags().size()==1) {
 156       preserve_word_from=i;
 157       break;
 158     }
 159   }
 160
 161   //cerr<<"Preserve words from index: "<<preserve_word_from<<"\n";
 162
 163   for(int i=0; i<preserve_word_from; i++) {
 164     wordsbuffer.pop_front();
 165     segmentation_point--;
 166     index_start--;
 167   }
 168
 169   /*
 170     cerr<<"BUFFER (after removing words): ";
 171     for (int i=0; i<wordsbuffer.size(); i++) {
 172     cerr<<"["<<wordsbuffer[i].get_superficial_form()<<"] ";
 173     }
 174     cerr<<"\n";
 175     cerr<<"Buffer size (after removing words): "<<wordsbuffer.size()<<"\n";
 176     cerr<<"Index start (after removing words): "<<index_start<<"\n";
 177     cerr<<"Segmention point (after removing words): "<<segmentation_point<<"\n";
 178   */
 179
 180   //Refill the buffer
 181   if (!end_of_corpus_reached) {
 182     while (wordsbuffer.size()<TAGGER_WORD_BUFFER_SIZE) {
 183       word=ms.get_next_word();
 184
 185       if(word==NULL) {
 186         end_of_corpus_reached=true;
 187         break;
 188       }
 189
 190       wordsbuffer.push_back(*word);
 191       delete word;
 192     }
 193   }
 194
 195   /*
 196     cerr<<"BUFFER (after refill): ";
 197     for (int i=0; i<wordsbuffer.size(); i++) {
 198     cerr<<"["<<wordsbuffer[i].get_superficial_form()<<"] ";
 199     }
 200     cerr<<"\n";
 201     cerr<<"Buffer size (after refill): "<<wordsbuffer.size()<<"\n";
 202     cerr<<"Index start (after refill): "<<index_start<<"\n";
 203     cerr<<"Segmention point (after refill): "<<segmentation_point<<"\n";
 204   */
 205
 206   //Now we retrieve words before and after this segment, for the
 207   //calculus of the alphas and betas in the pruning method
 208   for (int i=0; i<index_start; i++)
 209     seg->vwords_before.push_back(wordsbuffer[i]);
 210
 211   bool found_forward=false;
 212   for(size_t i=segmentation_point+1; i<wordsbuffer.size(); i++) {
 213     seg->vwords_after.push_back(wordsbuffer[i]);
 214     if (wordsbuffer[i].get_tags().size()==1) {
 215       found_forward=true;
 216       break;
 217     }
 218   }
 219
 220   if (!found_forward) {
 221     if (!end_of_corpus_reached) {
 222       cerr<<"Error: No unambiguous word was found when looking fordward.\n";
 223       cerr<<"Try making the buffer longer, current maximum size is "<<TAGGER_WORD_BUFFER_SIZE<<"\n";
 224       cerr<<"See Segment.H, TAGGER_WORD_BUFFER_SIZE constant\n";
 225       exit(EXIT_FAILURE);
 226     }  else {
 227       TaggerWord eosword;
 228       eosword.add_tag(td.getTagIndex()["TAG_SENT"], "", td.getPreferRules());
 229       seg->vwords_after.push_back(eosword);
 230     }
 231   }
 232
 233   index_start=segmentation_point+1; //For the next search
 234
 235   //We have the total number of disambiguation paths for this segment.
 236   //Now we initialize the structure used to retrieve all the paths in
 237   //an efficient way. (nfijos_caminos = nº de veces que se tiene que
 238   //usar una etiqueta antes de pasar a la siguiente)
 239   for(size_t i=0; i<seg->contador_caminos.size(); i++) {
 240     int fijo=1;
 241     for(size_t j=i+1; j<seg->contador_caminos.size(); j++) {
 242       fijo*=seg->contador_caminos[j].size();
 243     }
 244     seg->nfijo_caminos.push_back(fijo);
 245   }
 246
 247   if(seg->vwords.size()==0)    //That's all folks
 248     seg->npaths=0;
 249   else
 250     seg->npaths=number_of_paths;
 251
 252   return seg;
 253 }