apertium-unicode/apertium/tagger_word.cc

   1 /*
   2  * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public License as
   6  * published by the Free Software Foundation; either version 2 of the
   7  * License, or (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful, but
  10  * WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  17  * 02111-1307, USA.
  18  */
  19 #include <apertium/tagger_word.h>
  20 #include <apertium/utf_converter.h>
  21 #include <apertium/string_utils.h>
  22
  23 using namespace Apertium;
  24
  // Tag names shared by all words; get_string_tags() indexes this vector
  // with each TTag value, and setArrayTags() replaces its contents.
  25 vector<wstring> TaggerWord::array_tags;
  26
  // When true, get_lexical_form() and get_all_choosen_tag_first() start
  // their output with the word's ignored_string.
  27 bool TaggerWord::show_ingnored_string=true;
  28
  // Regular expressions compiled by match(), keyed by the pattern text, so
  // that each distinct pattern is compiled only once.
  29 map<wstring, ApertiumRE, Ltstr> TaggerWord::patterns;
  30
  31 TaggerWord::TaggerWord(bool prev_plus_cut){
  32    ignored_string = L"";
  33    plus_cut=false;
  34    previous_plus_cut=prev_plus_cut;
  35 }
  36
  37 TaggerWord::TaggerWord(const TaggerWord &w){
  38   superficial_form = w.superficial_form;
  39   tags = w.tags;
  40   lexical_forms = w.lexical_forms;
  41   ignored_string = w.ignored_string;
  42   plus_cut = w.plus_cut;
  43   previous_plus_cut=w.previous_plus_cut;
  44 }
  45
  46 TaggerWord::~TaggerWord(){
  47 }
  48
  49 void
  50 TaggerWord::set_superficial_form(const wstring &sf){
  51   superficial_form = sf;
  52 }
  53
  54 wstring&
  55 TaggerWord::get_superficial_form() {
  56   return superficial_form;
  57 }
  58
  59 bool
  60 TaggerWord::match(wstring const &s, wstring const &pattern)
  61 {
  62   map<wstring, ApertiumRE, Ltstr>::iterator it = patterns.find(pattern);
  63   string const utfs = UtfConverter::toUtf8(s);
  64
  65   if(it == patterns.end())
  66   {
  67     string utfpattern = UtfConverter::toUtf8(pattern);
  68     string regexp = "";
  69
  70     while(true)
  71     {
  72       size_t pos = utfpattern.find("<*>");
  73       if(pos == string::npos)
  74       {
  75         break;
  76       }
  77       utfpattern.replace(pos, 3, "(<[^>]+>)+");
  78     }
  79     patterns[pattern].compile(utfpattern);
  80     return patterns[pattern].match(utfs) != "";
  81   }
  82   else
  83   {
  84     return it->second.match(utfs) != "";
  85   }
  86 }
  87
  88 void
  89 TaggerWord::add_tag(TTag &t, const wstring &lf, vector<wstring> const &prefer_rules){
  90
  91   //Tag is added only is it is not present yet
  92   //Sometime one word can have more than one lexical form assigned to the same tag
  93   if (tags.find(t)==tags.end()) {
  94     tags.insert(t);
  95     lexical_forms[t]=lf;
  96   } else {
  97     //Take a look at the prefer rules
  98     for(int i=0; i < (int) prefer_rules.size(); i++)
  99     {
 100       if (match(lf, prefer_rules[i]))
 101       {
 102         lexical_forms[t]=lf;
 103         break;
 104       }
 105     }
 106   }
 107 }
 108
 109 set<TTag>&
 110 TaggerWord::get_tags() {
 111   return tags;
 112 }
 113
 114 wstring
 115 TaggerWord::get_string_tags() {
 116   wstring st;
 117   set<TTag>::iterator itag;
 118
 119   st=L"{";
 120   for(itag=tags.begin(); itag!=tags.end(); itag++) {
 121     if (itag!=tags.begin())
 122       st+=L',';
 123     st+=array_tags[*itag];
 124   }
 125   st += L'}';
 126
 127   return st;
 128 }
 129
 130 wstring
 131 TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) {
 132   wstring ret= L"";
 133
 134   if (show_ingnored_string)
 135     ret.append(ignored_string);
 136
 137   if(t==TAG_kEOF)
 138     return ret;
 139
 140   if (!previous_plus_cut)
 141     ret+=L'^';
 142
 143   if (lexical_forms.size()==0) { // This is an UNKNOWN WORD
 144     ret +=L'*';
 145     ret.append(superficial_form);
 146   } else if ((*lexical_forms.begin()).second[0]==L'*') { //This is an
 147                                                         //unknown word
 148                                                         //that has
 149                                                         //been guessed
 150     ret += L'*';
 151     ret.append(superficial_form);
 152   } else if (lexical_forms.size()>1) {  //This is an ambiguous word
 153     ret.append(lexical_forms[t]);
 154   } else {
 155     ret.append(lexical_forms[t]);
 156   }
 157
 158   if (ret != ignored_string) {
 159     if (plus_cut)
 160       ret+=L'+';
 161     else {
 162       ret += L'$';
 163     }
 164   }
 165
 166
 167   //if ((superficial_form.length()>0)&&(superficial_form[superficial_form.length()-1]=='\''))
 168   //   //Si la forma superficial termina en apostrofo metemos un espacio en blanco tras la cadena '/$'
 169   //   //o '/'. De no hacerlo en la traducción aparecerán dos palabras sin blanco alguno.
 170   //   ret+=" "; //Quizá este no sea el sitio apropiado para hacer esto, lo suyo sería un módulo
 171   //             //antes del tagger o del anmor.
 172
 173   return ret;
 174 }
 175
 176 wstring
 177 TaggerWord::get_all_choosen_tag_first(TTag &t, int const TAG_kEOF) {
 178   wstring ret=L"";
 179
 180   if (show_ingnored_string)
 181     ret.append(ignored_string);
 182
 183   if(t==TAG_kEOF)
 184     return ret;
 185
 186   if (!previous_plus_cut)
 187     ret+=L"^";
 188
 189   ret.append(superficial_form);
 190
 191   if (lexical_forms.size()==0) { // This is an UNKNOWN WORD
 192     ret+=L"/*";
 193     ret.append(superficial_form);
 194   } else {
 195     ret+=L"/";
 196     ret.append(lexical_forms[t]);
 197     if (lexical_forms.size()>1) {
 198       set<TTag>::iterator it;
 199       for (it=tags.begin(); it!=tags.end(); it++) {
 200         if (*it != t) {
 201           ret+=L"/";
 202           ret.append(lexical_forms[*it]);
 203         }
 204       }
 205     }
 206   }
 207
 208   if (ret != ignored_string) {
 209     if (plus_cut)
 210       ret+=L"+";
 211     else {
 212       ret+=L"$";
 213     }
 214   }
 215
 216   return ret;
 217 }
 218
 219 //OBSOLETE
 220 wstring
 221 TaggerWord::get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF) {
 222   wstring ret;
 223
 224   if(t==TAG_kEOF)
 225      return ret;
 226
 227   if (lexical_forms.size()==0) { //This is an unknown word
 228       ret.append(L"*^");
 229       ret.append(superficial_form);
 230   } else if ((*lexical_forms.begin()).second[0]=='*') {  //This is an unknown word that has been guessed
 231     ret.append(L"*^");
 232     ret.append(superficial_form);
 233   } else {
 234     ret += L'^';
 235     ret.append(lexical_forms[t]);
 236   }
 237
 238   if (ret.length() != 0) {
 239     if (plus_cut)
 240       ret+=L'+';
 241     else {
 242       ret +=L'$';
 243     }
 244   }
 245
 246   return ret;
 247 }
 248
 249 void
 250 TaggerWord::add_ignored_string(wstring const &s) {
 251   ignored_string.append(s);
 252 }
 253
 254 void
 255 TaggerWord::set_plus_cut(const bool &c) {
 256   plus_cut=c;
 257 }
 258
 259 bool
 260 TaggerWord::get_plus_cut() {
 261   return plus_cut;
 262 }
 263
 264 wostream&
 265 operator<< (wostream& os, TaggerWord &w) {
 266   os<<w.get_string_tags()<< L" \t Word: " << w.get_superficial_form();
 267   return os;
 268 }
 269
 270 void
 271 TaggerWord::setArrayTags(vector<wstring> const &at)
 272 {
 273   array_tags = at;
 274 }
 275
 276 void
 277 TaggerWord::print()
 278 {
 279   wcout << L"[#" << superficial_form << L"# ";
 280   for(set<TTag>::iterator it=tags.begin(), limit = tags.end(); it != limit; it++)
 281   {
 282     wcout << L"(" << *it << L" " << lexical_forms[*it] << L") ";
 283   }
 284   wcout << L"\b]\n";
 285 }
 286
 287 void
 288 TaggerWord::outputOriginal(FILE *output) {
 289
 290   wstring s=superficial_form;
 291
 292   map<TTag, wstring>::iterator it;
 293   for(it=lexical_forms.begin(); it!=lexical_forms.end(); it++) {
 294     if (it->second.length()>0)
 295     {
 296       s+=L'/';
 297       s.append(it->second);
 298     }
 299   }
 300
 301   if (s.length()>0)
 302   {
 303     s=L"^"+s+L"$\n";
 304   }
 305
 306   fputws_unlocked(s.c_str(), output);
 307 }