Poc més
[apertium.git] / apertium-unicode / apertium / tagger_word.cc
blob58214ab67a60adc0e3350e057bc02d6f267c8aba
/*
 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */
19 #include <apertium/tagger_word.h>
20 #include <apertium/utf_converter.h>
21 #include <apertium/string_utils.h>
23 using namespace Apertium;
25 vector<wstring> TaggerWord::array_tags;
27 bool TaggerWord::show_ingnored_string=true;
29 map<wstring, ApertiumRE, Ltstr> TaggerWord::patterns;
31 TaggerWord::TaggerWord(bool prev_plus_cut){
32 ignored_string = L"";
33 plus_cut=false;
34 previous_plus_cut=prev_plus_cut;
37 TaggerWord::TaggerWord(const TaggerWord &w){
38 superficial_form = w.superficial_form;
39 tags = w.tags;
40 lexical_forms = w.lexical_forms;
41 ignored_string = w.ignored_string;
42 plus_cut = w.plus_cut;
43 previous_plus_cut=w.previous_plus_cut;
46 TaggerWord::~TaggerWord(){
49 void
50 TaggerWord::set_superficial_form(const wstring &sf){
51 superficial_form = sf;
54 wstring&
55 TaggerWord::get_superficial_form() {
56 return superficial_form;
59 bool
60 TaggerWord::match(wstring const &s, wstring const &pattern)
62 map<wstring, ApertiumRE, Ltstr>::iterator it = patterns.find(pattern);
63 string const utfs = UtfConverter::toUtf8(s);
65 if(it == patterns.end())
67 string utfpattern = UtfConverter::toUtf8(pattern);
68 string regexp = "";
70 while(true)
72 size_t pos = utfpattern.find("<*>");
73 if(pos == string::npos)
75 break;
77 utfpattern.replace(pos, 3, "(<[^>]+>)+");
79 patterns[pattern].compile(utfpattern);
80 return patterns[pattern].match(utfs) != "";
82 else
84 return it->second.match(utfs) != "";
88 void
89 TaggerWord::add_tag(TTag &t, const wstring &lf, vector<wstring> const &prefer_rules){
91 //Tag is added only is it is not present yet
92 //Sometime one word can have more than one lexical form assigned to the same tag
93 if (tags.find(t)==tags.end()) {
94 tags.insert(t);
95 lexical_forms[t]=lf;
96 } else {
97 //Take a look at the prefer rules
98 for(int i=0; i < (int) prefer_rules.size(); i++)
100 if (match(lf, prefer_rules[i]))
102 lexical_forms[t]=lf;
103 break;
109 set<TTag>&
110 TaggerWord::get_tags() {
111 return tags;
114 wstring
115 TaggerWord::get_string_tags() {
116 wstring st;
117 set<TTag>::iterator itag;
119 st=L"{";
120 for(itag=tags.begin(); itag!=tags.end(); itag++) {
121 if (itag!=tags.begin())
122 st+=L',';
123 st+=array_tags[*itag];
125 st += L'}';
127 return st;
130 wstring
131 TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) {
132 wstring ret= L"";
134 if (show_ingnored_string)
135 ret.append(ignored_string);
137 if(t==TAG_kEOF)
138 return ret;
140 if (!previous_plus_cut)
141 ret+=L'^';
143 if (lexical_forms.size()==0) { // This is an UNKNOWN WORD
144 ret +=L'*';
145 ret.append(superficial_form);
146 } else if ((*lexical_forms.begin()).second[0]==L'*') { //This is an
147 //unknown word
148 //that has
149 //been guessed
150 ret += L'*';
151 ret.append(superficial_form);
152 } else if (lexical_forms.size()>1) { //This is an ambiguous word
153 ret.append(lexical_forms[t]);
154 } else {
155 ret.append(lexical_forms[t]);
158 if (ret != ignored_string) {
159 if (plus_cut)
160 ret+=L'+';
161 else {
162 ret += L'$';
167 //if ((superficial_form.length()>0)&&(superficial_form[superficial_form.length()-1]=='\''))
168 // //Si la forma superficial termina en apostrofo metemos un espacio en blanco tras la cadena '/$'
169 // //o '/'. De no hacerlo en la traducción aparecerán dos palabras sin blanco alguno.
170 // ret+=" "; //Quizá este no sea el sitio apropiado para hacer esto, lo suyo sería un módulo
171 // //antes del tagger o del anmor.
173 return ret;
176 wstring
177 TaggerWord::get_all_choosen_tag_first(TTag &t, int const TAG_kEOF) {
178 wstring ret=L"";
180 if (show_ingnored_string)
181 ret.append(ignored_string);
183 if(t==TAG_kEOF)
184 return ret;
186 if (!previous_plus_cut)
187 ret+=L"^";
189 ret.append(superficial_form);
191 if (lexical_forms.size()==0) { // This is an UNKNOWN WORD
192 ret+=L"/*";
193 ret.append(superficial_form);
194 } else {
195 ret+=L"/";
196 ret.append(lexical_forms[t]);
197 if (lexical_forms.size()>1) {
198 set<TTag>::iterator it;
199 for (it=tags.begin(); it!=tags.end(); it++) {
200 if (*it != t) {
201 ret+=L"/";
202 ret.append(lexical_forms[*it]);
208 if (ret != ignored_string) {
209 if (plus_cut)
210 ret+=L"+";
211 else {
212 ret+=L"$";
216 return ret;
219 //OBSOLETE
220 wstring
221 TaggerWord::get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF) {
222 wstring ret;
224 if(t==TAG_kEOF)
225 return ret;
227 if (lexical_forms.size()==0) { //This is an unknown word
228 ret.append(L"*^");
229 ret.append(superficial_form);
230 } else if ((*lexical_forms.begin()).second[0]=='*') { //This is an unknown word that has been guessed
231 ret.append(L"*^");
232 ret.append(superficial_form);
233 } else {
234 ret += L'^';
235 ret.append(lexical_forms[t]);
238 if (ret.length() != 0) {
239 if (plus_cut)
240 ret+=L'+';
241 else {
242 ret +=L'$';
246 return ret;
249 void
250 TaggerWord::add_ignored_string(wstring const &s) {
251 ignored_string.append(s);
254 void
255 TaggerWord::set_plus_cut(const bool &c) {
256 plus_cut=c;
259 bool
260 TaggerWord::get_plus_cut() {
261 return plus_cut;
264 wostream&
265 operator<< (wostream& os, TaggerWord &w) {
266 os<<w.get_string_tags()<< L" \t Word: " << w.get_superficial_form();
267 return os;
270 void
271 TaggerWord::setArrayTags(vector<wstring> const &at)
273 array_tags = at;
276 void
277 TaggerWord::print()
279 wcout << L"[#" << superficial_form << L"# ";
280 for(set<TTag>::iterator it=tags.begin(), limit = tags.end(); it != limit; it++)
282 wcout << L"(" << *it << L" " << lexical_forms[*it] << L") ";
284 wcout << L"\b]\n";
287 void
288 TaggerWord::outputOriginal(FILE *output) {
290 wstring s=superficial_form;
292 map<TTag, wstring>::iterator it;
293 for(it=lexical_forms.begin(); it!=lexical_forms.end(); it++) {
294 if (it->second.length()>0)
296 s+=L'/';
297 s.append(it->second);
301 if (s.length()>0)
303 s=L"^"+s+L"$\n";
306 fputws_unlocked(s.c_str(), output);