2 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 #include <apertium/tagger_word.h>
20 #include <apertium/utf_converter.h>
21 #include <apertium/string_utils.h>
// Bring the Apertium namespace into scope for the definitions below.
23 using namespace Apertium
;
// Definition of the static member array_tags; get_string_tags() indexes it
// with each tag to obtain that tag's text.
25 vector
<wstring
> TaggerWord::array_tags
;
// Static flag, initially true. When set, the output-building methods start
// their result with ignored_string.
27 bool TaggerWord::show_ingnored_string
=true;
// Static table of compiled regular expressions keyed by pattern text;
// match() fills it on demand so each pattern is compiled once.
29 map
<wstring
, ApertiumRE
, Ltstr
> TaggerWord::patterns
;
// Constructor: records prev_plus_cut in the previous_plus_cut member.
// NOTE(review): the embedded numbering jumps from 31 to 34, so any other
// member initialisation performed here is not visible in this listing.
31 TaggerWord::TaggerWord(bool prev_plus_cut
){
34 previous_plus_cut
=prev_plus_cut
;
// Copy constructor: copies superficial_form, lexical_forms, ignored_string,
// plus_cut and previous_plus_cut from w.
// NOTE(review): embedded line 39 is missing from this listing; presumably it
// copies a further member (e.g. the tag set) -- confirm against the full file.
37 TaggerWord::TaggerWord(const TaggerWord
&w
){
38 superficial_form
= w
.superficial_form
;
40 lexical_forms
= w
.lexical_forms
;
41 ignored_string
= w
.ignored_string
;
42 plus_cut
= w
.plus_cut
;
43 previous_plus_cut
=w
.previous_plus_cut
;
// Destructor; no body statements are visible in this listing.
46 TaggerWord::~TaggerWord(){
// Replaces the stored superficial form of the word with sf.
50 TaggerWord::set_superficial_form(const wstring
&sf
){
51 superficial_form
= sf
;
// Returns the superficial_form member.
// NOTE(review): the return type is not visible in this listing, so whether
// callers get a copy or a reference cannot be told from here.
55 TaggerWord::get_superficial_form() {
56 return superficial_form
;
// Tests the string s against the rule text in pattern.
// The compiled expression is looked up in the static patterns table; on a
// miss the pattern is converted to UTF-8, its "<*>" wildcard is rewritten to
// the expression "(<[^>]+>)+", and the result is compiled and stored under
// patterns[pattern]. The returned value is whether the expression's match()
// on the UTF-8 form of s is a non-empty string.
// NOTE(review): the embedded numbering skips lines between the find and the
// replace; presumably a loop rewrites every "<*>" occurrence -- confirm.
60 TaggerWord::match(wstring
const &s
, wstring
const &pattern
)
62 map
<wstring
, ApertiumRE
, Ltstr
>::iterator it
= patterns
.find(pattern
);
63 string
const utfs
= UtfConverter::toUtf8(s
);
// Cache miss: build and store the compiled expression for this pattern.
65 if(it
== patterns
.end())
67 string utfpattern
= UtfConverter::toUtf8(pattern
);
72 size_t pos
= utfpattern
.find("<*>");
// npos means no wildcard is left to rewrite.
73 if(pos
== string::npos
)
// "<*>" is 3 characters long, hence the replaced length of 3.
77 utfpattern
.replace(pos
, 3, "(<[^>]+>)+");
79 patterns
[pattern
].compile(utfpattern
);
80 return patterns
[pattern
].match(utfs
) != "";
// Cache hit: reuse the previously compiled expression.
84 return it
->second
.match(utfs
) != "";
// Registers tag t with lexical form lf for this word, consulting
// prefer_rules via match(lf, rule).
// NOTE(review): the statements executed when the tag is new, and when a
// prefer rule matches, are not visible in this listing.
89 TaggerWord::add_tag(TTag
&t
, const wstring
&lf
, vector
<wstring
> const &prefer_rules
){
91 //Tag is added only if it is not present yet
92 //Sometimes one word can have more than one lexical form assigned to the same tag
93 if (tags
.find(t
)==tags
.end()) {
97 //Take a look at the prefer rules
98 for(int i
=0; i
< (int) prefer_rules
.size(); i
++)
100 if (match(lf
, prefer_rules
[i
]))
// Accessor for the word's tags.
// NOTE(review): only the header is visible in this listing; presumably it
// returns the tags member -- confirm against the full file.
110 TaggerWord::get_tags() {
// Builds a textual list of this word's tags by walking the tags set in
// iteration order and appending array_tags[tag] for each one to st.
// NOTE(review): the declaration of st, the statement executed between
// elements, and the return are not visible in this listing.
115 TaggerWord::get_string_tags() {
117 set
<TTag
>::iterator itag
;
120 for(itag
=tags
.begin(); itag
!=tags
.end(); itag
++) {
// True for every element except the first; presumably emits a separator.
121 if (itag
!=tags
.begin())
123 st
+=array_tags
[*itag
];
// Builds the output text for this word given the selected tag t.
// Visible steps: start with ignored_string when show_ingnored_string is set,
// then append superficial_form (when there are no lexical forms, or the first
// lexical form begins with L'*') or otherwise lexical_forms[t].
// NOTE(review): the embedded numbering skips lines, so the delimiters that are
// emitted, the use of TAG_kEOF and the return statement are not visible here.
131 TaggerWord::get_lexical_form(TTag
&t
, int const TAG_kEOF
) {
134 if (show_ingnored_string
)
135 ret
.append(ignored_string
);
// The statement guarded by this test is not visible in this listing.
140 if (!previous_plus_cut
)
143 if (lexical_forms
.size()==0) { // This is an UNKNOWN WORD
145 ret
.append(superficial_form
);
146 } else if ((*lexical_forms
.begin()).second
[0]==L
'*') { //This is an unknown word that has been guessed
151 ret
.append(superficial_form
);
152 } else if (lexical_forms
.size()>1) { //This is an ambiguous word
153 ret
.append(lexical_forms
[t
]);
155 ret
.append(lexical_forms
[t
]);
// Further handling applies only when something besides ignored_string was appended.
158 if (ret
!= ignored_string
) {
167 //if ((superficial_form.length()>0)&&(superficial_form[superficial_form.length()-1]=='\''))
168 // //If the superficial form ends in an apostrophe, add a blank space after the string '/$'
169 // //or '/'. Otherwise two words would appear in the translation with no blank between them.
170 // ret+=" "; //Perhaps this is not the right place to do this; it would be better done in a module
171 // //placed before the tagger or the anmor.
// Builds an output text that contains the superficial form followed by the
// lexical form for the chosen tag t and then lexical forms for the tags in
// the tags set.
// Visible steps: start with ignored_string when show_ingnored_string is set,
// append superficial_form, then append superficial_form again when there are
// no lexical forms, or lexical_forms[t] followed (when more than one lexical
// form exists) by lexical_forms[tag] for tags in the set.
// NOTE(review): embedded lines between 199 and 202 are missing; presumably
// they skip the already-emitted tag t -- confirm. Delimiters, the use of
// TAG_kEOF and the return statement are likewise not visible.
177 TaggerWord::get_all_choosen_tag_first(TTag
&t
, int const TAG_kEOF
) {
180 if (show_ingnored_string
)
181 ret
.append(ignored_string
);
// The statement guarded by this test is not visible in this listing.
186 if (!previous_plus_cut
)
189 ret
.append(superficial_form
);
191 if (lexical_forms
.size()==0) { // This is an UNKNOWN WORD
193 ret
.append(superficial_form
);
// The chosen tag's lexical form is appended before those of the other tags.
196 ret
.append(lexical_forms
[t
]);
197 if (lexical_forms
.size()>1) {
198 set
<TTag
>::iterator it
;
199 for (it
=tags
.begin(); it
!=tags
.end(); it
++) {
202 ret
.append(lexical_forms
[*it
]);
// Further handling applies only when something besides ignored_string was appended.
208 if (ret
!= ignored_string
) {
// Variant of the lexical-form builder that never prepends ignored_string.
// Visible steps: append superficial_form when there are no lexical forms or
// when the first lexical form begins with '*'; otherwise append
// lexical_forms[t]. Extra handling then applies only if ret is non-empty.
// NOTE(review): delimiters, the use of TAG_kEOF and the return statement are
// not visible in this listing.
221 TaggerWord::get_lexical_form_without_ignored_string(TTag
&t
, int const TAG_kEOF
) {
227 if (lexical_forms
.size()==0) { //This is an unknown word
229 ret
.append(superficial_form
);
230 } else if ((*lexical_forms
.begin()).second
[0]=='*') { //This is an unknown word that has been guessed
232 ret
.append(superficial_form
);
235 ret
.append(lexical_forms
[t
]);
238 if (ret
.length() != 0) {
// Appends s to the ignored_string member; earlier content is kept, so
// repeated calls accumulate.
250 TaggerWord::add_ignored_string(wstring
const &s
) {
251 ignored_string
.append(s
);
// Setter taking the new flag value c by const reference.
// NOTE(review): the body is not visible in this listing; presumably it
// assigns c to the plus_cut member -- confirm.
255 TaggerWord::set_plus_cut(const bool &c
) {
// Getter counterpart of set_plus_cut.
// NOTE(review): the body is not visible in this listing; presumably it
// returns the plus_cut member -- confirm.
260 TaggerWord::get_plus_cut() {
// Stream insertion for TaggerWord: writes the word's tag list as produced by
// get_string_tags(), then the literal L" \t Word: ", then the superficial form.
// w is taken by non-const reference because it calls non-const accessors.
265 operator<< (wostream
& os
, TaggerWord
&w
) {
266 os
<<w
.get_string_tags()<< L
" \t Word: " << w
.get_superficial_form();
// Receives the tag-name table at.
// NOTE(review): the body is not visible in this listing; presumably it
// assigns at to the static array_tags member -- confirm.
271 TaggerWord::setArrayTags(vector
<wstring
> const &at
)
// Debug-style dump to wcout: prints "[#<superficial_form># " followed, for
// every tag in the tags set, by "(<tag> <lexical form for that tag>) ".
// NOTE(review): the header of the enclosing function and its closing output
// are not visible in this listing.
279 wcout
<< L
"[#" << superficial_form
<< L
"# ";
// The end iterator is captured once in limit instead of being re-read each pass.
280 for(set
<TTag
>::iterator it
=tags
.begin(), limit
= tags
.end(); it
!= limit
; it
++)
282 wcout
<< L
"(" << *it
<< L
" " << lexical_forms
[*it
] << L
") ";
// Writes this word to output as a wide string: the text starts with
// superficial_form and has every non-empty lexical form appended to it, in
// the iteration order of lexical_forms, before being written with
// fputws_unlocked.
// NOTE(review): the embedded numbering skips lines, so the separators and
// delimiters added around the forms are not visible in this listing.
288 TaggerWord::outputOriginal(FILE *output
) {
290 wstring s
=superficial_form
;
292 map
<TTag
, wstring
>::iterator it
;
293 for(it
=lexical_forms
.begin(); it
!=lexical_forms
.end(); it
++) {
// Empty lexical forms are skipped.
294 if (it
->second
.length()>0)
297 s
.append(it
->second
);
306 fputws_unlocked(s
.c_str(), output
);