2 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 * Word class and MorphoStream class definitions
22 * @author Felipe Sánchez-Martínez
25 #include <apertium/morpho_stream.h>
26 #include <apertium/constant_manager.h>
28 #include <apertium/string_utils.h>
30 using namespace Apertium
;
// Constructor. Takes an input C stream (ftxt), a boolean flag (d) and a
// pointer to the tagger data (t), and caches the objects and integer codes
// that the reading/classifying methods use: the pattern-match executor, the
// alphabet, the constant codes and the indices of the EOF/undefined tags.
// NOTE(review): the statements that store ftxt, d and t into members are not
// visible in this view; `td` is used below, so it is presumably assigned from
// `t` before these lines -- confirm against the full file.
31 MorphoStream::MorphoStream(FILE *ftxt
, bool d
, TaggerData
*t
)
// Match executor created by the pattern list held in the tagger data.
// NOTE(review): ownership of the object returned by newMatchExe() cannot be
// determined from here -- confirm who releases it.
36 me
= td
->getPatternList().newMatchExe();
// Alphabet taken from the same pattern list; it is called below to obtain
// integer codes.
37 alphabet
= td
->getPatternList().getAlphabet();
// Codes the alphabet assigns to PatternList::ANY_CHAR and PatternList::ANY_TAG.
39 ca_any_char
= alphabet(PatternList::ANY_CHAR
);
40 ca_any_tag
= alphabet(PatternList::ANY_TAG
);
// Look up each named constant once and keep its value in a member.
42 ConstantManager
&constants
= td
->getConstants();
43 ca_kignorar
= constants
.getConstant(L
"kIGNORAR");
44 ca_kbarra
= constants
.getConstant(L
"kBARRA");
45 ca_kdollar
= constants
.getConstant(L
"kDOLLAR");
46 ca_kbegin
= constants
.getConstant(L
"kBEGIN");
47 ca_kmot
= constants
.getConstant(L
"kMOT");
48 ca_kmas
= constants
.getConstant(L
"kMAS");
49 ca_kunknown
= constants
.getConstant(L
"kUNKNOWN");
// Indices of the two special tags, read from the tag index of the tagger data.
// NOTE(review): tag_index is a non-const reference indexed with operator[];
// if `map` is std::map, a missing key would be inserted with value 0 rather
// than reported -- confirm both keys are always present.
51 map
<wstring
, int, Ltstr
> &tag_index
= td
->getTagIndex();
52 ca_tag_keof
= tag_index
[L
"TAG_kEOF"];
53 ca_tag_kundef
= tag_index
[L
"TAG_kUNDEF"];
// Destructor. Its body is not visible in this view.
// NOTE(review): the constructor obtains `me` from newMatchExe(); confirm
// whether this destructor is responsible for releasing it.
56 MorphoStream::~MorphoStream()
// Produces the next word of the stream.
// Words already queued in `vwords` are served first (front element taken and
// removed). Otherwise a new TaggerWord is appended to `vwords` and the input
// is read one wide character at a time with fgetwc_unlocked(); text read
// before a word is accumulated in `str` and attached to the word with
// add_ignored_string(), and once '^' is seen readRestOfWord() is called.
// After filling the queue the method calls itself again to return a word.
// NOTE(review): the return-type line, the braces, the end-of-input tests and
// the declarations of `str`/`ivwords` are not visible in this view.
62 MorphoStream::get_next_word()
// Fast path: something is already queued.
64 if(vwords
.size() != 0)
66 TaggerWord
* word
=vwords
.front();
// NOTE(review): if vwords is a std::vector, erasing the first element shifts
// all remaining ones -- container type not visible here.
67 vwords
.erase(vwords
.begin());
68 // cout << *word << endl;
// Nothing queued: start a fresh word and read from the input.
78 vwords
.push_back(new TaggerWord());
82 int symbol
= fgetwc_unlocked(input
);
// The word is given the TAG_kEOF index with an empty lexical form
// (presumably when the end of input is reached; the guarding condition is
// not visible here), then the queued word is returned via the re-entrant call.
85 vwords
[ivwords
]->add_tag(ca_tag_keof
, L
"", td
->getPreferRules());
86 return get_next_word();
// A word is read with readRestOfWord(), then returned via the re-entrant call.
90 readRestOfWord(ivwords
);
91 return get_next_word();
98 symbol
= fgetwc_unlocked(input
);
// Append the character just read to the accumulated text.
101 str
+= static_cast<wchar_t>(symbol
);
// Keep consuming input until the '^' character is found.
103 while(symbol
!= L
'^')
105 symbol
= fgetwc_unlocked(input
);
// Store the accumulated text as ignored string and tag the word with the
// TAG_kEOF index (presumably the end-of-input case inside the loop).
108 vwords
[ivwords
]->add_ignored_string(str
);
109 vwords
[ivwords
]->add_tag(ca_tag_keof
, L
"", td
->getPreferRules());
110 return get_next_word();
// Backslash: another character is read right after it and appended to str
// (see the append below).
112 else if(symbol
== L
'\\')
115 symbol
= fgetwc_unlocked(input
);
// Same ignored-string + TAG_kEOF sequence as above (presumably when the
// read after the backslash hits the end of input).
118 vwords
[ivwords
]->add_ignored_string(str
);
119 vwords
[ivwords
]->add_tag(ca_tag_keof
, L
"", td
->getPreferRules());
120 return get_next_word();
122 str
+= static_cast<wchar_t>(symbol
);
// '^' found: attach the preceding text as ignored string, read the word
// itself and return it via the re-entrant call.
124 else if(symbol
== L
'^')
128 vwords
[ivwords
]->add_ignored_string(str
);
130 readRestOfWord(ivwords
);
131 return get_next_word();
// Any other character is simply appended to the accumulated text.
135 str
+= static_cast<wchar_t>(symbol
);
// Classifies the string `str` with the match state `ms` and adds the
// resulting tags to vwords[ivwords].
//   str     - string to classify; scanned from index 0 to str.size()-1.
//   ivwords - index into vwords of the word receiving the tags (passed by
//             non-const reference).
// The scan feeds `ms` either single characters or whole substrings ending
// in '>' (looked up in `alphabet`). When a tag has to be emitted, the
// substring [floor, last_pos] is added with last_type; if the following
// character is '+', the word is marked with set_plus_cut(true), an extra
// TaggerWord(true) is appended when vwords has no element after ivwords, and
// `ms` is re-initialised. When no tag can be assigned a warning is written
// to wcerr and the TAG_kUNDEF index is added instead.
// NOTE(review): the braces, several conditions and the declarations/updates
// of floor, last_pos, last_type and tag are not visible in this view.
143 MorphoStream::lrlmClassify(wstring
const &str
, int &ivwords
)
// Start matching from the initial state of the match executor.
149 ms
.init(me
->getInitial());
150 for(int i
= 0, limit
= str
.size(); i
!= limit
; i
++)
// Classification of the current match state against the final states.
156 int val
= ms
.classifyFinals(me
->getFinals());
// Feed the lower-cased character at position i, also passing ca_any_char.
163 ms
.step(towlower(str
[i
]), ca_any_char
);
// Search forward from i+1 for the '>' that closes the substring starting at i.
168 for(int j
= i
+1; j
!= limit
; j
++)
174 else if(str
[j
] == L
'>')
// tag covers positions i..j inclusive.
176 tag
= str
.substr(i
, j
-i
+1);
// Code of the tag in the alphabet, fed to ms together with ca_any_tag.
182 int symbol
= alphabet(tag
);
185 ms
.step(symbol
, ca_any_tag
);
// Emit the tag for [floor, last_pos] when last_pos differs from floor.
195 if(last_pos
!= floor
)
197 vwords
[ivwords
]->add_tag(last_type
,
198 str
.substr(floor
, last_pos
- floor
+ 1),
199 td
->getPreferRules());
// NOTE(review): str[last_pos+1] is evaluated before the bounds test
// last_pos+1 < limit; swapping the operands would make the guard effective.
200 if(str
[last_pos
+1] == L
'+' && last_pos
+1 < limit
)
// Continue after the accepted part, flag the current word as cut at '+',
// and make sure vwords has an element after ivwords.
202 floor
= last_pos
+ 1;
204 vwords
[ivwords
]->set_plus_cut(true);
205 if (((int)vwords
.size())<=((int)(ivwords
+1)))
206 vwords
.push_back(new TaggerWord(true));
// Restart matching from the initial state.
208 ms
.init(me
->getInitial());
// No tag available: warn on wcerr and add the TAG_kUNDEF index with the
// rest of the string starting at floor.
216 wcerr
<<L
"Warning: There is not coarse tag for the fine tag '"<< str
.substr(floor
) <<L
"'\n";
217 wcerr
<<L
" This is because of an incomplete tagset definition or a dictionary error\n";
219 vwords
[ivwords
]->add_tag(ca_tag_kundef
, str
.substr(floor
) , td
->getPreferRules());
// Last position of the string reached.
223 else if(i
== limit
- 1)
// classifyFinals() returned -1 for the current state.
225 if(ms
.classifyFinals(me
->getFinals()) == -1)
// NOTE(review): the statements from here to the TAG_kUNDEF add_tag below
// repeat the sequence above verbatim; a shared helper would remove the
// duplication.
227 if(last_pos
!= floor
)
229 vwords
[ivwords
]->add_tag(last_type
,
230 str
.substr(floor
, last_pos
- floor
+ 1),
231 td
->getPreferRules());
// NOTE(review): same operand order as above -- index used before bounds test.
232 if(str
[last_pos
+1] == L
'+' && last_pos
+1 < limit
)
234 floor
= last_pos
+ 1;
236 vwords
[ivwords
]->set_plus_cut(true);
237 if (((int)vwords
.size())<=((int)(ivwords
+1)))
238 vwords
.push_back(new TaggerWord(true));
240 ms
.init(me
->getInitial());
248 wcerr
<<L
"Warning: There is not coarse tag for the fine tag '"<< str
.substr(floor
) <<L
"'\n";
249 wcerr
<<L
" This is because of an incomplete tagset definition or a dictionary error\n";
251 vwords
[ivwords
]->add_tag(ca_tag_kundef
, str
.substr(floor
) , td
->getPreferRules());
// Classification after the scan; its value is what is added as the tag below.
258 int val
= ms
.classifyFinals(me
->getFinals());
// Same warning as above (the condition guarding it is not visible here).
264 wcerr
<<L
"Warning: There is not coarse tag for the fine tag '"<< str
.substr(floor
) <<L
"'\n";
265 wcerr
<<L
" This is because of an incomplete tagset definition or a dictionary error\n";
// Add val as the tag for the rest of the string starting at floor.
269 vwords
[ivwords
]->add_tag(val
, str
.substr(floor
), td
->getPreferRules());
// Reads the remainder of a word from the input into vwords[ivwords].
//   ivwords - index into vwords of the word being filled (passed by
//             non-const reference; also forwarded to lrlmClassify()).
// Two phases, each reading wide characters with fgetwc_unlocked() into `str`:
//   1. the superficial form, which ends at '/' or '$' and is stored with
//      set_superficial_form();
//   2. the entries that follow, each handed to lrlmClassify() when '/' or
//      '$' is read (on '$' only if the text does not start with '*').
// In both phases a backslash causes the next character to be read and
// appended to `str`.
// NOTE(review): the braces, the loop headers, the declaration/reset of `str`
// and the conditions guarding the warning branches are not visible in this
// view.
273 MorphoStream::readRestOfWord(int &ivwords
)
275 // first we have the superficial form
280 int symbol
= fgetwc_unlocked(input
);
// Abnormal case (presumably end of input while inside a word): keep what was
// read as ignored string, report on wcerr and add the TAG_kEOF index.
285 vwords
[ivwords
]->add_ignored_string(str
);
286 wcerr
<<L
"Warning (internal): kIGNORE was returned while reading a word\n";
287 wcerr
<<L
"Word being read: "<<vwords
[ivwords
]->get_superficial_form()<<L
"\n";
288 wcerr
<<L
"Debug: "<< str
<<L
"\n";
290 vwords
[ivwords
]->add_tag(ca_tag_keof
, L
"", td
->getPreferRules());
// Backslash: read the following character and append it to str.
// NOTE(review): no end-of-input check is visible between the read and the
// append -- confirm.
293 else if(symbol
== L
'\\')
295 symbol
= fgetwc_unlocked(input
);
297 str
+= static_cast<wchar_t>(symbol
);
// '/' ends the superficial form.
299 else if(symbol
== L
'/')
301 vwords
[ivwords
]->set_superficial_form(str
);
// '$' also ends the superficial form; the "$" itself is recorded as ignored
// string.
305 else if(symbol
== L
'$')
307 vwords
[ivwords
]->set_superficial_form(str
);
308 vwords
[ivwords
]->add_ignored_string(L
"$");
// Ordinary character of the superficial form.
313 str
+= static_cast<wchar_t>(symbol
);
317 // then we read the acceptions
321 int symbol
= fgetwc_unlocked(input
);
// Same abnormal-case handling as in the first phase.
326 vwords
[ivwords
]->add_ignored_string(str
);
327 wcerr
<<L
"Warning (internal): kIGNORE was returned while reading a word\n";
328 wcerr
<<L
"Word being read: "<<vwords
[ivwords
]->get_superficial_form()<<L
"\n";
329 wcerr
<<L
"Debug: "<< str
<<L
"\n";
331 vwords
[ivwords
]->add_tag(ca_tag_keof
, L
"", td
->getPreferRules());
// Backslash: read and append the following character, then overwrite symbol
// with the backslash itself (reason given in the trailing comment below).
334 else if(symbol
== L
'\\')
336 symbol
= fgetwc_unlocked(input
);
338 str
+= static_cast<wchar_t>(symbol
);
339 symbol
= L
'\\'; // to prevent exiting with '\$'
// '/' : classify the text read so far.
341 else if(symbol
== L
'/')
343 lrlmClassify(str
, ivwords
);
// '$' : classify the text read so far, unless it starts with '*'.
348 else if(symbol
== L
'$')
350 if(str
[0] != L
'*')// do nothing with unknown words
352 lrlmClassify(str
, ivwords
);
// Ordinary character of the current entry.
358 str
+= static_cast<wchar_t>(symbol
);