Moving non-unicode apertium to branches
[apertium.git] / apertium-unicode / apertium / morpho_stream.cc
bloba498ef457d760b086362d6b77205c03606c55b84
/*
 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */
/**
 * Word class and MorphoStream class definitions
 *
 * @author Felipe Sánchez-Martínez
 */
25 #include <apertium/morpho_stream.h>
26 #include <apertium/constant_manager.h>
27 #include <vector>
28 #include <apertium/string_utils.h>
30 using namespace Apertium;
31 MorphoStream::MorphoStream(FILE *ftxt, bool d, TaggerData *t)
33 foundEOF = false;
34 debug=d;
35 td = t;
36 me = td->getPatternList().newMatchExe();
37 alphabet = td->getPatternList().getAlphabet();
38 input = ftxt;
39 ca_any_char = alphabet(PatternList::ANY_CHAR);
40 ca_any_tag = alphabet(PatternList::ANY_TAG);
42 ConstantManager &constants = td->getConstants();
43 ca_kignorar = constants.getConstant(L"kIGNORAR");
44 ca_kbarra = constants.getConstant(L"kBARRA");
45 ca_kdollar = constants.getConstant(L"kDOLLAR");
46 ca_kbegin = constants.getConstant(L"kBEGIN");
47 ca_kmot = constants.getConstant(L"kMOT");
48 ca_kmas = constants.getConstant(L"kMAS");
49 ca_kunknown = constants.getConstant(L"kUNKNOWN");
51 map<wstring, int, Ltstr> &tag_index = td->getTagIndex();
52 ca_tag_keof = tag_index[L"TAG_kEOF"];
53 ca_tag_kundef = tag_index[L"TAG_kUNDEF"];
56 MorphoStream::~MorphoStream()
58 delete me;
61 TaggerWord*
62 MorphoStream::get_next_word()
64 if(vwords.size() != 0)
66 TaggerWord* word=vwords.front();
67 vwords.erase(vwords.begin());
68 // cout << *word << endl;
69 return word;
72 if(feof(input))
74 return NULL;
77 int ivwords = 0;
78 vwords.push_back(new TaggerWord());
80 while(true)
82 int symbol = fgetwc_unlocked(input);
83 if(feof(input))
85 vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules());
86 return get_next_word();
88 if(symbol == L'^')
90 readRestOfWord(ivwords);
91 return get_next_word();
93 else
95 wstring str = L"";
96 if(symbol == L'\\')
98 symbol = fgetwc_unlocked(input);
99 str += L'\\';
101 str += static_cast<wchar_t>(symbol);
103 while(symbol != L'^')
105 symbol = fgetwc_unlocked(input);
106 if(feof(input))
108 vwords[ivwords]->add_ignored_string(str);
109 vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules());
110 return get_next_word();
112 else if(symbol == L'\\')
114 str += L'\\';
115 symbol = fgetwc_unlocked(input);
116 if(feof(input))
118 vwords[ivwords]->add_ignored_string(str);
119 vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules());
120 return get_next_word();
122 str += static_cast<wchar_t>(symbol);
124 else if(symbol == L'^')
126 if(str.size() > 0)
128 vwords[ivwords]->add_ignored_string(str);
130 readRestOfWord(ivwords);
131 return get_next_word();
133 else
135 str += static_cast<wchar_t>(symbol);
142 void
143 MorphoStream::lrlmClassify(wstring const &str, int &ivwords)
145 int floor = 0;
146 int last_type = -1;
147 int last_pos = 0;
149 ms.init(me->getInitial());
150 for(int i = 0, limit = str.size(); i != limit; i++)
152 if(str[i] != L'<')
154 if(str[i] == L'+')
156 int val = ms.classifyFinals(me->getFinals());
157 if(val != -1)
159 last_pos = i-1;
160 last_type = val;
163 ms.step(towlower(str[i]), ca_any_char);
165 else
167 wstring tag = L"";
168 for(int j = i+1; j != limit; j++)
170 if(str[j] == L'\\')
172 j++;
174 else if(str[j] == L'>')
176 tag = str.substr(i, j-i+1);
177 i = j;
178 break;
182 int symbol = alphabet(tag);
183 if(symbol)
185 ms.step(symbol, ca_any_tag);
187 else
189 ms.step(ca_any_tag);
193 if(ms.size() == 0)
195 if(last_pos != floor)
197 vwords[ivwords]->add_tag(last_type,
198 str.substr(floor, last_pos - floor + 1),
199 td->getPreferRules());
200 if(str[last_pos+1] == L'+' && last_pos+1 < limit )
202 floor = last_pos + 1;
203 last_pos = floor;
204 vwords[ivwords]->set_plus_cut(true);
205 if (((int)vwords.size())<=((int)(ivwords+1)))
206 vwords.push_back(new TaggerWord(true));
207 ivwords++;
208 ms.init(me->getInitial());
210 i = floor++;
212 else
214 if (debug)
216 wcerr<<L"Warning: There is not coarse tag for the fine tag '"<< str.substr(floor) <<L"'\n";
217 wcerr<<L" This is because of an incomplete tagset definition or a dictionary error\n";
219 vwords[ivwords]->add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules());
220 return;
223 else if(i == limit - 1)
225 if(ms.classifyFinals(me->getFinals()) == -1)
227 if(last_pos != floor)
229 vwords[ivwords]->add_tag(last_type,
230 str.substr(floor, last_pos - floor + 1),
231 td->getPreferRules());
232 if(str[last_pos+1] == L'+' && last_pos+1 < limit )
234 floor = last_pos + 1;
235 last_pos = floor;
236 vwords[ivwords]->set_plus_cut(true);
237 if (((int)vwords.size())<=((int)(ivwords+1)))
238 vwords.push_back(new TaggerWord(true));
239 ivwords++;
240 ms.init(me->getInitial());
242 i = floor++;
244 else
246 if (debug)
248 wcerr<<L"Warning: There is not coarse tag for the fine tag '"<< str.substr(floor) <<L"'\n";
249 wcerr<<L" This is because of an incomplete tagset definition or a dictionary error\n";
251 vwords[ivwords]->add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules());
252 return;
258 int val = ms.classifyFinals(me->getFinals());
259 if(val == -1)
261 val = ca_tag_kundef;
262 if (debug)
264 wcerr<<L"Warning: There is not coarse tag for the fine tag '"<< str.substr(floor) <<L"'\n";
265 wcerr<<L" This is because of an incomplete tagset definition or a dictionary error\n";
269 vwords[ivwords]->add_tag(val, str.substr(floor), td->getPreferRules());
272 void
273 MorphoStream::readRestOfWord(int &ivwords)
275 // first we have the superficial form
276 wstring str = L"";
278 while(true)
280 int symbol = fgetwc_unlocked(input);
281 if(feof(input))
283 if(str.size() > 0)
285 vwords[ivwords]->add_ignored_string(str);
286 wcerr<<L"Warning (internal): kIGNORE was returned while reading a word\n";
287 wcerr<<L"Word being read: "<<vwords[ivwords]->get_superficial_form()<<L"\n";
288 wcerr<<L"Debug: "<< str <<L"\n";
290 vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules());
291 return;
293 else if(symbol == L'\\')
295 symbol = fgetwc_unlocked(input);
296 str += L'\\';
297 str += static_cast<wchar_t>(symbol);
299 else if(symbol == L'/')
301 vwords[ivwords]->set_superficial_form(str);
302 str = L"";
303 break;
305 else if(symbol == L'$')
307 vwords[ivwords]->set_superficial_form(str);
308 vwords[ivwords]->add_ignored_string(L"$");
309 break;
311 else
313 str += static_cast<wchar_t>(symbol);
317 // then we read the acceptions
319 while(true)
321 int symbol = fgetwc_unlocked(input);
322 if(feof(input))
324 if(str.size() > 0)
326 vwords[ivwords]->add_ignored_string(str);
327 wcerr<<L"Warning (internal): kIGNORE was returned while reading a word\n";
328 wcerr<<L"Word being read: "<<vwords[ivwords]->get_superficial_form()<<L"\n";
329 wcerr<<L"Debug: "<< str <<L"\n";
331 vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules());
332 return;
334 else if(symbol == L'\\')
336 symbol = fgetwc_unlocked(input);
337 str += L'\\';
338 str += static_cast<wchar_t>(symbol);
339 symbol = L'\\'; // to prevent exiting with '\$'
341 else if(symbol == L'/')
343 lrlmClassify(str, ivwords);
344 str = L"";
345 ivwords = 0;
346 continue;
348 else if(symbol == L'$')
350 if(str[0] != L'*')// do nothing with unknown words
352 lrlmClassify(str, ivwords);
354 return;
356 else
358 str += static_cast<wchar_t>(symbol);