These are taken care of
[apertium.git] / lttoolbox-unicode / lttoolbox / pattern_list.cc
blob5c118c1e557bb5e6c198a79e20d1153543888ea6
/*
 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */
19 #include <lttoolbox/pattern_list.h>
20 #include <lttoolbox/compression.h>
22 #include <cstdlib>
23 #include <iostream>
25 wstring const PatternList::ANY_CHAR = L"<ANY_CHAR>";
26 wstring const PatternList::ANY_TAG = L"<ANY_TAG>";
27 wstring const PatternList::QUEUE = L"<QUEUE>";
29 void
30 PatternList::copy(PatternList const &o)
32 sequence = o.sequence;
33 sequence_data = o.sequence_data;
34 patterns = o.patterns;
35 alphabet = o.alphabet;
36 transducer = o.transducer;
37 final_type = o.final_type;
38 sequence_id = o.sequence_id;
41 void
42 PatternList::destroy()
46 PatternList::PatternList()
48 sequence = false;
49 alphabet.includeSymbol(ANY_TAG);
50 alphabet.includeSymbol(ANY_CHAR);
51 alphabet.includeSymbol(QUEUE);
54 PatternList::~PatternList()
56 destroy();
59 PatternList::PatternList(PatternList const &o)
61 copy(o);
64 PatternList &
65 PatternList::operator =(PatternList const &o)
67 if(this != &o)
69 destroy();
70 copy(o);
72 return *this;
75 void
76 PatternList::beginSequence()
78 if(sequence)
80 cerr << "Error: opening an unended sequence" << endl;
81 exit(EXIT_FAILURE);
83 sequence = true;
84 sequence_data.clear();
87 void
88 PatternList::endSequence()
90 if(!sequence)
92 cerr << "Error: ending an unopened sequence" << endl;
93 exit(EXIT_FAILURE);
95 sequence = false;
97 for(list<vector<int> >::iterator it = sequence_data.begin(),
98 limit = sequence_data.end();
99 it != limit; it++)
101 it->push_back(alphabet(QUEUE));
102 patterns.insert(pair<int, vector<int> >(sequence_id, *it));
106 void
107 PatternList::insertOutOfSequence(wstring const &lemma, wstring const &tags,
108 vector<int> &result)
110 if(lemma == L"")
112 result.push_back(alphabet(ANY_CHAR));
114 else
116 for(unsigned int i = 0, limit = lemma.size(); i < limit; i++)
118 if(lemma[i] == L'*')
120 result.push_back(alphabet(ANY_CHAR));
122 else
124 result.push_back(int((unsigned char) lemma[i]));
128 if(tags == L"")
130 result.push_back(alphabet(ANY_TAG));
132 else
134 for(unsigned int i = 0, limit = tagCount(tags); i < limit; i++)
136 wstring tag = L"<" + tagAt(tags, i) + L">";
138 if(tag == L"<*>")
140 result.push_back(alphabet(ANY_TAG));
142 else
144 alphabet.includeSymbol(tag);
145 result.push_back(alphabet(tag));
151 void
152 PatternList::insertIntoSequence(int const id, wstring const &lemma,
153 wstring const &tags)
155 sequence_id = id;
157 if(sequence_data.size() == 0)
159 vector<int> new_vector;
160 insertOutOfSequence(lemma, tags, new_vector);
161 sequence_data.push_back(new_vector);
163 else
165 list<vector<int> >::iterator it = sequence_data.begin();
166 list<vector<int> >::iterator limit = sequence_data.end();
167 for(; it != limit; it++)
169 it->push_back(L'+');
170 insertOutOfSequence(lemma, tags, *it);
175 void
176 PatternList::insert(int const id, wstring const &lemma, wstring const &tags)
178 if(!sequence)
180 vector<int> local;
181 insertOutOfSequence(lemma, tags, local);
182 local.push_back(alphabet(QUEUE));
183 patterns.insert(pair<int, vector<int> >(id, local));
185 else
187 insertIntoSequence(id, lemma, tags);
191 void
192 PatternList::insert(int const id, int const otherid)
194 if(!sequence)
196 cerr << "Error: using labels outside of a sequence" << endl;
197 exit(EXIT_FAILURE);
200 sequence_id = id;
202 if(sequence_data.size() == 0)
204 PatternRange p = patterns.equal_range(otherid);
205 for(; p.first != p.second; p.first++)
207 sequence_data.push_back(p.first->second);
210 else
212 list<vector<int> > new_sequence_data;
214 for(list<vector<int> >::iterator it = sequence_data.begin(),
215 limit = sequence_data.end(); it != limit; it++)
217 for(PatternRange p = patterns.equal_range(otherid);
218 p.first != p.second; p.first++)
220 vector<int> temp = *it;
221 temp.push_back(L'+');
222 temp.insert(temp.end(), (p.first->second).begin(),
223 (p.first->second).end());
224 new_sequence_data.push_back(temp);
228 sequence_data = new_sequence_data;
233 PatternList::tagCount(wstring const &tags)
235 int count = 0;
237 for(unsigned int i = 0, limit = tags.size(); i < limit; i++)
239 if(i == 0)
241 count++;
243 else if(tags[i] == L'.')
245 count++;
249 return count;
252 wstring
253 PatternList::tagAt(wstring const &tags, int const index)
255 int start = 0;
256 int end = 0;
257 int count = 0;
259 for(unsigned int i = 0, limit = tags.size(); i < limit; i++)
261 if(tags[i] == L'.')
263 count++;
264 if(end == 0)
266 start = 0;
268 else
270 start = end + 1;
272 end = i;
274 if(count == index + 1)
276 return tags.substr(start, end - start);
280 if(index > count)
282 return L"";
284 if(end != 0)
286 return tags.substr(end + 1);
288 else
290 return tags.substr(end);
294 PatternStore const &
295 PatternList::getPatterns()
297 return patterns;
300 void
301 PatternList::buildTransducer()
303 for(PatternStore::const_iterator it = patterns.begin(), limit = patterns.end();
304 it != limit; it++)
306 int state = transducer.getInitial();
307 int prevstate = -1;
308 for(unsigned int i = 0, limit2 = it->second.size(); i != limit2; i++)
310 int const val = it->second[i];
311 if(alphabet(ANY_CHAR) == val || alphabet(ANY_TAG) == val)
313 state = transducer.insertSingleTransduction(val, state);
314 if(prevstate != -1)
316 transducer.linkStates(prevstate, state, val);
317 prevstate = -1;
319 transducer.linkStates(state, state, val);
321 else if(alphabet(QUEUE) == val)
323 if(prevstate != -1)
325 // ignore second (and next) possible consecutive queues
326 continue;
329 // optional queue
330 prevstate = state;
331 state = transducer.insertSingleTransduction(static_cast<int>(L'_'), state);
332 transducer.linkStates(prevstate, state, static_cast<int>(L' '));
333 transducer.linkStates(prevstate, state, static_cast<int>(L'#'));
334 transducer.linkStates(state, state, alphabet(ANY_CHAR));
336 else
338 state = transducer.insertSingleTransduction(val, state);
339 if(prevstate != -1)
341 transducer.linkStates(prevstate, state, val);
342 prevstate = -1;
346 if(prevstate != -1)
348 if(!transducer.isFinal(prevstate))
350 transducer.setFinal(prevstate);
351 final_type[prevstate] = it->first;
353 prevstate = -1;
355 if(!transducer.isFinal(state))
357 transducer.setFinal(state);
358 final_type[state] = it->first;
363 void
364 PatternList::write(FILE *output)
366 alphabet.write(output);
367 wstring const tagger_name = L"tagger";
369 Compression::multibyte_write(1, output);
370 Compression::wstring_write(tagger_name, output);
371 transducer.write(output, alphabet.size());
373 Compression::multibyte_write(final_type.size(), output);
375 for(map<int, int>::const_iterator it = final_type.begin(), limit = final_type.end();
376 it != limit; it++)
378 Compression::multibyte_write(it->first, output);
379 Compression::multibyte_write(it->second, output);
383 void
384 PatternList::read(FILE *input)
386 sequence = false;
387 final_type.clear();
389 alphabet.read(input);
390 if(Compression::multibyte_read(input) == 1)
392 wstring mystr = Compression::wstring_read(input);
393 transducer.read(input, alphabet.size());
395 int finalsize = Compression::multibyte_read(input);
396 for(; finalsize != 0; finalsize--)
398 int key = Compression::multibyte_read(input);
399 final_type[key] = Compression::multibyte_read(input);
404 MatchExe *
405 PatternList::newMatchExe()
407 return new MatchExe(transducer, final_type);
410 Alphabet &
411 PatternList::getAlphabet()
413 return alphabet;