2 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 #include <lttoolbox/pattern_list.h>
20 #include <lttoolbox/compression.h>
// Definitions of the three static wide-string constants of PatternList.
// Each one is registered as an alphabet symbol by the default constructor
// and later looked up through alphabet(...) to obtain its integer code.
25 wstring
const PatternList::ANY_CHAR
= L
"<ANY_CHAR>";
26 wstring
const PatternList::ANY_TAG
= L
"<ANY_TAG>";
27 wstring
const PatternList::QUEUE
= L
"<QUEUE>";
// Copies the state of another PatternList into this one.
// The visible statements assign, member by member: sequence, sequence_data,
// patterns, alphabet, transducer, final_type and sequence_id.
// @param o  object whose members are copied
30 PatternList::copy(PatternList
const &o
)
32 sequence
= o
.sequence
;
33 sequence_data
= o
.sequence_data
;
34 patterns
= o
.patterns
;
35 alphabet
= o
.alphabet
;
36 transducer
= o
.transducer
;
37 final_type
= o
.final_type
;
38 sequence_id
= o
.sequence_id
;
// NOTE(review): only the signature of destroy() is visible in this listing;
// its body is not shown, so its behavior cannot be confirmed from here.
42 PatternList::destroy()
// Default constructor.
// The visible statements register the special symbols ANY_TAG, ANY_CHAR and
// QUEUE (in that order) in the alphabet so that alphabet(...) can later
// return a code for each of them.
// NOTE(review): other member initialisation may exist on lines not shown.
46 PatternList::PatternList()
49 alphabet
.includeSymbol(ANY_TAG
);
50 alphabet
.includeSymbol(ANY_CHAR
);
51 alphabet
.includeSymbol(QUEUE
);
// Destructor.
// NOTE(review): body not visible in this listing - presumably delegates to
// destroy(); confirm.
54 PatternList::~PatternList()
// Copy constructor.
// NOTE(review): body not visible in this listing - presumably delegates to
// copy(o); confirm.
// @param o  object to copy from
59 PatternList::PatternList(PatternList
const &o
)
// Copy-assignment operator.
// NOTE(review): body not visible in this listing - presumably releases the
// current state and then calls copy(o), guarding against self-assignment;
// confirm.
// @param o  object to assign from
65 PatternList::operator =(PatternList
const &o
)
// Starts the definition of a new pattern sequence.
// Visible behavior: an error message is written to cerr and sequence_data is
// cleared.
// NOTE(review): the condition guarding the error (presumably "a sequence is
// already open") and whatever follows the message are on lines not shown.
76 PatternList::beginSequence()
80 cerr
<< "Error: opening an unended sequence" << endl
;
// Drop any partial patterns left over from a previous sequence.
84 sequence_data
.clear();
// Closes the sequence currently being defined.
// Visible behavior: an error message is written to cerr (the guarding
// condition is not shown - presumably "no sequence is open"); then every
// vector in sequence_data is terminated with the QUEUE symbol code and
// stored in `patterns` under the key sequence_id.
88 PatternList::endSequence()
92 cerr
<< "Error: ending an unopened sequence" << endl
;
// Walk every partial pattern accumulated for this sequence.
97 for(list
<vector
<int> >::iterator it
= sequence_data
.begin(),
98 limit
= sequence_data
.end();
// Append the QUEUE marker, then register the finished pattern.
101 it
->push_back(alphabet(QUEUE
));
102 patterns
.insert(pair
<int, vector
<int> >(sequence_id
, *it
));
// Encodes a (lemma, tags) pair as integer symbol codes appended to `result`.
// `result` is used by the visible statements, but its declaration sits on a
// signature line that is not shown.
// Visible behavior:
//  - ANY_CHAR codes are pushed in some branches whose conditions are not shown
//    (presumably for an empty lemma and for wildcard characters - confirm);
//  - other lemma characters are pushed as int((unsigned char) lemma[i]);
//  - ANY_TAG codes are pushed in some branches whose conditions are not shown;
//  - other tags are wrapped as "<tag>", added to the alphabet if needed and
//    pushed as that symbol's code.
// @param lemma  lemma part of the pattern
// @param tags   tag string; split via tagCount()/tagAt()
107 PatternList::insertOutOfSequence(wstring
const &lemma
, wstring
const &tags
,
112 result
.push_back(alphabet(ANY_CHAR
));
// Encode the lemma character by character.
116 for(unsigned int i
= 0, limit
= lemma
.size(); i
< limit
; i
++)
120 result
.push_back(alphabet(ANY_CHAR
));
// NOTE(review): the cast narrows a wide character to unsigned char, so code
// points above 0xFF would be truncated - confirm this is intended.
124 result
.push_back(int((unsigned char) lemma
[i
]));
130 result
.push_back(alphabet(ANY_TAG
));
// Encode each tag returned by tagAt() for indices [0, tagCount(tags)).
134 for(unsigned int i
= 0, limit
= tagCount(tags
); i
< limit
; i
++)
136 wstring tag
= L
"<" + tagAt(tags
, i
) + L
">";
140 result
.push_back(alphabet(ANY_TAG
));
// Make sure the concrete tag exists as an alphabet symbol before asking for
// its code.
144 alphabet
.includeSymbol(tag
);
145 result
.push_back(alphabet(tag
));
// Adds a (lemma, tags) element to the sequence currently being built.
// Visible behavior: when sequence_data is empty, a fresh vector is encoded
// with insertOutOfSequence() and stored as the first partial pattern; a loop
// over all existing partial patterns extends each of them in place with the
// encoding of (lemma, tags).
// NOTE(review): the branch joining the two parts, and any separator pushed
// between elements, are on lines not shown. `tags` is used below, but its
// declaration sits on a signature line that is not shown.
// @param id     pattern identifier (not referenced by the visible statements)
// @param lemma  lemma part of the element
152 PatternList::insertIntoSequence(int const id
, wstring
const &lemma
,
157 if(sequence_data
.size() == 0)
159 vector
<int> new_vector
;
160 insertOutOfSequence(lemma
, tags
, new_vector
);
161 sequence_data
.push_back(new_vector
);
165 list
<vector
<int> >::iterator it
= sequence_data
.begin();
166 list
<vector
<int> >::iterator limit
= sequence_data
.end();
167 for(; it
!= limit
; it
++)
// Append this element's encoding to the existing partial pattern.
170 insertOutOfSequence(lemma
, tags
, *it
);
// Inserts a single (lemma, tags) pattern.
// Visible behavior, two paths (the selecting condition is not shown -
// presumably whether a sequence is currently open; confirm):
//  - stand-alone: encode into `local`, terminate it with the QUEUE code and
//    store it in `patterns` under `id`;
//  - in-sequence: forward to insertIntoSequence(id, lemma, tags).
// @param id     identifier the pattern is stored under
// @param lemma  lemma part of the pattern
// @param tags   tag string of the pattern
176 PatternList::insert(int const id
, wstring
const &lemma
, wstring
const &tags
)
181 insertOutOfSequence(lemma
, tags
, local
);
182 local
.push_back(alphabet(QUEUE
));
183 patterns
.insert(pair
<int, vector
<int> >(id
, local
));
187 insertIntoSequence(id
, lemma
, tags
);
// Inserts, into the sequence being built, every pattern already stored under
// the identifier `otherid`.
// Visible behavior:
//  - an error is written to cerr (guard not shown - presumably when no
//    sequence is open; confirm);
//  - if sequence_data is empty, each pattern registered under `otherid` is
//    copied into it;
//  - a new list is built that holds, for every existing partial pattern and
//    every pattern of `otherid`, the partial pattern followed by L'+' and then
//    the referenced pattern; this list replaces sequence_data.
// NOTE(review): the branch joining the last two parts is on lines not shown.
// @param id       identifier (not referenced by the visible statements)
// @param otherid  key into `patterns` of the patterns to splice in
192 PatternList::insert(int const id
, int const otherid
)
196 cerr
<< "Error: using labels outside of a sequence" << endl
;
202 if(sequence_data
.size() == 0)
// equal_range yields all patterns stored under otherid.
204 PatternRange p
= patterns
.equal_range(otherid
);
205 for(; p
.first
!= p
.second
; p
.first
++)
207 sequence_data
.push_back(p
.first
->second
);
212 list
<vector
<int> > new_sequence_data
;
// Combine every existing partial pattern with every pattern of otherid.
214 for(list
<vector
<int> >::iterator it
= sequence_data
.begin(),
215 limit
= sequence_data
.end(); it
!= limit
; it
++)
217 for(PatternRange p
= patterns
.equal_range(otherid
);
218 p
.first
!= p
.second
; p
.first
++)
220 vector
<int> temp
= *it
;
// L'+' is pushed between the existing part and the spliced pattern.
221 temp
.push_back(L
'+');
222 temp
.insert(temp
.end(), (p
.first
->second
).begin(),
223 (p
.first
->second
).end());
224 new_sequence_data
.push_back(temp
);
228 sequence_data
= new_sequence_data
;
// Counts the tags contained in a tag string.
// Visible behavior: the string is scanned character by character and L'.' is
// tested for.
// NOTE(review): the counting statements and the return are on lines not shown;
// presumably tags are dot-separated and the result is the number of
// components - confirm.
// @param tags  tag string to inspect
233 PatternList::tagCount(wstring
const &tags
)
237 for(unsigned int i
= 0, limit
= tags
.size(); i
< limit
; i
++)
243 else if(tags
[i
] == L
'.')
// Returns one tag of a tag string, selected by position.
// Visible behavior: the string is scanned; once `count` equals index + 1 the
// slice [start, end) is returned; two fallback returns after the scan yield
// the remainder of the string from end + 1 or from end.
// NOTE(review): how start/end/count are updated, and the conditions selecting
// between the two fallbacks, are on lines not shown.
// @param tags   tag string to split
// @param index  position of the wanted tag (compared as index + 1 against the
//               running count)
253 PatternList::tagAt(wstring
const &tags
, int const index
)
259 for(unsigned int i
= 0, limit
= tags
.size(); i
< limit
; i
++)
274 if(count
== index
+ 1)
276 return tags
.substr(start
, end
- start
);
286 return tags
.substr(end
+ 1);
290 return tags
.substr(end
);
// NOTE(review): only the signature is visible in this listing; presumably
// returns the `patterns` store - confirm.
295 PatternList::getPatterns()
// Builds `transducer` from every pattern stored in `patterns`.
// For each pattern the walk starts at the transducer's initial state and
// consumes the pattern's symbol codes one by one:
//  - ANY_CHAR / ANY_TAG: a transition on the code is inserted, and linkStates
//    calls add prevstate -> state and a state -> state self-loop on the same
//    code (the conditions selecting these calls are not shown);
//  - QUEUE: a transition on L'_' is inserted, alternatives on L' ' and L'#'
//    are linked from prevstate, and a self-loop on ANY_CHAR is added;
//  - any other code: a plain transition on the code is inserted (plus a
//    prevstate -> state link under a condition that is not shown).
// Final states are flagged and final_type maps them to the pattern's key.
// NOTE(review): the definition of `prevstate` and several branch conditions
// are on lines not shown.
301 PatternList::buildTransducer()
303 for(PatternStore::const_iterator it
= patterns
.begin(), limit
= patterns
.end();
// Every pattern is inserted starting from the initial state.
306 int state
= transducer
.getInitial();
308 for(unsigned int i
= 0, limit2
= it
->second
.size(); i
!= limit2
; i
++)
310 int const val
= it
->second
[i
];
311 if(alphabet(ANY_CHAR
) == val
|| alphabet(ANY_TAG
) == val
)
313 state
= transducer
.insertSingleTransduction(val
, state
);
316 transducer
.linkStates(prevstate
, state
, val
);
// Self-loop: the wildcard may match repeatedly.
319 transducer
.linkStates(state
, state
, val
);
321 else if(alphabet(QUEUE
) == val
)
325 // ignore second (and next) possible consecutive queues
331 state
= transducer
.insertSingleTransduction(static_cast<int>(L
'_'), state
);
332 transducer
.linkStates(prevstate
, state
, static_cast<int>(L
' '));
333 transducer
.linkStates(prevstate
, state
, static_cast<int>(L
'#'));
334 transducer
.linkStates(state
, state
, alphabet(ANY_CHAR
));
338 state
= transducer
.insertSingleTransduction(val
, state
);
341 transducer
.linkStates(prevstate
, state
, val
);
// Flag accepting states and remember which pattern key each one stands for;
// a state that is already final keeps its existing final_type entry.
348 if(!transducer
.isFinal(prevstate
))
350 transducer
.setFinal(prevstate
);
351 final_type
[prevstate
] = it
->first
;
355 if(!transducer
.isFinal(state
))
357 transducer
.setFinal(state
);
358 final_type
[state
] = it
->first
;
// Serialises the pattern list to `output`.
// Order of the visible writes: the alphabet; the multibyte value 1; the name
// L"tagger"; the transducer (passing alphabet.size()); the number of
// final_type entries; then each final_type entry as a key followed by its
// value.
// @param output  stream the data is written to
364 PatternList::write(FILE *output
)
366 alphabet
.write(output
);
367 wstring
const tagger_name
= L
"tagger";
// The 1 written here is what read() compares against before loading the
// transducer - presumably the number of named transducers that follow;
// confirm.
369 Compression::multibyte_write(1, output
);
370 Compression::wstring_write(tagger_name
, output
);
371 transducer
.write(output
, alphabet
.size());
373 Compression::multibyte_write(final_type
.size(), output
);
375 for(map
<int, int>::const_iterator it
= final_type
.begin(), limit
= final_type
.end();
378 Compression::multibyte_write(it
->first
, output
);
379 Compression::multibyte_write(it
->second
, output
);
// Loads a pattern list from `input`, mirroring the layout produced by write().
// Visible behavior: the alphabet is read; if the next multibyte value is 1, a
// name string is read into `mystr` (it is not used by any visible statement),
// the transducer is read, and then a count followed by that many key/value
// pairs is read into final_type.
// NOTE(review): no handling is visible for a leading value other than 1.
// @param input  stream the data is read from
384 PatternList::read(FILE *input
)
389 alphabet
.read(input
);
390 if(Compression::multibyte_read(input
) == 1)
392 wstring mystr
= Compression::wstring_read(input
);
393 transducer
.read(input
, alphabet
.size());
395 int finalsize
= Compression::multibyte_read(input
);
396 for(; finalsize
!= 0; finalsize
--)
// The key is read before the value, matching the order used by write().
398 int key
= Compression::multibyte_read(input
);
399 final_type
[key
] = Compression::multibyte_read(input
);
// Creates a MatchExe from the current transducer and final_type table.
// The object is allocated with `new`; nothing in this function releases it,
// so the caller presumably takes ownership - confirm.
405 PatternList::newMatchExe()
407 return new MatchExe(transducer
, final_type
);
// NOTE(review): only the signature is visible in this listing; presumably
// returns the `alphabet` member - confirm.
411 PatternList::getAlphabet()