2 * Copyright (C) 2007 Francis Tyers
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
25 * This class is an implementation of a reverse Trie, the
26 * trie for 'function', 'fractal', 'fraction' and 'full' looks
50 unsigned int lemma
= 0;
55 // wcout << L"TrieNode()" << endl;
60 // wcout << L"~TrieNode()" << endl;
64 TrieNode::add(wstring s
)
67 wchar_t c
= s
.at(s
.length() - 1); // this is stupid but i
68 wstring x
= L
""; // can't think of a
69 x
+= c
; // better way to do it
73 wcout
<< L
"TrieNode::add( " << s
<< L
" ) [" << x
<< L
"]" << endl
;
76 if(children
.count(x
) == 0) {
78 wcout
<< L
"* Creating new TrieNode for `" << x
<< L
"'" << endl
;
80 children
[x
] = TrieNode();
83 wstring n
= s
.substr(0, s
.length() - 1);
85 wcout
<< L
" ? Adding: " << n
<< endl
;
91 wcout
<< L
"Returning from TrieNode::add()" << endl
;
98 * returns true if the string was found, false if it wasn't
102 TrieNode::find(wstring s
)
106 wcout
<< L
"Found." << endl
;
111 wstring x
= L
""; x
+= c
;
114 wcout
<< L
"TrieNode::find( " << s
<< L
" ) [" << x
<< L
" : " << s
.length() << L
"]" << endl
;
116 if(children
.count(x
) == 0) {
118 wcout
<< L
"Not found." << endl
;
121 children
[x
].find(s
.substr(1));
128 * Return a list of candidate lemmata from a suffix
129 * The 'candidates' argument is ammended and returned.
135 TrieNode::candidates(wstring sfx
, vector
<wstring
> & candidates
)
137 if(sfx
.empty()) { // reached the end of the suffix.
140 wcout
<< L
"Found." << endl
;
143 vector
<wstring
>::iterator eliter
= candidates
.begin();
144 eliter
= candidates
.insert(eliter
, L
"");
147 wcout
<< L
"TrieNode::candidates() [children: " << children
.size() << L
"]" << endl
;
150 print2(candidates
); // call this something less stupid
156 // we got to the end of the suffix so now we need to count up the
157 // candidates. we also need to reverse them to retrieve the original
159 for(eliter
= candidates
.begin(); eliter
!= candidates
.end(); eliter
++) {
160 wstring reversed
= (*eliter
);
161 unsigned int count
= 0;
163 for(unsigned int i
= (*eliter
).length(); i
!= 0; i
--) {
164 reversed
[count
++] = (*eliter
).at(i
-1);
167 (*eliter
) = reversed
;
170 wcout
<< L
" @ " << reversed
<< endl
;
176 } else { // we haven't reached the end of the suffix
178 wchar_t c
= sfx
.at(0);
179 wstring x
= L
""; x
+= c
;
182 wcout
<< L
"TrieNode::find( " << sfx
<< L
" ) [" << x
<< L
" : " << sfx
.length() << L
"]" << endl
;
185 if(children
.count(x
) == 0) {
186 // if there are no children to this character then the suffix has
187 // no candidate stems -- and we should return what we currently have
189 wcout
<< L
"Not found." << endl
;
193 // take the next character in the suffix and recurse down
194 // removing the current character from the suffix.
195 return children
[x
].candidates(sfx
.substr(1), candidates
);
200 /* THE MANGLING HAPPENS HERE */
203 TrieNode::print2(vector
<wstring
> & candidates
)
205 map
<wstring
,TrieNode
>::iterator iter
;
206 vector
<wstring
>::iterator eliter
= candidates
.end();
208 if(children
.empty()) {
210 wcout
<< L
" + candidates.size [e] = " << candidates
.size() << endl
;
214 } else if(children
.size() > 1) {
217 wcout
<< L
" + candidates.size [1] = " << candidates
.size() << L
" : " << children
.size() << L
" : " << candidates
.at(lemma
) << endl
;
220 for(unsigned int i
= 1; i
< children
.size(); i
++) {
221 eliter
= candidates
.insert(eliter
, candidates
.at(lemma
));
224 wcout
<< L
"(" << children
.size() << L
") We didn't do shit. Children wasn't empty and size wasn't > 1" << endl
;
227 for(iter
= children
.begin(); iter
!= children
.end(); iter
++) {
229 wcout
<< L
" + candidates.size [f] = " << candidates
.size() << L
" - " << candidates
[lemma
] << L
" += " << (*iter
).first
<< endl
;
231 candidates
[lemma
] += (*iter
).first
;
233 wcout
<< L
"[" << (*iter
).first
<< L
"] (" << children
.size() << L
") " << endl
;
235 children
[(*iter
).first
].print2(candidates
);
239 Trie::Trie(bool _verbose
)
253 // wcout << L"~Trie()" << endl;
260 wcout
<< L
"adding: " << s
<< L
" / " << endl
;
269 Trie::find(wstring s
)
272 wcout
<< L
"Trie::find( " << s
<< L
" ) " << endl
;
278 * Return a vector of candidate lemmas for a particular suffix
281 Trie::candidates(wstring sfx
)
283 vector
<wstring
> candidates
;
284 wstring reversed
= sfx
;
285 unsigned int count
= 0;
287 // make sure this is clear.
291 // we have to reverse the string because the Trie
292 // is in reverse order -- no standard library
293 // function for reversing strings iirc.
294 for(unsigned int i
= sfx
.length(); i
!= 0; i
--) {
295 reversed
[count
++] = sfx
.at(i
-1);
299 wcout
<< L
"sfx: " << sfx
<< "; rev: " << reversed
<< endl
;
302 return root
.candidates(reversed
, candidates
);