HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / Lexicon.cpp
blobd9a6b664ced5608ab414227a8fac6a1c39ed5fba
1 // Implementation of CLexicon’s core methods
2 // Copyright © 2009 The University of Chicago
3 #include "Lexicon.h"
5 #include "Config.h"
6 #include <iostream>
7 #include <memory>
8 #include <Q3TextStream>
9 #include <QMessageBox>
10 #include <QTime>
11 #include <QList>
12 #include <QString>
13 #include "linguisticamainwindow.h"
14 #include "ui/Status.h"
15 #include "MiniLexicon.h"
16 #include "LPreferences.h"
17 #include "DCNcorpussyl.h"
18 #include "DLHistory.h"
19 #include "StateEmitHMM.h"
20 #include "CorpusWord.h"
21 #include "Signature.h"
22 #include "Linker.h"
23 #include "Suffix.h"
24 #include "Prefix.h"
25 #include "Stem.h"
26 #include "CorpusWordCollection.h"
27 #include "SignatureCollection.h"
28 #include "CompoundCollection.h"
29 #include "LinkerCollection.h"
30 #include "SuffixCollection.h"
31 #include "PrefixCollection.h"
32 #include "WordCollection.h"
33 #include "StemCollection.h"
34 #include "generaldefinitions.h"
35 #include "ScrubRules.h"
36 #include "Typedefs.h"
37 #include "Slice.h"
39 namespace {
40 QString escapes(QString start)
42 QString end = start;
44 end.replace( "[", "\\[" );
45 end.replace( "]", "\\]" );
46 end.replace( "(", "\\(" );
47 end.replace( ")", "\\)" );
48 end.replace( "!", "\\!" );
49 end.replace( "?", "\\?" );
50 end.replace( "^", "\\^" );
51 end.replace( "$", "\\$" );
53 return end;
56 enum CStem::type FindType(QString word)
58 int HyphenCount = 0;
59 int PuncCount = 0;
60 int NumberPunc = 0;
61 int DigitCount = 0;
63 int Length = word.length();
65 for ( int i = 0; i < Length; i++)
67 if( word[i].category() == QChar::Punctuation_Dash )
69 HyphenCount++;
72 if( word[i].isPunct() )
74 PuncCount++;
77 if( word[i] == '.' ||
78 word[i] == ',' ||
79 word[i].category() == QChar::Symbol_Currency ||
80 word[i].isNumber() )
82 NumberPunc ++;
85 if( word[i].isDigit() )
87 DigitCount++;
91 if( DigitCount == 0 && HyphenCount == 0 && PuncCount == 0 )
93 return CStem::NORMAL;
96 if( Length > 2 && HyphenCount == 1 && PuncCount == 1 )
98 if (DigitCount > 0 && DigitCount == Length - 1)
99 return CStem::NUMBER;
100 return CStem::BIWORD_COMPOUND;
103 if( DigitCount > 0 && DigitCount + NumberPunc >= Length )
105 return CStem::NUMBER;
108 if( Length > 3 && HyphenCount > 1 && PuncCount == HyphenCount )
110 return CStem::MULTIPLE_COMPOUND;
113 return CStem::UNKNOWN;
// Helpers for picking prime sizes.
//
// Primality is decided by trial division, so no table of primes is stored.
namespace primes {
	// Number of primes below 10000 (the largest of them is 9973).
	// Not used by the functions below; retained in case other code in this
	// file refers to it.
	const int NUM_PRIMES = 1229;
	// Number of entries in the step table (detail::PRIME_STEPS).
	const int NUM_PRIME_STEPS = 38;
	// 9973 * 9973.  getNextPrime() does not search at or above this value.
	const int LARGE_NON_PRIME = 99460729;
	// Value at which getNextPrime() and getPrevPrime() saturate.
	// Presumably the first prime above LARGE_NON_PRIME -- TODO confirm.
	const int LARGER_PRIME = 99460747;
	// Last entry of the step table.
	const int LARGEST_PRIME_STEP = 10000019;

	// Implementation details; not pulled in by `using namespace primes`.
	namespace detail {
		// Coarse, roughly geometric ladder of primes.
		const int PRIME_STEPS[NUM_PRIME_STEPS] = {
			17, 37, 67,
			131, 257, 521,
			1031, 2053, 4099, 8209,
			10007, 20011, 30011, 40009, 50021, 60013, 70001, 80021, 90001,
			100003, 200003, 300007, 400009, 500009, 600011, 700001, 800011, 900001,
			1000003, 2000003, 3000017, 4000037, 5000011, 6000011, 7000003, 8000009, 9000011,
			10000019
		};

		// True if `candidate` is prime.  Trial division by 2 and by the
		// odd numbers up to the square root of `candidate`.
		inline bool isPrime( int candidate )
		{
			if( candidate < 2 ) return false;
			if( candidate % 2 == 0 ) return candidate == 2;

			// `divisor <= candidate / divisor` is `divisor * divisor <=
			// candidate` written so that it cannot overflow.
			for( int divisor = 3; divisor <= candidate / divisor; divisor += 2 )
			{
				if( candidate % divisor == 0 ) return false;
			}
			return true;
		}
	}

	// Returns the smallest prime strictly greater than `number`.
	//
	// Any `number` below 2 yields 2.  Any `number` at or above
	// LARGE_NON_PRIME yields LARGER_PRIME, and no result ever exceeds
	// LARGER_PRIME.
	//
	// Unlike the earlier table-based code, this also works for inputs
	// above 9973 (which divided by a loop index starting at zero) and for
	// 9973 itself (which returned 0).
	int getNextPrime( int number )
	{
		if( number >= LARGE_NON_PRIME ) return LARGER_PRIME;

		int candidate = ( number < 2 ) ? 2 : number + 1;
		for( ; candidate < LARGER_PRIME; ++candidate )
		{
			if( detail::isPrime( candidate ) ) return candidate;
		}
		return LARGER_PRIME;
	}

	// Returns the largest prime strictly less than `number`, or 0 if there
	// is none (`number` <= 2).  Any `number` above LARGER_PRIME yields
	// LARGER_PRIME.
	int getPrevPrime( int number )
	{
		if( number > LARGER_PRIME ) return LARGER_PRIME;
		if( number <= 2 ) return 0;

		for( int candidate = number - 1; candidate >= 2; --candidate )
		{
			if( detail::isPrime( candidate ) ) return candidate;
		}
		return 0;
	}

	// Returns the first step-table entry greater than `number`.  Beyond
	// the end of the table it falls back to getNextPrime().
	int getNextPrimeStep( int number )
	{
		for( int i = 0; i < NUM_PRIME_STEPS; i++ )
		{
			if( detail::PRIME_STEPS[i] > number ) return detail::PRIME_STEPS[i];
		}

		return getNextPrime( number );
	}

	// Returns the last step-table entry smaller than `number`.  Any
	// `number` below the first entry (17) yields 17.
	//
	// NOTE(review): for `number` == 17 no entry is smaller, so the result
	// is getPrevPrime(17) == 13, which is below the table minimum.  This
	// is the existing behaviour; confirm whether 17 was intended.
	int getPrevPrimeStep( int number )
	{
		if( number < 17 ) return 17;

		for( int i = NUM_PRIME_STEPS - 1; i >= 0; i-- )
		{
			if( detail::PRIME_STEPS[i] < number ) return detail::PRIME_STEPS[i];
		}

		return getPrevPrime( number );
	}
}
463 using namespace primes;
466 //////////////////////////////////////////////////////////////////////
467 // Construction/Destruction
468 //////////////////////////////////////////////////////////////////////
470 /** \page page1 How to add a variable to the User Preference Window
472 \section sec The main screen
473 In the Lexicon.cpp file, the main constructor function:
475 CLexicon( LinguisticaMainWindow* pDoc )
477 has a block of code following the comment:
478 Set parameter defaults for all lexicon functions;
479 each line takes the form:
481 m_ParamDefaults["Main\\MinimumMorphemeLength"] = "2".
483 These lines create the entries in the Windows Registry which can be used for Linguistica to read from
484 in various functions. So you have to do two things: add a line to this block of code, and also
485 add a line of code in a function where you want a variable to get its value from the Registry. Such
486 a line of code would look like this:
488 int LoopLimit = m_pLexicon->GetIntParameter( "CheckSignatures\\LoopLimit", 1 );
490 You'll be calling a Linguistica function "GetIntParameter", which returns an int from a hash.
492 If you want to get a QString rather than an int, you use the function "GetStringParameter".
497 CLexicon::CLexicon(LinguisticaMainWindow* pDoc)
498 : m_pDoc(pDoc),
499 m_ParamDefaults(), // initialized below
501 m_Corpus(),
502 m_pCorpusWords(new CCorpusWordCollection(this)),
503 m_CorpusMap(),
505 m_pMiniLexica(new Q3PtrVector<CMiniLexicon>(4)) /* initialized below */,
506 m_ActiveMini(-1),
507 m_WordUpdates(),
508 // Global morpheme collections
509 m_AllPrefixes(), // initialized below
510 m_AllPrefixSigs(), // initialized below
511 m_AllStems(), // initialized below
512 m_AllSuffixes(), // initialized below
513 m_AllSuffixSigs(), // initialized below
514 m_AllWords(), // initialized below
516 m_pCompounds(new CCompoundCollection(this)),
517 m_pLinkers(new CLinkerCollection(this)),
518 m_CompoundUpdates(),
520 m_pInFilter(),
521 m_pOutFilter(),
523 m_NumberOfCharacterTypes(26),
524 m_tokenCount(0),
525 m_pDLHistory(new CDLHistory(status_display())),
526 m_DescriptionLength(),
528 m_pSEDWords(new CWordCollection()),
530 DCNsylTrainCorpus(NULL),
531 DCNsylTestCorpus(NULL),
532 isDCNtrainRead(false),
533 isDCNtestRead(false),
535 m_HMM(NULL)
537 // Set parameter defaults for all lexicon functions
538 m_ParamDefaults["Neighbors\\DifferenceThreshold"] = "2";
539 m_ParamDefaults["Main\\MinimumMorphemeLength"] = "2";
540 m_ParamDefaults["Main\\MinimumStemLength"] = "3";
541 m_ParamDefaults["Main\\MaximumPrefixLength"] = "5";
542 m_ParamDefaults["Main\\MinimumPrefixLength"] = "2";
543 m_ParamDefaults["Main\\MaximumSuffixLength"] = "5";
544 m_ParamDefaults["Main\\MinimumSuffixLength"] = "2";
545 m_ParamDefaults["Main\\MinimumSignatureLength"] = "1";
546 m_ParamDefaults["Main\\MaxSuccessorFreqScoreByNeighbor"] = "1";
547 m_ParamDefaults["Main\\MinimumNumberOfStemsInSignature"] = "1";
548 m_ParamDefaults["Main\\MinimumSuccessorFrequency"] = "6";
549 m_ParamDefaults["Main\\VerboseDisplayFlag"] = "1";
550 m_ParamDefaults["Main\\MaximumNumberOfMiniLexica"] = "1";
551 m_ParamDefaults["CheckSignatures\\LoopLimit"] = "1";
552 m_ParamDefaults["CheckSignatures\\StemCountThreshold"] = "2";
553 m_ParamDefaults["PredecessorFrequency\\MaxNeighborPredecessorCount"] = "1";
554 m_ParamDefaults["PredecessorFrequency\\MinimumNumberOfAppearancesOfPrefix"] = "3";
555 m_ParamDefaults["PredecessorFrequency\\MinimumNumberOfStemsInSignature"] = "2";
556 m_ParamDefaults["PredecessorFrequency\\MinimumLengthOfSignature"] = "2";
557 m_ParamDefaults["PredecessorFrequency\\LengthOfAStrongSignature"] = "4";
558 m_ParamDefaults["PredecessorFrequency\\LargeNumberOfStems"] = "25";
559 m_ParamDefaults["PredecessorFrequency\\MinimumNumberOfPrefixes"] = "2";
560 m_ParamDefaults["PredecessorFrequency\\MaximumPrefixLength"] = "5";
561 m_ParamDefaults["SuccessorFreq1\\MaxNeighborSuccessorCount"] = "1";
562 m_ParamDefaults["SuccessorFreq1\\MinimumNumberOfAppearancesOfSuffix"] = "3";
563 m_ParamDefaults["SuccessorFreq1\\MinimumNumberOfStemsInSignature"] = "2";
564 m_ParamDefaults["SuccessorFreq1\\MinimumLengthOfSignature"] = "2";
565 m_ParamDefaults["SuccessorFreq1\\LengthOfAStrongSignature"] = "4";
566 m_ParamDefaults["SuccessorFreq1\\LargeNumberOfStems"] = "25";
567 m_ParamDefaults["SuccessorFreq1\\MaxSuccessorFreqScoreByNeighbor"] = "1";
568 m_ParamDefaults["TakeSignaturesFindStems\\SizeThreshold"] = "2";
569 m_ParamDefaults["TakeSignaturesFindStems\\StemCountThreshold"] = "2";
570 m_ParamDefaults["TakeSignaturesFindStems\\SignatureRobustnessThreshold"] = "10";
571 m_ParamDefaults["FromStemsFindSuffixes\\RobustnessThreshold"] = "10";
572 m_ParamDefaults["FromStemsFindSuffixes\\MinimumNumberOfOccurrences"] = "3";
573 m_ParamDefaults["Compounds\\MaximumLinkerLength"] = "0";
574 m_ParamDefaults["SignatureDL\\CorpusBasedAffixCount"] = "0";
575 m_ParamDefaults["SignatureDL\\CorpusBasedStemCount"] = "1";
576 m_ParamDefaults["HMM\\NumberOfStates"] = "2";
577 m_ParamDefaults["HMM\\NumberOfIterations"] = "25";
578 m_ParamDefaults["EarleyParser\\MaximumParseDepth"] = "6";
579 m_ParamDefaults["Boltzmann\\NumberOfSamples"] = "100";
580 m_ParamDefaults["Symbols\\Vowels"] = "a e i o u A E I O U";
582 m_CorpusMap.setAutoDelete(false); // m_pCorpusWords owns these
583 m_pMiniLexica->setAutoDelete(true);
584 // m_WordUpdates.setAutoDelete(false); // mini-lexica own these -- no autodelete in Qt4's QList
585 m_AllPrefixes.setAutoDelete(true);
586 m_AllPrefixSigs.setAutoDelete(true);
587 m_AllStems.setAutoDelete(true);
588 m_AllSuffixes.setAutoDelete(true);
589 m_AllSuffixSigs.setAutoDelete(true);
590 m_AllWords.setAutoDelete(true);
592 // Compare default parameters to user params
593 QMap<QString, QString> params;
594 CLPreferences& prefs = *m_pDoc->GetPreferences();
596 prefs.GetDictionaryPreference("Lxa_Parameters", &params);
597 for (QMap<QString, QString>::const_iterator iter = m_ParamDefaults.begin();
598 iter != m_ParamDefaults.end(); ++iter) {
599 const QString key = iter.key();
601 if (!params.contains(key))
602 params.insert(key, iter.value());
604 prefs.SetDictionaryPreference("Lxa_Parameters", params);
607 CLexicon::~CLexicon()
609 // The mini-lexicon destructor requires a valid lexicon to
610 // work with, and in particular the preceding mini-lexica
611 // must still be valid. It would be nice to just use the
612 // Q3PtrVector destructor, but it deletes its items in the
613 // wrong order (first-to-last instead of last-to-first).
614 for (int i = m_pMiniLexica->size() - 1; i >= 0; --i)
615 m_pMiniLexica->remove(static_cast<unsigned int>(i));
617 delete m_pMiniLexica;
618 delete m_pCorpusWords;
619 delete m_pCompounds;
620 delete m_pLinkers;
621 delete m_pInFilter;
622 delete m_pOutFilter;
623 delete m_pDLHistory;
624 delete m_pSEDWords;
625 delete m_HMM;
626 delete DCNsylTrainCorpus;
627 delete DCNsylTestCorpus;
631 void CLexicon::AddToScreen( QString text )
633 m_pDoc->m_commandLine->setText( text );
637 void CLexicon::ClearScreen()
639 m_pDoc->m_commandLine->setText ("");
643 void CLexicon::FindPrefixes( bool AutoLayer )
645 CMiniLexicon* mini;
646 int index = 0;
648 //unused variable: const int LayerCount = GetIntParameter( "Main\\MaximumNumberOfMiniLexica", 1 );
650 if( m_ActiveMini >= 0 )
652 (*m_pMiniLexica)[m_ActiveMini]->LogFile(QString("MINI-LEXICON %1").arg(m_ActiveMini+1));
654 if( !(*m_pMiniLexica)[m_ActiveMini]->FindPrefixes() )
656 // The active mini may already have a suffix analysis
657 if( (*m_pMiniLexica)[m_ActiveMini]->GetStems()->GetCount() )
659 index = NewMiniLexicon();
660 mini = GetMiniLexicon( index );
661 if( mini && mini->SetAffixLocation( STEM_INITIAL ) )
662 mini->AddToWordCollection( GetMiniLexicon( m_ActiveMini )->GetWords() );
664 else
666 mini = (*m_pMiniLexica)[m_ActiveMini];
667 mini->SetAffixLocation( STEM_INITIAL );
670 mini->FindPrefixes();
672 SetActiveMiniIndex( index );
675 if( AutoLayer )
677 while (1)
679 index = NewMiniLexicon();
680 mini = GetMiniLexicon( index );
681 mini->AddToWordCollection( GetMiniLexicon( index-1 )->GetWords() );
682 mini->AddToWordCollection( GetMiniLexicon( index-1 )->GetStems() );
684 /* if( LogFileOn() )
686 *GetLogFileStream() << endl << endl << "================================================================" << endl
687 << QString("MINI-LEXICON %1").arg(index+1) << endl
688 << "================================================================" << endl << endl;
691 mini->FindPrefixes();
693 if( mini && mini->GetSignatures()->GetSize() < 1 )
695 DeleteMiniLexicon( index );
696 break;
704 void CLexicon::FindSuffixes( bool AutoLayer )
706 if (m_ActiveMini < 0)
707 // No mini-lexicon selected
708 return;
710 // XXX. Log which mini-lexicon this is
712 CMiniLexicon& active_mini = *GetMiniLexicon(m_ActiveMini);
714 if (active_mini.FindSuffixes() == 0) {
715 int index;
716 // The active mini may already have a prefix analysis
717 if (active_mini.GetStems()->GetCount() != 0) {
718 index = NewMiniLexicon();
719 CMiniLexicon* pMini = GetMiniLexicon(index);
720 pMini->AddToWordCollection(active_mini.GetWords());
721 } else {
722 index = m_ActiveMini;
723 active_mini.SetAffixLocation( STEM_FINAL );
726 CMiniLexicon& mini = *GetMiniLexicon(index);
727 mini.FindSuffixes();
728 mini.CalculateDescriptionLength();
729 SetActiveMiniIndex(index);
732 // avoid infinite loop
733 // XXX. Add a better termination condition.
734 const unsigned int layer_max = 100;
736 if (AutoLayer)
737 for (unsigned int layer = layer_max; layer != 0; --layer) {
738 int index = NewMiniLexicon();
739 CMiniLexicon& mini = *GetMiniLexicon(index);
740 CMiniLexicon& prev = *GetMiniLexicon(index - 1);
742 mini.AddToWordCollection(prev.GetWords());
743 mini.AddToWordCollection(prev.GetStems());
745 // XXX. Log which mini-lexicon this is.
747 mini.FindSuffixes();
749 if (mini.GetSignatures()->GetSize() == 0) {
750 DeleteMiniLexicon(index);
751 break;
756 int CLexicon::Tokenize( QStringList& lines, StringToInt& tokens )
758 QString token;
759 int token_count = 0;
761 foreach (QString line_text, lines) {
762 QTextStream line(&line_text, QIODevice::ReadOnly);
763 while (!line.atEnd()) {
764 line >> token;
765 // If the tokenization becomes more sophisticated than 'break at space', this is where it should be defined
766 tokens[token]++;
767 token_count++;
770 return token_count;
773 int CLexicon::ReadDX1File(QString FileName, int NumberOfWords )
775 QString line;
776 CParse prsLine;
777 //unused variable:
778 // int token_count = 0;
779 CSS Spelling;
780 //unused variable:
781 // int corpuscount = 0;
782 CStem* pWord;
783 CWordCollection* Words;
784 CCorpusWord* pCorpusWord;
786 linguistica::ui::status_user_agent& status = status_display();
787 status.major_operation = "Reading dictionary word list.";
788 status.progress.clear();
790 // reset lexicon!
791 ClearAll();
793 // Add the base mini lexicon
794 CMiniLexicon* mini = new CMiniLexicon( this, 0 );
795 m_ActiveMini = 0;
796 m_pMiniLexica->insert( 0, mini );
797 m_pMiniLexica->setAutoDelete( true );
798 Words = mini->GetWords();
800 Q_ASSERT( !FileName.isEmpty() );
801 QFile File( FileName );
802 if (!File.open(QIODevice::ReadOnly)) {
803 status.major_operation.clear();
804 return 1;
807 Q3TextStream stream(&File);
809 // Unicode or ASCII encoding? Depends on corpus file.
810 stream.setEncoding ( Q3TextStream::Locale );
812 if (NumberOfWords < 0)
813 NumberOfWords = 1000000;
814 status.progress.set_denominator(NumberOfWords);
816 int wordcount;
817 for (wordcount = 1; wordcount < NumberOfWords + 1; ++wordcount) {
818 status.progress = wordcount;
819 if (stream.atEnd())
820 break;
821 QString line = stream.readLine().simplifyWhiteSpace();
823 // Add line to corpus
824 m_Corpus.append( line );
826 prsLine.Collapse(CSS (line) );
828 // Add to mini word list
829 pWord = *Words << prsLine[1].Display();
830 //pWord->SetCorpusCount( prsLine[2] );
831 pWord->SetCorpusCount ( 1 );
833 // Add corpus word
834 pCorpusWord = *m_pCorpusWords << pWord->GetKey();
835 m_CorpusMap.insert( prsLine[1].Display(), pCorpusWord );
836 m_pCorpusWords->IncrementCorpusCount( pWord->GetCorpusCount() - 1 ); // already incremented one in operator<<
837 pCorpusWord->IncrementCorpusCount( pWord->GetCorpusCount() - 1 );
839 // Remove the two non phonology pieces and set
840 // the phonology tier
841 prsLine.RemovePiece(1);
842 prsLine.RemovePiece(1);
843 pWord->SetPhonology_Tier1 ( &prsLine );
845 status.progress.clear();
846 status.major_operation.clear();
847 return wordcount;
850 int CLexicon::ReadCorpus(QString FileName, int NumberOfWords)
852 Q_ASSERT(!FileName.isEmpty());
854 QTime t;
855 t.start();
857 if (FileName.right(4) == ".dx1")
858 return ReadDX1File(FileName, NumberOfWords);
860 linguistica::ui::status_user_agent& status = status_display();
861 status.major_operation = "Reading corpus";
862 status.progress.clear();
864 m_tokenCount = 0;
866 const int MinStemLength = GetIntParameter( "Main\\MinimumStemLength", 3 );
867 // XXX. use Lower_Case preference?
869 // Remove everything from Lexicon
870 m_pCorpusWords->Empty();
871 ClearAll();
873 // Add the base mini lexicon
874 std::auto_ptr<CMiniLexicon> new_mini(new CMiniLexicon(this, 0));
875 CMiniLexicon* mini = new_mini.get();
876 m_ActiveMini = 0;
877 Q_ASSERT(m_pMiniLexica->autoDelete());
878 m_pMiniLexica->insert(0, new_mini.release());
879 CWordCollection* Words = mini->GetWords();
881 // Set up filters
882 m_pInFilter = 0;
883 m_pOutFilter = 0;
884 QStringList items;
885 m_pDoc->GetPreferences()->GetStringListPreference(
886 "Character_Combinations", &items);
887 SetFilters(&items);
889 // Read file.
891 Q_ASSERT(!FileName.isEmpty());
892 QFile File(FileName);
893 if (!File.open(QIODevice::ReadOnly)) {
894 status.major_operation.clear();
895 return -1;
898 QTextStream stream(&File);
899 stream.setCodec(QTextCodec::codecForName("UTF-8"));
900 Q_ASSERT(stream.autoDetectUnicode());
902 status.major_operation = "Reading word tokens from corpus...";
903 status.progress.clear();
904 status.progress.set_denominator(NumberOfWords);
905 while (!stream.atEnd() && m_tokenCount <= NumberOfWords) {
906 status.progress = m_tokenCount;
907 QString line = stream.readLine().trimmed();
909 // Deal with line-final dashes
910 if (line.endsWith("-")) {
911 QString action = m_pDoc->GetPreferences()
912 ->GetPreference("Line_Final_Dash");
913 if (action.isEmpty())
914 action = "Join_With_Dash";
916 if (action == "Do_Not_Join")
917 // Do nothing:
918 // dashes should be handled by scrubbing.
920 else if (!stream.atEnd()) {
921 QString next;
922 stream >> next;
924 if (action == "Join_Without_Dash")
925 line.truncate(line.length() - 1);
927 line += next;
931 m_tokenCount += line.count(' ') + 1;
932 m_Corpus.append(line);
934 status.progress.clear();
935 status.major_operation.clear();
938 // Tokenize
939 QMap<QString, int> types;
940 Tokenize(m_Corpus, types);
942 // Prepare scrubbing
943 QStringList scrubRules;
944 m_pDoc->GetPreferences()->GetStringListPreference(
945 "Scrub_Replacements", &scrubRules);
947 QMap<QString, QString> replacements;
948 QStringList regExps;
949 QRegExp preceding, internal, following;
950 const QString arrow_str = " --> ";
951 for (QStringList::const_iterator iter = scrubRules.begin();
952 iter != scrubRules.end(); ++iter) {
953 const QString rule = *iter;
954 int arrow = rule.lastIndexOf(arrow_str);
956 if (arrow != -1) {
957 Q_ASSERT(arrow >= 0);
958 Q_ASSERT(rule.lastIndexOf(arrow_str, arrow - 1) == -1);
960 const QString lhs = rule.left(arrow);
961 const QString rhs = rule.mid(arrow + arrow_str.size());
962 regExps.append(lhs);
963 replacements.replace(lhs, rhs);
964 continue;
967 regExps.append(rule);
968 if (rule == SCR_REMOVE_PRECEDING_PUNCT) {
969 ++iter;
970 preceding = QRegExp(QString("^[%1]+")
971 .arg(escapes(*iter)));
972 } else if (rule == SCR_REMOVE_INTERNAL_PUNCT) {
973 ++iter;
974 internal = QRegExp(QString("(\\S)[%1]+(\\S)")
975 .arg(escapes(*iter)));
976 } else if (rule == SCR_REMOVE_FOLLOWING_PUNCT) {
977 ++iter;
978 following = QRegExp(QString("[%1]+$")
979 .arg(escapes(*iter)));
983 status.major_operation = "Processing word types...";
984 status.progress.clear();
985 status.progress.set_denominator(types.size());
987 // Process types individually
988 int words = 0;
989 int maxNumberOfRoots = 2;
990 QMap<QChar,int> allChars;
991 QList<CStem*> dashCompounds, dashComponents;
992 for (QMap<QString, int>::const_iterator iter = types.begin();
993 iter != types.end(); ++iter, ++words) {
994 QString word = iter.key();
995 if (word.isEmpty())
996 continue;
998 status.progress = words;
1000 // Scrub word
1001 const QRegExp numeral("[0-9]");
1002 foreach (QString regexp, regExps) {
1003 if (regexp == SCR_MAKE_LOWER_CASE)
1004 word = word.lower();
1005 else if (regexp == SCR_REMOVE_NUMBERS)
1006 word.replace(numeral, "");
1007 else if (regexp == SCR_REMOVE_PRECEDING_PUNCT)
1008 word.replace(preceding, "");
1009 else if (regexp == SCR_REMOVE_INTERNAL_PUNCT)
1010 word.replace(internal, "\\1\\2");
1011 else if (regexp == SCR_REMOVE_FOLLOWING_PUNCT)
1012 word.replace(following, "");
1013 else
1014 word.replace(QRegExp(regexp),
1015 replacements[regexp]);
1018 // Combine n-graph combos into single character
1019 word = Filter(m_pInFilter, word);
1021 if (word.isEmpty())
1022 continue;
1024 // Process words with internal dashes according to user preferences
1025 int dashPos = word.indexOf('-');
1026 const bool hasDash = (dashPos != -1);
1028 const QString wordInternalDash = m_pDoc->GetPreferences()
1029 ->GetPreference("Word_Internal_Dash");
1030 const QString wordWithDash = m_pDoc->GetPreferences()
1031 ->GetPreference("Word_With_Dash");
1033 bool pieceTooShort = false;
1034 if (hasDash) {
1035 QStringList pieces = QStringList::split('-', word, true);
1036 foreach (QString piece, pieces) {
1037 if (piece.size() < MinStemLength) {
1038 pieceTooShort = true;
1039 break;
1044 const bool split_at_dash =
1045 (wordWithDash == "Include_Substrings_Only" ||
1046 wordWithDash == "Include_Full_Word_And_Substrings");
1047 if (hasDash && !pieceTooShort && split_at_dash) {
1048 int dashCount = word.count('-');
1049 if (dashCount > maxNumberOfRoots)
1050 maxNumberOfRoots = dashCount + 1;
1052 QString components = word;
1054 for (; !components.isEmpty();
1055 dashPos = components.indexOf('-')) {
1056 QString component;
1058 if (dashPos != -1) {
1059 component = components.left(dashPos);
1060 components = components.mid(dashPos + 1);
1061 } else {
1062 component = components;
1063 components = "";
1066 if (!component.isEmpty()) {
1067 dashPos = components.indexOf('-');
1068 continue;
1071 foreach (QChar ch, component)
1072 ++allChars[ch];
1074 CStem stem(component, mini);
1075 CStem* pWord = *Words << stem;
1076 // operator<< already incremented the corpus
1077 // counts by 1, but that wasn’t right --- so
1078 // we adjust.
1079 Words->IncrementCorpusCount(iter.value() - 1);
1080 pWord->IncrementCorpusCount(iter.value() - 1);
1082 pWord->SetWordType(FindType(component));
1083 pWord->IncrementCompoundCount();
1084 dashComponents.append(pWord);
1085 *m_pSEDWords << stem;
1089 if (!hasDash || wordWithDash != "Include_Substrings_Only") {
1090 if (hasDash && wordInternalDash == "Remove")
1091 word = word.replace('-', "");
1093 foreach (QChar ch, word)
1094 ++allChars[ch];
1096 CStem stem(word, mini);
1097 CStem* pWord = *Words << stem;
1098 // operator<< already incremented the corpus
1099 // counts by 1, but that wasn’t right --- so
1100 // we adjust.
1101 Words->IncrementCorpusCount(iter.value() - 1);
1102 pWord->IncrementCorpusCount(iter.value() - 1);
1104 pWord->SetWordType(FindType(word));
1105 if (hasDash && wordInternalDash != "Remove" &&
1106 !pieceTooShort)
1107 dashCompounds.append(pWord);
1108 *m_pSEDWords << stem;
1110 CCorpusWord* pCorpusWord =
1111 *m_pCorpusWords << pWord->GetKey();
1112 // operator<< already incremented the corpus
1113 // counts by 1, but that wasn’t right.
1114 m_pCorpusWords->IncrementCorpusCount(iter.value() - 1);
1115 pCorpusWord->IncrementCorpusCount(iter.value() - 1);
1117 pCorpusWord->SetMorpheme(1, pWord);
1118 m_CorpusMap.insert(iter.key(), pCorpusWord);
1121 status.progress.clear();
1123 // Connect compounds to word components and create parses
1124 if (!dashCompounds.isEmpty() && !dashComponents.isEmpty()) {
1125 FromStemsFindFlatCompounds(&dashCompounds,
1126 &dashComponents, "-", maxNumberOfRoots);
1127 // m_pCompounds->FindMostProbableParse();
1130 m_NumberOfCharacterTypes = allChars.count();
1132 Words->GetTrie()->Alphabetize();
1133 // phonology: move this eventually to the menu -- JG
1134 mini->GetWords()->DoPhonology();
1136 status.major_operation.clear();
1138 // XXX. necessary?
1139 status.progress.clear();
1140 status.details.clear();
1142 Words->SetSortStyle(KEY);
1144 // Add description length to history
1145 const QString mini_name("Mini-Lexicon 1");
1146 const QString remark("Before analysis; words only");
1147 m_pDLHistory->append(mini_name, remark, mini);
1149 std::cout << "ReadCorpus:: Time elapsed: " <<
1150 t.elapsed() << "ms." << std::endl;
1151 return 0;
1154 int CLexicon::RereadCorpus(QString FileName, int NumberOfWords)
1156 Q_ASSERT(!FileName.isEmpty());
1157 ClearAll();
1158 return ReadCorpus(FileName, NumberOfWords);
1163 void CLexicon::ClearAll()
1165 CMiniLexicon* mini;
1167 // Clear all MiniLexica
1168 for( int i = m_pMiniLexica->size()-1; i >= 0; i-- )
1170 mini = m_pMiniLexica->take(i);
1171 if( mini )
1173 mini->ClearAll();
1174 delete mini;
1178 m_AllPrefixes .clear();
1179 m_AllPrefixSigs .clear();
1180 m_AllStems .clear();
1181 m_AllSuffixes .clear();
1182 m_AllSuffixSigs .clear();
1183 m_AllWords .clear();
1184 m_pCompounds ->Empty(); delete m_pCompounds;
1185 m_pCompounds = new CCompoundCollection( this );
1186 m_pLinkers ->Empty(); delete m_pLinkers;
1187 m_pLinkers = new CLinkerCollection( this );
1188 m_Corpus .clear();
1189 m_CorpusMap .clear();
1190 m_pCorpusWords ->Empty();
1192 delete m_pDLHistory;
1193 m_pDLHistory = new CDLHistory(status_display());
1195 delete m_HMM;
1196 m_HMM = 0;
1200 CCorpusWord* CLexicon::FindAWord( CStem* pStem, CSuffix* pSuffix )
1202 QString Word = pStem->Display();
1204 if ( !pSuffix->GetKey().IsNULL() )
1206 Word += pSuffix->Display();
1209 return *m_pCorpusWords ^= Word;
1213 int CLexicon::ReadProjectFile( QString FileName )
1215 Q_ASSERT( !FileName.isEmpty() );
1216 QFile file( FileName );
1218 QString buffer;
1220 int usedTokens,
1221 types,
1222 minis;
1224 if( file.exists() && file.open( QIODevice::ReadOnly ) )
1226 Q3TextStream inf(&file);
1227 inf.setEncoding ( Q3TextStream::Locale );
1229 buffer = inf.readLine();
1230 Q_ASSERT( buffer[0] == '#' );
1232 buffer = inf.readLine();
1233 Q_ASSERT( buffer[0] == '#' );
1235 inf >> m_tokenCount;
1237 buffer = inf.readLine(); // end of read tokens line
1238 Q_ASSERT( buffer.length() == 0 );
1240 buffer = inf.readLine();
1241 Q_ASSERT( buffer[0] == '#' );
1243 inf >> usedTokens;
1245 buffer = inf.readLine(); // end of used tokens line
1246 Q_ASSERT( buffer.length() == 0 );
1248 buffer = inf.readLine();
1249 Q_ASSERT( buffer[0] == '#' );
1251 inf >> types;
1253 buffer = inf.readLine(); // end of types line
1254 Q_ASSERT( buffer.length() == 0 );
1256 buffer = inf.readLine();
1257 Q_ASSERT( buffer[0] == '#' );
1259 inf >> m_NumberOfCharacterTypes;
1261 buffer = inf.readLine(); // end of characters line
1262 Q_ASSERT( buffer.length() == 0 );
1264 buffer = inf.readLine();
1265 Q_ASSERT( buffer[0] == '#' );
1267 inf >> minis;
1269 buffer = inf.readLine(); // end of minis line
1270 Q_ASSERT( buffer.length() == 0 );
1272 file.close();
1275 return minis;
1279 void CLexicon::OutputStats( QString FileName )
1281 Q_ASSERT( !FileName.isEmpty() );
1282 QFile file( FileName );
1284 int i;
1286 int slash = FileName.findRev( "\\" );
1287 if( slash < 0 ) slash = FileName.findRev( "/" );
1288 int dot = FileName.findRev( ".prj" );
1290 if( file.open( QIODevice::WriteOnly ) )
1292 Q3TextStream outf( &file );
1293 outf.setEncoding ( Q3TextStream::Unicode );
1295 outf <<
1296 "# LEXICON (\'" + FileName.mid( slash+1, dot-3-slash ) + "\')" << endl <<
1297 "# Number of word tokens read: \n\t" << GetTokenCount() << endl <<
1298 "# Number of word tokens used: \n\t" << m_pCorpusWords->GetCorpusCount() << endl <<
1299 "# Number of word types: \n\t" << m_pCorpusWords->GetCount() << endl <<
1300 "# Number of character types: \n\t" << m_NumberOfCharacterTypes << endl <<
1301 "# Number of mini-lexica: \n\t" << GetMiniCount() << endl << endl;
1303 for( i = 0; i < GetMiniSize(); i++ )
1305 CMiniLexicon* mini = GetMiniLexicon(i);
1306 if( !mini ) continue;
1308 outf << "# ------------------------" << endl <<
1309 QString( "# MINI-LEXICON %1" ).arg( i+1 ) << endl <<
1310 "# Number of words: \n\t" << mini->GetWords()->GetCount() << endl <<
1311 "# Number of stems: \n\t" << mini->GetStems()->GetCount() << endl;
1313 if( mini->GetSuffixes() )
1315 outf << "# Number of regular suffixes: \n\t" << mini->GetSuffixes()->GetCount() << endl <<
1316 "# Number of signatures with regular suffixes: \n\t" << mini->GetSignatures()->GetCount() << endl << endl;
1318 if( mini->GetPrefixes() )
1320 outf << "# Number of regular prefixes: \n\t " << mini->GetPrefixes()->GetCount() << endl <<
1321 "# Number of signatures with regular prefixes: \n\t" << mini->GetSignatures()->GetCount() << endl << endl;
1325 file.close();
1330 void CLexicon::SetFilters( QStringList* items )
1332 // Note: this function is private because we don't want to change
1333 // the filters after words have already been read into the lexicon
1334 // if we do, then they may filter out to display and log incorrectly
1335 if( m_pInFilter ) delete m_pInFilter;
1336 if( m_pOutFilter ) delete m_pOutFilter;
1338 m_pInFilter = new StringToString;
1339 m_pOutFilter = new StringToString;
1341 int pound, i = 0;
1342 QString item;
1343 for ( QStringList::Iterator it = items->begin(); it != items->end(); ++it )
1345 // Remove comments
1346 pound = (*it).find('#');
1347 if( pound >= 0 ) item = (*it).left( pound ).stripWhiteSpace();
1348 else item = *it;
1350 // Do nothing if the remaining string has spaces
1351 // or nothing
1352 if( item.length() == 0 || item.find(' ') >= 0 ) break;
1354 // Insert into both filters
1355 QString character = QChar( FILTER_BASE + i );
1356 m_pInFilter->insert( item, character );
1357 m_pOutFilter->insert( character, item );
1358 i++;
1362 CMiniLexicon* CLexicon::GetMiniLexicon(int i)
1364 if (i == -1)
1365 return 0;
1367 Q_ASSERT(i >= 0 && i < GetMiniSize());
1368 return (*m_pMiniLexica)[i];
1371 void CLexicon::SetActiveMiniIndex( int i )
1373 int j;
1375 if( i < 0 || GetMiniLexicon(i) ) m_ActiveMini = i;
1376 else
1378 for( j = i-1; j >= 0; j-- )
1380 if( GetMiniLexicon(j) )
1382 m_ActiveMini = j;
1383 return;
1386 for( j = i+1; j < GetMiniSize(); j++ )
1388 if( GetMiniLexicon(j) )
1390 m_ActiveMini = j;
1391 return;
1397 linguistica::ui::status_user_agent& CLexicon::status_display()
1398 { return m_pDoc->status_display(); }
1400 int CLexicon::GetIntParameter(QString param, int iDefault)
1402 QMap<QString, QString> m;
1403 m_pDoc->GetPreferences()->GetDictionaryPreference(
1404 "Lxa_Parameters", &m);
1406 const QMap<QString, QString>::const_iterator iter =
1407 m.constFind(param);
1408 if (iter == m.constEnd())
1409 return iDefault;
1411 // see CLPreferences::GetIntPreference()
1412 bool ok;
1413 const int rv = iter.value().toInt(&ok);
1414 if (!ok)
1415 return iDefault;
1416 return rv;
1419 QString CLexicon::GetStringParameter(QString param)
1421 QMap<QString, QString> m;
1422 m_pDoc->GetPreferences()->GetDictionaryPreference(
1423 "Lxa_Parameters", &m);
1425 const QMap<QString, QString>::const_iterator iter =
1426 m.constFind(param);
1427 if (iter == m.constEnd())
1428 return QString();
1430 return iter.value();
1433 void CLexicon::MakeBrokenCorpus( QString outputFileName )
1435 Q_ASSERT( !outputFileName.isEmpty() );
1436 QFile outFile( outputFileName );
1438 CCorpusWord* pCorpusWord;
1440 if( outFile.open( QIODevice::WriteOnly ) )
1442 Q3TextStream out( &outFile );
1443 out.setEncoding( Q3TextStream::Unicode );
1445 for( QStringList::Iterator lineIt = m_Corpus.begin(); lineIt != m_Corpus.end(); ++lineIt )
1447 QStringList line = QStringList::split( " ", *lineIt );
1449 for( QStringList::Iterator wordIt = line.begin(); wordIt != line.end(); ++wordIt )
1451 pCorpusWord = m_CorpusMap.find( *wordIt );
1453 if( pCorpusWord ) out << pCorpusWord->Display( '+', m_pOutFilter ) << " ";
1454 else out << *wordIt;
1457 out << endl;
1465 int CLexicon::NewMiniLexicon()
1467 int pos;
1469 for( pos = m_pMiniLexica->size()-1; pos >= 0; pos-- )
1471 if( (*m_pMiniLexica)[pos] )
1473 break;
1476 pos++;
1478 CMiniLexicon* mini = new CMiniLexicon( this, pos );
1480 if (pos >= 0 && static_cast<unsigned int>(pos) >=
1481 m_pMiniLexica->size())
1482 m_pMiniLexica->resize(m_pMiniLexica->size()*2);
1484 m_pMiniLexica->insert( pos, mini );
1486 return pos;
1490 void CLexicon::DeleteMiniLexicon( int pos )
1492 int next, last;
1494 if( pos == 0 )
1496 QMessageBox::information( NULL, QString( "Linguistica" ), QString( "Sorry, you cannot delete the first mini-lexicon. If you want \nto clear all data, use \'Clear Lexicon\' in the \'Edit\' menu." ) );
1497 return;
1500 m_pMiniLexica->remove(pos);
1502 for( last = m_pMiniLexica->size()-1; last >= 0; last-- )
1504 if( (*m_pMiniLexica)[last] )
1506 break;
1510 if( last < (static_cast <int> (m_pMiniLexica->size()))/2 - 1 ) m_pMiniLexica->resize( m_pMiniLexica->size()/2 );
1512 if( pos-1 >= 0 && (*m_pMiniLexica)[pos-1] ) SetActiveMiniIndex( pos - 1 );
1513 else
1515 for( next = pos; next <= last; next++ )
1517 if( (*m_pMiniLexica)[next] )
1519 SetActiveMiniIndex( next );
1525 void CLexicon::ClearMiniLexicon( int pos )
1527 (*m_pMiniLexica)[pos]->ClearAll();
1530 int CLexicon::GetCorpusCount() { return m_pCorpusWords->GetCorpusCount(); }
1532 // All Stems
1533 ////////////////////////////////////////////////////////////////////
1535 QList<CStem*>* CLexicon::GetStemSet( const CStringSurrogate& stem )
1537 return m_AllStems.find( stem.Display() );
1541 bool CLexicon::InsertStem( CStem* stem )
1543 if( !stem ) return FALSE;
1545 // Get or create the set
1546 QList<CStem*>* set = m_AllStems.find( stem->Display() );
1547 if( !set )
1549 // Make sure the dictionary is large enough
1550 if( m_AllStems.count() >= m_AllStems.size() )
1552 m_AllStems.resize( getNextPrimeStep( m_AllStems.size() ) );
1555 set = new QList<CStem*>();
1556 m_AllStems.insert( stem->Display(), set );
1559 // Do not insert duplicates
1560 if( set->indexOf( stem ) >= 0 ) return FALSE;
1562 // Insert the stem
1563 set->prepend( stem );
1565 return TRUE;
1569 bool CLexicon::RemoveStem( CStem* stem )
1571 if( !stem ) return FALSE;
1573 // Get the set
1574 QList<CStem*>* set = m_AllStems.find( stem->Display() );
1575 if( !set ) return FALSE;
1577 // Remove the stem
1578 if( !set->remove( stem ) ) return FALSE;
1580 // Remove the set also if there are no more stems
1581 if( set->isEmpty() )
1583 m_AllStems.remove( stem->Display() );
1585 // Shrink the dictionary if it's too large
1586 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllStems.size() ) );
1587 if( static_cast <int> ( m_AllStems.count() ) < smaller_size )
1589 m_AllStems.resize( smaller_size );
1593 return TRUE;
1597 // All Suffixes
1598 ////////////////////////////////////////////////////////////////////
1600 QList<CSuffix*>* CLexicon::GetSuffixSet( const CStringSurrogate& suffix )
1602 return m_AllSuffixes.find( suffix.Display() );
1606 bool CLexicon::InsertSuffix( CSuffix* suffix )
1608 if( !suffix ) return FALSE;
1610 // Get or create the set
1611 QList<CSuffix*>* set = m_AllSuffixes.find( suffix->Display() );
1612 if( !set )
1614 // Make sure the dictionary is large enough
1615 if( m_AllSuffixes.count() >= m_AllSuffixes.size() )
1617 m_AllSuffixes.resize( getNextPrimeStep( m_AllSuffixes.size() ) );
1620 set = new QList<CSuffix*>();
1621 m_AllSuffixes.insert( suffix->Display(), set );
1624 // Do not insert duplicates
1625 if( set->indexOf( suffix ) >= 0 ) return FALSE;
1627 // Insert the suffix
1628 set->prepend( suffix );
1630 return TRUE;
1634 bool CLexicon::RemoveSuffix( CSuffix* suffix )
1636 if( !suffix ) return FALSE;
1638 // Get the set
1639 QList<CSuffix*>* set = m_AllSuffixes.find( suffix->Display() );
1640 if( !set ) return FALSE;
1642 // Remove the suffix
1643 if( !set->remove( suffix ) ) return FALSE;
1645 // Remove the set also if there are no more suffixes
1646 if( set->isEmpty() )
1648 m_AllSuffixes.remove( suffix->Display() );
1650 // Shrink the dictionary if it's too large
1651 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllSuffixes.size() ) );
1652 if( static_cast <int> ( m_AllSuffixes.count() ) < smaller_size )
1654 m_AllSuffixes.resize( smaller_size );
1658 return TRUE;
1662 // All Suffix Signatures
1663 ////////////////////////////////////////////////////////////////////
1665 QList<CSignature*>* CLexicon::GetSuffixSigSet( const CStringSurrogate& sig )
1667 return m_AllSuffixSigs.find( sig.Display() );
1671 bool CLexicon::InsertSuffixSig( CSignature* sig )
1673 if( !sig ) return FALSE;
1675 // Get or create the set
1676 QList<CSignature*>* set = m_AllSuffixSigs.find( sig->Display() );
1677 if( !set )
1679 // Make sure the dictionary is large enough
1680 if( m_AllSuffixSigs.count() >= m_AllSuffixSigs.size() )
1682 m_AllSuffixSigs.resize( getNextPrimeStep( m_AllSuffixSigs.size() ) );
1685 set = new QList<CSignature*>();
1686 m_AllSuffixSigs.insert( sig->Display(), set );
1689 // Do not insert duplicates
1690 if( set->indexOf( sig ) >= 0 ) return FALSE;
1692 // Insert the signature
1693 set->prepend( sig );
1695 return TRUE;
1699 bool CLexicon::RemoveSuffixSig( CSignature* sig )
1701 if( !sig ) return FALSE;
1703 // Get the set
1704 QList<CSignature*>* set = m_AllSuffixSigs.find( sig->Display() );
1705 if( !set ) return FALSE;
1707 // Remove the signature
1708 if( !set->remove( sig ) ) return FALSE;
1710 // Remove the set also if there are no more signatures
1711 if( set->isEmpty() )
1713 m_AllSuffixSigs.remove( sig->Display() );
1715 // Shrink the dictionary if it's too large
1716 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllSuffixSigs.size() ) );
1717 if( static_cast <int> (m_AllSuffixSigs.count()) < smaller_size )
1719 m_AllSuffixSigs.resize( smaller_size );
1723 return TRUE;
1727 // All Prefixes
1728 ////////////////////////////////////////////////////////////////////
1730 QList<CPrefix*>* CLexicon::GetPrefixSet( const CStringSurrogate& prefix )
1732 return m_AllPrefixes.find( prefix.Display() );
1736 bool CLexicon::InsertPrefix( CPrefix* prefix )
1738 if( !prefix ) return FALSE;
1740 // Get or create the set
1741 QList<CPrefix*>* set = m_AllPrefixes.find( prefix->Display() );
1742 if( !set )
1744 // Make sure the dictionary is large enough
1745 if( m_AllPrefixes.count() >= m_AllPrefixes.size() )
1747 m_AllPrefixes.resize( getNextPrimeStep( m_AllPrefixes.size() ) );
1750 set = new QList<CPrefix*>();
1751 m_AllPrefixes.insert( prefix->Display(), set );
1754 // Do not insert duplicates
1755 if( set->indexOf( prefix ) >= 0 ) return FALSE;
1757 // Insert the prefix
1758 set->prepend( prefix );
1760 return TRUE;
1764 bool CLexicon::RemovePrefix( CPrefix* prefix )
1766 if( !prefix ) return FALSE;
1768 // Get the set
1769 QList<CPrefix*>* set = m_AllPrefixes.find( prefix->Display() );
1770 if( !set ) return FALSE;
1772 // Remove the prefix
1773 if( !set->remove( prefix ) ) return FALSE;
1775 // Remove the set also if there are no more prefixes
1776 if( set->isEmpty() )
1778 m_AllPrefixes.remove( prefix->Display() );
1780 // Shrink the dictionary if it's too large
1781 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllPrefixes.size() ) );
1782 if(static_cast <int> ( m_AllPrefixes.count()) < smaller_size )
1784 m_AllPrefixes.resize( smaller_size );
1788 return TRUE;
1792 // All Prefix Signatures
1793 ////////////////////////////////////////////////////////////////////
1795 QList<CSignature*>* CLexicon::GetPrefixSigSet( const CStringSurrogate& sig )
1797 return m_AllPrefixSigs.find( sig.Display() );
1801 bool CLexicon::InsertPrefixSig( CSignature* sig )
1803 if( !sig ) return FALSE;
1805 // Get or create the set
1806 QList<CSignature*>* set = m_AllPrefixSigs.find( sig->Display() );
1807 if( !set )
1809 // Make sure the dictionary is large enough
1810 if( m_AllPrefixSigs.count() >= m_AllPrefixSigs.size() )
1812 m_AllPrefixSigs.resize( getNextPrimeStep( m_AllPrefixSigs.size() ) );
1815 set = new QList<CSignature*>();
1816 m_AllPrefixSigs.insert( sig->Display(), set );
1819 // Do not insert duplicates
1820 if( set->indexOf( sig ) >= 0 ) return FALSE;
1822 // Insert the signature
1823 set->prepend( sig );
1825 return TRUE;
1829 bool CLexicon::RemovePrefixSig( CSignature* sig )
1831 if( !sig ) return FALSE;
1833 // Get the set
1834 QList<CSignature*>* set = m_AllPrefixSigs.find( sig->Display() );
1835 if( !set ) return FALSE;
1837 // Remove the signature
1838 if( !set->remove( sig ) ) return FALSE;
1840 // Remove the set also if there are no more signatures
1841 if( set->isEmpty() )
1843 m_AllPrefixSigs.remove( sig->Display() );
1845 // Shrink the dictionary if it's too large
1846 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllPrefixSigs.size() ) );
1847 if( static_cast <int> (m_AllPrefixSigs.count()) < smaller_size )
1849 m_AllPrefixSigs.resize( smaller_size );
1853 return TRUE;
1857 // All Words
1858 ////////////////////////////////////////////////////////////////////
1860 QList<CStem*>* CLexicon::GetWordSet( const CStringSurrogate& word )
1862 return m_AllWords.find( word.Display() );
1866 bool CLexicon::InsertWord( CStem* word )
1868 if( !word ) return FALSE;
1870 // Get or create the set
1871 QList<CStem*>* set = m_AllWords.find( word->Display() );
1872 if( !set )
1874 // Make sure the dictionary is large enough
1875 if( m_AllWords.count() >= m_AllWords.size() )
1877 m_AllWords.resize( getNextPrimeStep( m_AllWords.size() ) );
1880 set = new QList<CStem*>();
1881 m_AllWords.insert( word->Display(), set );
1884 // Do not insert duplicates
1885 if( set->indexOf( word ) >= 0 ) return FALSE;
1887 // Insert the stem
1888 set->prepend( word );
1890 return TRUE;
1894 bool CLexicon::RemoveWord( CStem* word )
1896 if( !word ) return FALSE;
1898 // Get the set
1899 QList<CStem*>* set = m_AllWords.find( word->Display() );
1900 if( !set ) return FALSE;
1902 // Remove the stem
1903 if( !set->remove( word ) ) return FALSE;
1905 // Remove the set also if there are no more stems
1906 if( set->isEmpty() )
1908 m_AllWords.remove( word->Display() );
1910 // Shrink the dictionary if it's too large
1911 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllWords.size() ) );
1912 if( static_cast <int> (m_AllWords.count()) < smaller_size )
1914 m_AllWords.resize( smaller_size );
1918 return TRUE;
1922 bool CLexicon::LogFileOn()
1924 return m_pDoc->LogFileOn();
1928 QTextStream* CLexicon::GetLogFileStream()
1930 return m_pDoc->GetLogFileStream();
1933 StateEmitHMM* CLexicon::GetHMM()
1935 return m_HMM;
1938 StateEmitHMM* CLexicon::CreateNewHMM()
1940 if (m_HMM) { delete m_HMM;}
1941 m_HMM = new StateEmitHMM (this);
1942 return m_HMM;
1944 void CLexicon::UpdateCompound( QString compound )
1946 if( m_CompoundUpdates.find( compound ) == m_CompoundUpdates.end() )
1948 m_CompoundUpdates.append( compound );
1953 void CLexicon::UpdateWord( CStem* pWord )
1955 if( m_WordUpdates.indexOf( pWord ) < 0 )
1957 m_WordUpdates.append( pWord );
1962 void CLexicon::DoWordUpdates()
1964 CStem* pStem,
1965 * qStem,
1966 * rStem;
1968 CPrefix* pPrefix;
1969 CSuffix* pSuffix;
1970 CCompound* pCompound;
1972 CCorpusWord* pCorpusWord;
1973 int myMiniIndex, start, end;
1974 bool startValid, endValid;
1975 int* wPieces, * sPieces;
1976 QList<CStem*>* myWords;
1977 QList<CStem*> wordQueue;
1979 m_pCorpusWords->SetUpdateFlags( FALSE );
1981 while( !m_WordUpdates.isEmpty() )
1983 pStem = m_WordUpdates.takeAt(0);
1985 // Check if this word has already been updated
1986 pCorpusWord = *m_pCorpusWords ^= pStem->GetKey();
1987 if( !pCorpusWord || pCorpusWord->IsUpdated() ) continue;
1989 // If it's also a compound, override
1990 pCompound = *m_pCompounds ^= pStem->GetKey();
1991 if( pCompound )
1993 pCorpusWord->SetUpdated( TRUE );
1994 continue;
1997 wordQueue.append( pStem );
1998 while( !wordQueue.isEmpty() )
2000 qStem = wordQueue.takeAt(0);
2002 if( pStem->GetMyMini()->GetPrefixes() )
2004 pPrefix = *pStem->GetMyMini()->GetPrefixes() ^= pStem->GetPrefix();
2006 else pPrefix = NULL;
2008 if( pStem->GetMyMini()->GetSuffixes() )
2010 pSuffix = *pStem->GetMyMini()->GetSuffixes() ^= pStem->GetSuffix();
2012 else pSuffix = NULL;
2014 if( pStem->GetMyMini()->GetStems() )
2016 CSS cssStem = pStem->GetStem();
2017 rStem = *pStem->GetMyMini()->GetStems() ^= cssStem;
2019 else rStem = NULL;
2021 // Update this stem in the corpus if it exists
2022 // as a word
2023 pCorpusWord = *m_pCorpusWords ^= qStem->GetKey();
2024 if( pCorpusWord )
2026 if( pCorpusWord->IsUpdated() ) continue;
2027 pCorpusWord->SetUpdated();
2029 wPieces = pCorpusWord->GetPieces();
2030 sPieces = pStem->GetPieces();
2032 start = pCorpusWord->Display().find( pStem->Display() );
2033 end = start + pStem->GetKeyLength();
2035 startValid = FALSE;
2036 endValid = FALSE;
2037 for (int i = 0; i <= pCorpusWord->Size(); ++i) {
2038 if( start == wPieces[i] ) startValid = TRUE;
2039 if( end == wPieces[i] ) endValid = TRUE;
2042 Q_ASSERT( startValid && endValid );
2043 if( startValid && endValid )
2045 for (int i = 0, j = 0;
2046 i <= pCorpusWord->Size() &&
2047 j <= pStem->Size(); ++i) {
2048 if( wPieces[i] > start )
2050 j++;
2051 if( j > pStem->Size() ) break;
2053 if( wPieces[i] < sPieces[j] )
2055 pCorpusWord->MergePieces(i);
2056 break;
2058 else if( sPieces[j] < wPieces[i] )
2060 pCorpusWord->CutRightBeforeHere( sPieces[j] );
2061 if( pSuffix )
2063 pCorpusWord->SetMorpheme( i, rStem );
2064 pCorpusWord->SetMorpheme( i+1, pSuffix );
2066 else if( pPrefix )
2068 pCorpusWord->SetMorpheme( i, pPrefix );
2069 pCorpusWord->SetMorpheme( i+1, rStem );
2071 break;
2078 // Add "parent" "words"
2079 myMiniIndex = qStem->GetMyMini()->GetIndex();
2080 Q_ASSERT( myMiniIndex >= 0 );
2081 Q_ASSERT( myMiniIndex < static_cast <int> ( m_pMiniLexica->size()) );
2083 for (unsigned int i = 0; i < m_pMiniLexica->size(); ++i) {
2084 if (myMiniIndex >= 0 &&
2085 static_cast<unsigned int>(myMiniIndex) == i)
2086 continue;
2087 if( (*m_pMiniLexica)[i] == NULL ) continue;
2089 rStem = *(*m_pMiniLexica)[i]->GetStems() ^= qStem->GetKey();
2091 if( !rStem ) continue;
2093 myWords = rStem->GetWordPtrList();
2095 //for( rStem = myWords->first(); rStem; rStem = myWords->next() )
2096 for (int z = 0; z < myWords->size(); z++)
2097 { rStem = myWords->at (z);
2098 wordQueue.append( rStem );
2104 QString compound;
2106 // Update from compounds list
2107 for( QStringList::Iterator it = m_CompoundUpdates.begin(); it != m_CompoundUpdates.end(); ++it )
2109 compound = (*it);
2110 pCompound = *m_pCompounds ^= CSS( compound );
2111 pCorpusWord = *m_pCorpusWords ^= CSS( compound );
2114 if( !pCorpusWord )
2116 // This happens when the last component is a stem
2117 // found in the suffix analysis of one of the
2118 // mini-lexica or the first component is a stem
2119 // found in the prefix analysis of one of the
2120 // mini-lexica
2122 wordQueue.clear();
2124 // Add all stems containing this compound to the queue
2125 for (unsigned int i = 0; i < m_pMiniLexica->size(); ++i) {
2126 if( (*m_pMiniLexica)[i] == NULL ) continue;
2128 rStem = *(*m_pMiniLexica)[i]->GetStems() ^= compound;
2130 if( !rStem ) continue;
2132 myWords = rStem->GetWordPtrList();
2134 //for( rStem = myWords->first(); rStem; rStem = myWords->next() )
2135 for (int y = 0; y < myWords->size(); y++)
2136 { rStem = myWords->at(y);
2137 wordQueue.append( rStem );
2141 // Update all ancestor corpus words
2142 while( !wordQueue.isEmpty() )
2144 pStem = wordQueue.takeAt(0);
2146 // Update this stem in the corpus if it exists
2147 // as a word
2148 pCorpusWord = *m_pCorpusWords ^= pStem->GetKey();
2149 if( pCorpusWord )
2151 if( pCorpusWord->IsUpdated() ) continue;
2152 pCorpusWord->SetUpdated();
2154 start = pCorpusWord->Display().find( compound );
2155 end = start + compound.length();
2157 wPieces = pCorpusWord->GetPieces();
2159 startValid = FALSE;
2160 endValid = FALSE;
2161 for (int i = 0; i <= pCorpusWord->Size(); ++i) {
2162 if( start == wPieces[i] ) startValid = TRUE;
2163 if( end == wPieces[i] ) endValid = TRUE;
2166 // We need to match the cuts in the compound
2167 // to the substring of the corpus word or remove
2168 // cuts in the substring if pCompound doesn't
2169 // exist. Also, we need to add parents of each
2170 // word to the queue and do those too.
2171 if( startValid && endValid )
2173 if( pCompound )
2175 sPieces = pCompound->GetPieces();
2176 for (int i = 0, j = 0;
2177 i <= pCorpusWord->Size() && j <= pCompound->Size(); i++ )
2179 if( wPieces[i] > start )
2181 j++;
2182 if( j > pCompound->Size() ) break;
2184 if( wPieces[i] < sPieces[j] )
2186 pCorpusWord->MergePieces(i);
2188 else if( sPieces[j] < wPieces[i] )
2191 pCorpusWord->CutRightBeforeHere( sPieces[j] );
2196 // Now set the morphemes of the corpus word
2197 // equal to those of the compound
2198 for (int i = 1; i <= pCompound->Size(); ++i) {
2199 if( pCompound->GetComponent(i) ) pCorpusWord->SetMorpheme( i, pCompound->GetComponent(i)->at(0) );
2200 else if( pCompound->GetLinker(i) ) pCorpusWord->SetMorpheme( i, pCompound->GetLinker(i) );
2203 else
2205 int first = -1;
2206 for (int i = 0; i <= pCorpusWord->Size(); ++i) {
2207 if( wPieces[i] > start && wPieces[i] < end )
2209 pCorpusWord->MergePieces(i);
2210 //qDebug( pCorpusWord->Display('.') );
2211 if( first < 0 ) first = i;
2215 // Now set the morpheme of the corpus word
2216 // equal to the stem or word the old compound
2217 // came from
2218 StemSet* pStemSet = GetAllStems()->find( compound );
2219 if( !pStemSet ) pStemSet = GetAllWords()->find( compound );
2220 if( pStemSet ) pCorpusWord->SetMorpheme( first, pStemSet->at(0) );
2226 continue;
2229 if( !pCompound )
2231 // Compound deleted, remove all cuts
2232 pCorpusWord->SimplifyParseStructure();
2234 else
2236 // Make cuts match compound
2237 pCorpusWord->CopyParseStructure( pCompound );
2239 //if( pCompound ) qDebug( pCompound->Display( '.' ) );
2242 m_CompoundUpdates.clear();