Lexicon.cpp

   1 // Implementation of CLexicon’s core methods
   2 // Copyright © 2009 The University of Chicago
   3 #include "Lexicon.h"
   4
   5 #include "Config.h"
   6 #include <iostream>
   7 #include <memory>
   8 #include <Q3TextStream>
   9 #include <QMessageBox>
  10 #include <QTime>
  11 #include <QList>
  12 #include <QString>
  13 #include "linguisticamainwindow.h"
  14 #include "ui/Status.h"
  15 #include "MiniLexicon.h"
  16 #include "LPreferences.h"
  17 #include "DCNcorpussyl.h"
  18 #include "DLHistory.h"
  19 #include "StateEmitHMM.h"
  20 #include "CorpusWord.h"
  21 #include "Signature.h"
  22 #include "Linker.h"
  23 #include "Suffix.h"
  24 #include "Prefix.h"
  25 #include "Stem.h"
  26 #include "CorpusWordCollection.h"
  27 #include "SignatureCollection.h"
  28 #include "CompoundCollection.h"
  29 #include "LinkerCollection.h"
  30 #include "SuffixCollection.h"
  31 #include "PrefixCollection.h"
  32 #include "WordCollection.h"
  33 #include "StemCollection.h"
  34 #include "generaldefinitions.h"
  35 #include "ScrubRules.h"
  36 #include "Typedefs.h"
  37 #include "Slice.h"
  38
  39 namespace {
  40         QString escapes(QString start)
  41         {
  42                 QString end = start;
  43
  44                 end.replace( "[", "\\[" );
  45                 end.replace( "]", "\\]" );
  46                 end.replace( "(", "\\(" );
  47                 end.replace( ")", "\\)" );
  48                 end.replace( "!", "\\!" );
  49                 end.replace( "?", "\\?" );
  50                 end.replace( "^", "\\^" );
  51                 end.replace( "$", "\\$" );
  52
  53                 return end;
  54         }
  55
  56         enum CStem::type FindType(QString word)
  57         {
  58                 int HyphenCount = 0;
  59                 int PuncCount = 0;
  60                 int NumberPunc = 0;
  61                 int DigitCount = 0;
  62
  63                 int Length = word.length();
  64
  65                 for ( int i = 0; i < Length; i++)
  66                 {
  67                         if( word[i].category() == QChar::Punctuation_Dash )
  68                         {
  69                                 HyphenCount++;
  70                         }
  71
  72                         if( word[i].isPunct() )
  73                         {
  74                                 PuncCount++;
  75                         }
  76
  77                         if( word[i] == '.' ||
  78                                 word[i] == ',' ||
  79                                 word[i].category() == QChar::Symbol_Currency ||
  80                                 word[i].isNumber() )
  81                         {
  82                                 NumberPunc ++;
  83                         }
  84
  85                         if( word[i].isDigit() )
  86                         {
  87                                 DigitCount++;
  88                         }
  89                 }
  90
  91                 if( DigitCount == 0 && HyphenCount == 0 && PuncCount == 0 )
  92                 {
  93                         return CStem::NORMAL;
  94                 }
  95
  96                 if( Length > 2 && HyphenCount == 1 && PuncCount == 1 )
  97                 {
  98                         if (DigitCount > 0 && DigitCount == Length - 1)
  99                                 return CStem::NUMBER;
 100                         return CStem::BIWORD_COMPOUND;
 101                 }
 102
 103                 if( DigitCount > 0 && DigitCount + NumberPunc >= Length )
 104                 {
 105                         return CStem::NUMBER;
 106                 }
 107
 108                 if( Length > 3 && HyphenCount > 1 && PuncCount == HyphenCount )
 109                 {
 110                         return CStem::MULTIPLE_COMPOUND;
 111                 }
 112
 113                 return CStem::UNKNOWN;
 114         }
 115
 116         namespace primes {
 117                 const int NUM_PRIMES = 1229;
 118                 const int NUM_PRIME_STEPS = 38;
 119                 const int LARGE_NON_PRIME = 99460729;
 120                 const int LARGER_PRIME = 99460747;
 121                 const int LARGEST_PRIME_STEP = 10000019;
 122
 123                 int getNextPrime( int number )
 124                 {
 125                         int i, j;
 126                         int primes[NUM_PRIMES] = {      2, 3, 5, 7, 11, 13, 17, 19, 23, 29,
 127                                                                                 31, 37, 41, 43, 47, 53, 59, 61, 67, 71,
 128                                                                                 73, 79, 83, 89, 97, 101, 103, 107, 109, 113,
 129                                                                                 127, 131, 137, 139, 149, 151, 157, 163, 167, 173,
 130                                                                                 179, 181, 191, 193, 197, 199, 211, 223, 227, 229,
 131                                                                                 233, 239, 241, 251, 257, 263, 269, 271, 277, 281,
 132                                                                                 283, 293, 307, 311, 313, 317, 331, 337, 347, 349,
 133                                                                                 353, 359, 367, 373, 379, 383, 389, 397, 401, 409,
 134                                                                                 419, 421, 431, 433, 439, 443, 449, 457, 461, 463,
 135                                                                                 467, 479, 487, 491, 499, 503, 509, 521, 523, 541,
 136                                                                                 547, 557, 563, 569, 571, 577, 587, 593, 599, 601,
 137                                                                                 607, 613, 617, 619, 631, 641, 643, 647, 653, 659,
 138                                                                                 661, 673, 677, 683, 691, 701, 709, 719, 727, 733,
 139                                                                                 739, 743, 751, 757, 761, 769, 773, 787, 797, 809,
 140                                                                                 811, 821, 823, 827, 829, 839, 853, 857, 859, 863,
 141                                                                                 877, 881, 883, 887, 907, 911, 919, 929, 937, 941,
 142                                                                                 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013,
 143                                                                                 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069,
 144                                                                                 1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151,
 145                                                                                 1153, 1163, 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
 146                                                                                 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, 1289, 1291,
 147                                                                                 1297, 1301, 1303, 1307, 1319, 1321, 1327, 1361, 1367, 1373,
 148                                                                                 1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439, 1447, 1451,
 149                                                                                 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
 150                                                                                 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583,
 151                                                                                 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
 152                                                                                 1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733,
 153                                                                                 1741, 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811,
 154                                                                                 1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889,
 155                                                                                 1901, 1907, 1913, 1931, 1933, 1949, 1951, 1973, 1979, 1987,
 156                                                                                 1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053,
 157                                                                                 2063, 2069, 2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129,
 158                                                                                 2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203, 2207, 2213,
 159                                                                                 2221, 2237, 2239, 2243, 2251, 2267, 2269, 2273, 2281, 2287,
 160                                                                                 2293, 2297, 2309, 2311, 2333, 2339, 2341, 2347, 2351, 2357,
 161                                                                                 2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
 162                                                                                 2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503, 2521, 2531,
 163                                                                                 2539, 2543, 2549, 2551, 2557, 2579, 2591, 2593, 2609, 2617,
 164                                                                                 2621, 2633, 2647, 2657, 2659, 2663, 2671, 2677, 2683, 2687,
 165                                                                                 2689, 2693, 2699, 2707, 2711, 2713, 2719, 2729, 2731, 2741,
 166                                                                                 2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801, 2803, 2819,
 167                                                                                 2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887, 2897, 2903,
 168                                                                                 2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999,
 169                                                                                 3001, 3011, 3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079,
 170                                                                                 3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167, 3169, 3181,
 171                                                                                 3187, 3191, 3203, 3209, 3217, 3221, 3229, 3251, 3253, 3257,
 172                                                                                 3259, 3271, 3299, 3301, 3307, 3313, 3319, 3323, 3329, 3331,
 173                                                                                 3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
 174                                                                                 3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491, 3499, 3511,
 175                                                                                 3517, 3527, 3529, 3533, 3539, 3541, 3547, 3557, 3559, 3571,
 176                                                                                 3581, 3583, 3593, 3607, 3613, 3617, 3623, 3631, 3637, 3643,
 177                                                                                 3659, 3671, 3673, 3677, 3691, 3697, 3701, 3709, 3719, 3727,
 178                                                                                 3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797, 3803, 3821,
 179                                                                                 3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889, 3907,
 180                                                                                 3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989,
 181                                                                                 4001, 4003, 4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057,
 182                                                                                 4073, 4079, 4091, 4093, 4099, 4111, 4127, 4129, 4133, 4139,
 183                                                                                 4153, 4157, 4159, 4177, 4201, 4211, 4217, 4219, 4229, 4231,
 184                                                                                 4241, 4243, 4253, 4259, 4261, 4271, 4273, 4283, 4289, 4297,
 185                                                                                 4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
 186                                                                                 4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481, 4483, 4493,
 187                                                                                 4507, 4513, 4517, 4519, 4523, 4547, 4549, 4561, 4567, 4583,
 188                                                                                 4591, 4597, 4603, 4621, 4637, 4639, 4643, 4649, 4651, 4657,
 189                                                                                 4663, 4673, 4679, 4691, 4703, 4721, 4723, 4729, 4733, 4751,
 190                                                                                 4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813, 4817, 4831,
 191                                                                                 4861, 4871, 4877, 4889, 4903, 4909, 4919, 4931, 4933, 4937,
 192                                                                                 4943, 4951, 4957, 4967, 4969, 4973, 4987, 4993, 4999, 5003,
 193                                                                                 5009, 5011, 5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087,
 194                                                                                 5099, 5101, 5107, 5113, 5119, 5147, 5153, 5167, 5171, 5179,
 195                                                                                 5189, 5197, 5209, 5227, 5231, 5233, 5237, 5261, 5273, 5279,
 196                                                                                 5281, 5297, 5303, 5309, 5323, 5333, 5347, 5351, 5381, 5387,
 197                                                                                 5393, 5399, 5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443,
 198                                                                                 5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507, 5519, 5521,
 199                                                                                 5527, 5531, 5557, 5563, 5569, 5573, 5581, 5591, 5623, 5639,
 200                                                                                 5641, 5647, 5651, 5653, 5657, 5659, 5669, 5683, 5689, 5693,
 201                                                                                 5701, 5711, 5717, 5737, 5741, 5743, 5749, 5779, 5783, 5791,
 202                                                                                 5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849, 5851, 5857,
 203                                                                                 5861, 5867, 5869, 5879, 5881, 5897, 5903, 5923, 5927, 5939,
 204                                                                                 5953, 5981, 5987, 6007, 6011, 6029, 6037, 6043, 6047, 6053,
 205                                                                                 6067, 6073, 6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133,
 206                                                                                 6143, 6151, 6163, 6173, 6197, 6199, 6203, 6211, 6217, 6221,
 207                                                                                 6229, 6247, 6257, 6263, 6269, 6271, 6277, 6287, 6299, 6301,
 208                                                                                 6311, 6317, 6323, 6329, 6337, 6343, 6353, 6359, 6361, 6367,
 209                                                                                 6373, 6379, 6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473,
 210                                                                                 6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563, 6569, 6571,
 211                                                                                 6577, 6581, 6599, 6607, 6619, 6637, 6653, 6659, 6661, 6673,
 212                                                                                 6679, 6689, 6691, 6701, 6703, 6709, 6719, 6733, 6737, 6761,
 213                                                                                 6763, 6779, 6781, 6791, 6793, 6803, 6823, 6827, 6829, 6833,
 214                                                                                 6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907, 6911, 6917,
 215                                                                                 6947, 6949, 6959, 6961, 6967, 6971, 6977, 6983, 6991, 6997,
 216                                                                                 7001, 7013, 7019, 7027, 7039, 7043, 7057, 7069, 7079, 7103,
 217                                                                                 7109, 7121, 7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207,
 218                                                                                 7211, 7213, 7219, 7229, 7237, 7243, 7247, 7253, 7283, 7297,
 219                                                                                 7307, 7309, 7321, 7331, 7333, 7349, 7351, 7369, 7393, 7411,
 220                                                                                 7417, 7433, 7451, 7457, 7459, 7477, 7481, 7487, 7489, 7499,
 221                                                                                 7507, 7517, 7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561,
 222                                                                                 7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621, 7639, 7643,
 223                                                                                 7649, 7669, 7673, 7681, 7687, 7691, 7699, 7703, 7717, 7723,
 224                                                                                 7727, 7741, 7753, 7757, 7759, 7789, 7793, 7817, 7823, 7829,
 225                                                                                 7841, 7853, 7867, 7873, 7877, 7879, 7883, 7901, 7907, 7919,
 226                                                                                 7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009, 8011, 8017,
 227                                                                                 8039, 8053, 8059, 8069, 8081, 8087, 8089, 8093, 8101, 8111,
 228                                                                                 8117, 8123, 8147, 8161, 8167, 8171, 8179, 8191, 8209, 8219,
 229                                                                                 8221, 8231, 8233, 8237, 8243, 8263, 8269, 8273, 8287, 8291,
 230                                                                                 8293, 8297, 8311, 8317, 8329, 8353, 8363, 8369, 8377, 8387,
 231                                                                                 8389, 8419, 8423, 8429, 8431, 8443, 8447, 8461, 8467, 8501,
 232                                                                                 8513, 8521, 8527, 8537, 8539, 8543, 8563, 8573, 8581, 8597,
 233                                                                                 8599, 8609, 8623, 8627, 8629, 8641, 8647, 8663, 8669, 8677,
 234                                                                                 8681, 8689, 8693, 8699, 8707, 8713, 8719, 8731, 8737, 8741,
 235                                                                                 8747, 8753, 8761, 8779, 8783, 8803, 8807, 8819, 8821, 8831,
 236                                                                                 8837, 8839, 8849, 8861, 8863, 8867, 8887, 8893, 8923, 8929,
 237                                                                                 8933, 8941, 8951, 8963, 8969, 8971, 8999, 9001, 9007, 9011,
 238                                                                                 9013, 9029, 9041, 9043, 9049, 9059, 9067, 9091, 9103, 9109,
 239                                                                                 9127, 9133, 9137, 9151, 9157, 9161, 9173, 9181, 9187, 9199,
 240                                                                                 9203, 9209, 9221, 9227, 9239, 9241, 9257, 9277, 9281, 9283,
 241                                                                                 9293, 9311, 9319, 9323, 9337, 9341, 9343, 9349, 9371, 9377,
 242                                                                                 9391, 9397, 9403, 9413, 9419, 9421, 9431, 9433, 9437, 9439,
 243                                                                                 9461, 9463, 9467, 9473, 9479, 9491, 9497, 9511, 9521, 9533,
 244                                                                                 9539, 9547, 9551, 9587, 9601, 9613, 9619, 9623, 9629, 9631,
 245                                                                                 9643, 9649, 9661, 9677, 9679, 9689, 9697, 9719, 9721, 9733,
 246                                                                                 9739, 9743, 9749, 9767, 9769, 9781, 9787, 9791, 9803, 9811,
 247                                                                                 9817, 9829, 9833, 9839, 9851, 9857, 9859, 9871, 9883, 9887,
 248                                                                                 9901, 9907, 9923, 9929, 9931, 9941, 9949, 9967, 9973 };
 249
 250                         if( number >= LARGE_NON_PRIME ) return LARGER_PRIME;
 251
 252                         if( number > primes[ NUM_PRIMES - 1 ] )
 253                         {
 254                                 for( i = 0; i < number; i++ )
 255                                 {
 256                                         for( j = 0; j < NUM_PRIMES; j++ )
 257                                         {
 258                                                 if( ( number + i ) % j == 0 ) continue;
 259                                                 if( j * j >= number + i ) return number + i;
 260                                         }
 261                                 }
 262                         }
 263                         else
 264                         {
 265                                 for( i = 0; i < NUM_PRIMES; i++ )
 266                                 {
 267                                         if( primes[i] > number ) return primes[i];
 268                                 }
 269                         }
 270
 271                         return 0;
 272                 }
 273
 274                 int getPrevPrime( int number )
 275                 {
 276                         int i, j;
 277                         int primes[NUM_PRIMES] = {      2, 3, 5, 7, 11, 13, 17, 19, 23, 29,
 278                                                                                 31, 37, 41, 43, 47, 53, 59, 61, 67, 71,
 279                                                                                 73, 79, 83, 89, 97, 101, 103, 107, 109, 113,
 280                                                                                 127, 131, 137, 139, 149, 151, 157, 163, 167, 173,
 281                                                                                 179, 181, 191, 193, 197, 199, 211, 223, 227, 229,
 282                                                                                 233, 239, 241, 251, 257, 263, 269, 271, 277, 281,
 283                                                                                 283, 293, 307, 311, 313, 317, 331, 337, 347, 349,
 284                                                                                 353, 359, 367, 373, 379, 383, 389, 397, 401, 409,
 285                                                                                 419, 421, 431, 433, 439, 443, 449, 457, 461, 463,
 286                                                                                 467, 479, 487, 491, 499, 503, 509, 521, 523, 541,
 287                                                                                 547, 557, 563, 569, 571, 577, 587, 593, 599, 601,
 288                                                                                 607, 613, 617, 619, 631, 641, 643, 647, 653, 659,
 289                                                                                 661, 673, 677, 683, 691, 701, 709, 719, 727, 733,
 290                                                                                 739, 743, 751, 757, 761, 769, 773, 787, 797, 809,
 291                                                                                 811, 821, 823, 827, 829, 839, 853, 857, 859, 863,
 292                                                                                 877, 881, 883, 887, 907, 911, 919, 929, 937, 941,
 293                                                                                 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013,
 294                                                                                 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069,
 295                                                                                 1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151,
 296                                                                                 1153, 1163, 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
 297                                                                                 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, 1289, 1291,
 298                                                                                 1297, 1301, 1303, 1307, 1319, 1321, 1327, 1361, 1367, 1373,
 299                                                                                 1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439, 1447, 1451,
 300                                                                                 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
 301                                                                                 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583,
 302                                                                                 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
 303                                                                                 1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733,
 304                                                                                 1741, 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811,
 305                                                                                 1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889,
 306                                                                                 1901, 1907, 1913, 1931, 1933, 1949, 1951, 1973, 1979, 1987,
 307                                                                                 1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053,
 308                                                                                 2063, 2069, 2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129,
 309                                                                                 2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203, 2207, 2213,
 310                                                                                 2221, 2237, 2239, 2243, 2251, 2267, 2269, 2273, 2281, 2287,
 311                                                                                 2293, 2297, 2309, 2311, 2333, 2339, 2341, 2347, 2351, 2357,
 312                                                                                 2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
 313                                                                                 2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503, 2521, 2531,
 314                                                                                 2539, 2543, 2549, 2551, 2557, 2579, 2591, 2593, 2609, 2617,
 315                                                                                 2621, 2633, 2647, 2657, 2659, 2663, 2671, 2677, 2683, 2687,
 316                                                                                 2689, 2693, 2699, 2707, 2711, 2713, 2719, 2729, 2731, 2741,
 317                                                                                 2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801, 2803, 2819,
 318                                                                                 2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887, 2897, 2903,
 319                                                                                 2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999,
 320                                                                                 3001, 3011, 3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079,
 321                                                                                 3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167, 3169, 3181,
 322                                                                                 3187, 3191, 3203, 3209, 3217, 3221, 3229, 3251, 3253, 3257,
 323                                                                                 3259, 3271, 3299, 3301, 3307, 3313, 3319, 3323, 3329, 3331,
 324                                                                                 3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
 325                                                                                 3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491, 3499, 3511,
 326                                                                                 3517, 3527, 3529, 3533, 3539, 3541, 3547, 3557, 3559, 3571,
 327                                                                                 3581, 3583, 3593, 3607, 3613, 3617, 3623, 3631, 3637, 3643,
 328                                                                                 3659, 3671, 3673, 3677, 3691, 3697, 3701, 3709, 3719, 3727,
 329                                                                                 3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797, 3803, 3821,
 330                                                                                 3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889, 3907,
 331                                                                                 3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989,
 332                                                                                 4001, 4003, 4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057,
 333                                                                                 4073, 4079, 4091, 4093, 4099, 4111, 4127, 4129, 4133, 4139,
 334                                                                                 4153, 4157, 4159, 4177, 4201, 4211, 4217, 4219, 4229, 4231,
 335                                                                                 4241, 4243, 4253, 4259, 4261, 4271, 4273, 4283, 4289, 4297,
 336                                                                                 4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
 337                                                                                 4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481, 4483, 4493,
 338                                                                                 4507, 4513, 4517, 4519, 4523, 4547, 4549, 4561, 4567, 4583,
 339                                                                                 4591, 4597, 4603, 4621, 4637, 4639, 4643, 4649, 4651, 4657,
 340                                                                                 4663, 4673, 4679, 4691, 4703, 4721, 4723, 4729, 4733, 4751,
 341                                                                                 4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813, 4817, 4831,
 342                                                                                 4861, 4871, 4877, 4889, 4903, 4909, 4919, 4931, 4933, 4937,
 343                                                                                 4943, 4951, 4957, 4967, 4969, 4973, 4987, 4993, 4999, 5003,
 344                                                                                 5009, 5011, 5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087,
 345                                                                                 5099, 5101, 5107, 5113, 5119, 5147, 5153, 5167, 5171, 5179,
 346                                                                                 5189, 5197, 5209, 5227, 5231, 5233, 5237, 5261, 5273, 5279,
 347                                                                                 5281, 5297, 5303, 5309, 5323, 5333, 5347, 5351, 5381, 5387,
 348                                                                                 5393, 5399, 5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443,
 349                                                                                 5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507, 5519, 5521,
 350                                                                                 5527, 5531, 5557, 5563, 5569, 5573, 5581, 5591, 5623, 5639,
 351                                                                                 5641, 5647, 5651, 5653, 5657, 5659, 5669, 5683, 5689, 5693,
 352                                                                                 5701, 5711, 5717, 5737, 5741, 5743, 5749, 5779, 5783, 5791,
 353                                                                                 5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849, 5851, 5857,
 354                                                                                 5861, 5867, 5869, 5879, 5881, 5897, 5903, 5923, 5927, 5939,
 355                                                                                 5953, 5981, 5987, 6007, 6011, 6029, 6037, 6043, 6047, 6053,
 356                                                                                 6067, 6073, 6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133,
 357                                                                                 6143, 6151, 6163, 6173, 6197, 6199, 6203, 6211, 6217, 6221,
 358                                                                                 6229, 6247, 6257, 6263, 6269, 6271, 6277, 6287, 6299, 6301,
 359                                                                                 6311, 6317, 6323, 6329, 6337, 6343, 6353, 6359, 6361, 6367,
 360                                                                                 6373, 6379, 6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473,
 361                                                                                 6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563, 6569, 6571,
 362                                                                                 6577, 6581, 6599, 6607, 6619, 6637, 6653, 6659, 6661, 6673,
 363                                                                                 6679, 6689, 6691, 6701, 6703, 6709, 6719, 6733, 6737, 6761,
 364                                                                                 6763, 6779, 6781, 6791, 6793, 6803, 6823, 6827, 6829, 6833,
 365                                                                                 6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907, 6911, 6917,
 366                                                                                 6947, 6949, 6959, 6961, 6967, 6971, 6977, 6983, 6991, 6997,
 367                                                                                 7001, 7013, 7019, 7027, 7039, 7043, 7057, 7069, 7079, 7103,
 368                                                                                 7109, 7121, 7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207,
 369                                                                                 7211, 7213, 7219, 7229, 7237, 7243, 7247, 7253, 7283, 7297,
 370                                                                                 7307, 7309, 7321, 7331, 7333, 7349, 7351, 7369, 7393, 7411,
 371                                                                                 7417, 7433, 7451, 7457, 7459, 7477, 7481, 7487, 7489, 7499,
 372                                                                                 7507, 7517, 7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561,
 373                                                                                 7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621, 7639, 7643,
 374                                                                                 7649, 7669, 7673, 7681, 7687, 7691, 7699, 7703, 7717, 7723,
 375                                                                                 7727, 7741, 7753, 7757, 7759, 7789, 7793, 7817, 7823, 7829,
 376                                                                                 7841, 7853, 7867, 7873, 7877, 7879, 7883, 7901, 7907, 7919,
 377                                                                                 7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009, 8011, 8017,
 378                                                                                 8039, 8053, 8059, 8069, 8081, 8087, 8089, 8093, 8101, 8111,
 379                                                                                 8117, 8123, 8147, 8161, 8167, 8171, 8179, 8191, 8209, 8219,
 380                                                                                 8221, 8231, 8233, 8237, 8243, 8263, 8269, 8273, 8287, 8291,
 381                                                                                 8293, 8297, 8311, 8317, 8329, 8353, 8363, 8369, 8377, 8387,
 382                                                                                 8389, 8419, 8423, 8429, 8431, 8443, 8447, 8461, 8467, 8501,
 383                                                                                 8513, 8521, 8527, 8537, 8539, 8543, 8563, 8573, 8581, 8597,
 384                                                                                 8599, 8609, 8623, 8627, 8629, 8641, 8647, 8663, 8669, 8677,
 385                                                                                 8681, 8689, 8693, 8699, 8707, 8713, 8719, 8731, 8737, 8741,
 386                                                                                 8747, 8753, 8761, 8779, 8783, 8803, 8807, 8819, 8821, 8831,
 387                                                                                 8837, 8839, 8849, 8861, 8863, 8867, 8887, 8893, 8923, 8929,
 388                                                                                 8933, 8941, 8951, 8963, 8969, 8971, 8999, 9001, 9007, 9011,
 389                                                                                 9013, 9029, 9041, 9043, 9049, 9059, 9067, 9091, 9103, 9109,
 390                                                                                 9127, 9133, 9137, 9151, 9157, 9161, 9173, 9181, 9187, 9199,
 391                                                                                 9203, 9209, 9221, 9227, 9239, 9241, 9257, 9277, 9281, 9283,
 392                                                                                 9293, 9311, 9319, 9323, 9337, 9341, 9343, 9349, 9371, 9377,
 393                                                                                 9391, 9397, 9403, 9413, 9419, 9421, 9431, 9433, 9437, 9439,
 394                                                                                 9461, 9463, 9467, 9473, 9479, 9491, 9497, 9511, 9521, 9533,
 395                                                                                 9539, 9547, 9551, 9587, 9601, 9613, 9619, 9623, 9629, 9631,
 396                                                                                 9643, 9649, 9661, 9677, 9679, 9689, 9697, 9719, 9721, 9733,
 397                                                                                 9739, 9743, 9749, 9767, 9769, 9781, 9787, 9791, 9803, 9811,
 398                                                                                 9817, 9829, 9833, 9839, 9851, 9857, 9859, 9871, 9883, 9887,
 399                                                                                 9901, 9907, 9923, 9929, 9931, 9941, 9949, 9967, 9973 };
 400
 401                         if( number > LARGER_PRIME ) return LARGER_PRIME;
 402
 403                         if( number > primes[ NUM_PRIMES - 1 ] )
 404                         {
 405                                 for( i = number - 1; i >= primes[ NUM_PRIMES - 1 ]; i-- )
 406                                 {
 407                                         for( j = 0; j < NUM_PRIMES; j++ )
 408                                         {
 409                                                 if( i % j == 0 ) continue;
 410                                                 if( j * j >= i ) return i;
 411                                         }
 412                                 }
 413                         }
 414                         else
 415                         {
 416                                 for( i = NUM_PRIMES - 1; i >= 0; i-- )
 417                                 {
 418                                         if( primes[i] < number ) return primes[i];
 419                                 }
 420                         }
 421
 422                         return 0;
 423                 }
 424
 425                 int getNextPrimeStep( int number )
 426                 {
 427                         int prime_steps[NUM_PRIME_STEPS] = { 17, 37, 67,
 428                                                                                                  131, 257, 521,
 429                                                                                                  1031, 2053, 4099, 8209,
 430                                                                                                  10007, 20011, 30011, 40009, 50021, 60013, 70001, 80021, 90001,
 431                                                                                                  100003, 200003, 300007, 400009, 500009, 600011, 700001, 800011, 900001,
 432                                                                                                  1000003, 2000003, 3000017, 4000037, 5000011, 6000011, 7000003, 8000009, 9000011,
 433                                                                                                  10000019 };
 434
 435                         for( int i = 0; i < NUM_PRIME_STEPS; i++ )
 436                         {
 437                                 if( prime_steps[i] > number ) return prime_steps[i];
 438                         }
 439
 440                         return getNextPrime( number );
 441                 }
 442
 443                 int getPrevPrimeStep( int number )
 444                 {
 445                         if( number < 17 ) return 17;
 446
 447                         int prime_steps[NUM_PRIME_STEPS] = { 17, 37, 67,
 448                                                                                                  131, 257, 521,
 449                                                                                                  1031, 2053, 4099, 8209,
 450                                                                                                  10007, 20011, 30011, 40009, 50021, 60013, 70001, 80021, 90001,
 451                                                                                                  100003, 200003, 300007, 400009, 500009, 600011, 700001, 800011, 900001,
 452                                                                                                  1000003, 2000003, 3000017, 4000037, 5000011, 6000011, 7000003, 8000009, 9000011,
 453                                                                                                  10000019 };
 454
 455                         for( int i = NUM_PRIME_STEPS - 1; i >= 0; i-- )
 456                         {
 457                                 if( prime_steps[i] < number ) return prime_steps[i];
 458                         }
 459
 460                         return getPrevPrime( number );
 461                 }
 462         }
 463         using namespace primes;
 464 }
 465
 466 //////////////////////////////////////////////////////////////////////
 467 // Construction/Destruction
 468 //////////////////////////////////////////////////////////////////////
 469
 470 /** \page page1 How to add a variable to the User Preference Window
 471
 472   \section sec The main screen
 473   In the Lexicon.cpp file, the main constructor function:
 474
 475   CLexicon( LinguisticaMainWindow* pDoc )
 476
 477   has a   block of code following the comment:
 478   Set parameter defaults for all lexicon functions;
 479   each line takes the form:
 480
 481   m_ParamDefaults["Main\\MinimumMorphemeLength"] = "2".
 482
 483   These lines create the entries in the Windows Registry which can be used for Linguistica to read from
 484   in various functions. So you have to do two things: add a line to this block of code, and also
 485   add a line of code in a function where you want a variable to get its value from the Registry. Such
 486   a line of code would look like this:
 487
 488   int LoopLimit = m_pLexicon->GetIntParameter( "CheckSignatures\\LoopLimit", 1 );
 489
 490   You'll be calling a Linguistica function "GetIntParameter", which returns an int from a hash.
 491
 492   If you want to get a QString rather than an int, you use the function "GetStringParameter".
 493
 494 */
 495
 496
 497 CLexicon::CLexicon(LinguisticaMainWindow* pDoc)
 498         : m_pDoc(pDoc),
 499         m_ParamDefaults(),      // initialized below
 500
 501         m_Corpus(),
 502         m_pCorpusWords(new CCorpusWordCollection(this)),
 503         m_CorpusMap(),
 504
 505         m_pMiniLexica(new Q3PtrVector<CMiniLexicon>(4)) /* initialized below */,
 506         m_ActiveMini(-1),
 507         m_WordUpdates(),
 508         // Global morpheme collections
 509         m_AllPrefixes(),        // initialized below
 510         m_AllPrefixSigs(),      // initialized below
 511         m_AllStems(),   // initialized below
 512         m_AllSuffixes(),        // initialized below
 513         m_AllSuffixSigs(),      // initialized below
 514         m_AllWords(),   // initialized below
 515
 516         m_pCompounds(new CCompoundCollection(this)),
 517         m_pLinkers(new CLinkerCollection(this)),
 518         m_CompoundUpdates(),
 519
 520         m_pInFilter(),
 521         m_pOutFilter(),
 522
 523         m_NumberOfCharacterTypes(26),
 524         m_tokenCount(0),
 525         m_pDLHistory(new CDLHistory(status_display())),
 526         m_DescriptionLength(),
 527
 528         m_pSEDWords(new CWordCollection()),
 529
 530         DCNsylTrainCorpus(NULL),
 531         DCNsylTestCorpus(NULL),
 532         isDCNtrainRead(false),
 533         isDCNtestRead(false),
 534
 535         m_HMM(NULL)
 536 {
 537         // Set parameter defaults for all lexicon functions
 538         m_ParamDefaults["Neighbors\\DifferenceThreshold"] = "2";
 539         m_ParamDefaults["Main\\MinimumMorphemeLength"] = "2";
 540         m_ParamDefaults["Main\\MinimumStemLength"] = "3";
 541         m_ParamDefaults["Main\\MaximumPrefixLength"] = "5";
 542         m_ParamDefaults["Main\\MinimumPrefixLength"] = "2";
 543         m_ParamDefaults["Main\\MaximumSuffixLength"] = "5";
 544         m_ParamDefaults["Main\\MinimumSuffixLength"] = "2";
 545         m_ParamDefaults["Main\\MinimumSignatureLength"] = "1";
 546         m_ParamDefaults["Main\\MaxSuccessorFreqScoreByNeighbor"] = "1";
 547         m_ParamDefaults["Main\\MinimumNumberOfStemsInSignature"] = "1";
 548         m_ParamDefaults["Main\\MinimumSuccessorFrequency"] = "6";
 549         m_ParamDefaults["Main\\VerboseDisplayFlag"] = "1";
 550         m_ParamDefaults["Main\\MaximumNumberOfMiniLexica"] = "1";
 551         m_ParamDefaults["CheckSignatures\\LoopLimit"] = "1";
 552         m_ParamDefaults["CheckSignatures\\StemCountThreshold"] = "2";
 553         m_ParamDefaults["PredecessorFrequency\\MaxNeighborPredecessorCount"] = "1";
 554         m_ParamDefaults["PredecessorFrequency\\MinimumNumberOfAppearancesOfPrefix"] = "3";
 555         m_ParamDefaults["PredecessorFrequency\\MinimumNumberOfStemsInSignature"] = "2";
 556         m_ParamDefaults["PredecessorFrequency\\MinimumLengthOfSignature"] = "2";
 557         m_ParamDefaults["PredecessorFrequency\\LengthOfAStrongSignature"] = "4";
 558         m_ParamDefaults["PredecessorFrequency\\LargeNumberOfStems"] = "25";
 559         m_ParamDefaults["PredecessorFrequency\\MinimumNumberOfPrefixes"] = "2";
 560         m_ParamDefaults["PredecessorFrequency\\MaximumPrefixLength"] = "5";
 561         m_ParamDefaults["SuccessorFreq1\\MaxNeighborSuccessorCount"] = "1";
 562         m_ParamDefaults["SuccessorFreq1\\MinimumNumberOfAppearancesOfSuffix"] = "3";
 563         m_ParamDefaults["SuccessorFreq1\\MinimumNumberOfStemsInSignature"] = "2";
 564         m_ParamDefaults["SuccessorFreq1\\MinimumLengthOfSignature"] = "2";
 565         m_ParamDefaults["SuccessorFreq1\\LengthOfAStrongSignature"] = "4";
 566         m_ParamDefaults["SuccessorFreq1\\LargeNumberOfStems"] = "25";
 567         m_ParamDefaults["SuccessorFreq1\\MaxSuccessorFreqScoreByNeighbor"] = "1";
 568         m_ParamDefaults["TakeSignaturesFindStems\\SizeThreshold"] = "2";
 569         m_ParamDefaults["TakeSignaturesFindStems\\StemCountThreshold"] = "2";
 570         m_ParamDefaults["TakeSignaturesFindStems\\SignatureRobustnessThreshold"] = "10";
 571         m_ParamDefaults["FromStemsFindSuffixes\\RobustnessThreshold"] = "10";
 572         m_ParamDefaults["FromStemsFindSuffixes\\MinimumNumberOfOccurrences"] = "3";
 573         m_ParamDefaults["Compounds\\MaximumLinkerLength"] = "0";
 574         m_ParamDefaults["SignatureDL\\CorpusBasedAffixCount"] = "0";
 575         m_ParamDefaults["SignatureDL\\CorpusBasedStemCount"] = "1";
 576         m_ParamDefaults["HMM\\NumberOfStates"] = "2";
 577         m_ParamDefaults["HMM\\NumberOfIterations"] = "25";
 578         m_ParamDefaults["EarleyParser\\MaximumParseDepth"] = "6";
 579         m_ParamDefaults["Boltzmann\\NumberOfSamples"] = "100";
 580         m_ParamDefaults["Symbols\\Vowels"] = "a e i o u A E I O U";
 581
 582         m_CorpusMap.setAutoDelete(false);       // m_pCorpusWords owns these
 583         m_pMiniLexica->setAutoDelete(true);
 584 //      m_WordUpdates.setAutoDelete(false);     // mini-lexica own these -- no autodelete in Qt4's QList
 585         m_AllPrefixes.setAutoDelete(true);
 586         m_AllPrefixSigs.setAutoDelete(true);
 587         m_AllStems.setAutoDelete(true);
 588         m_AllSuffixes.setAutoDelete(true);
 589         m_AllSuffixSigs.setAutoDelete(true);
 590         m_AllWords.setAutoDelete(true);
 591
 592         // Compare default parameters to user params
 593         QMap<QString, QString> params;
 594         CLPreferences& prefs = *m_pDoc->GetPreferences();
 595
 596         prefs.GetDictionaryPreference("Lxa_Parameters", &params);
 597         for (QMap<QString, QString>::const_iterator iter = m_ParamDefaults.begin();
 598                         iter != m_ParamDefaults.end(); ++iter) {
 599                 const QString key = iter.key();
 600
 601                 if (!params.contains(key))
 602                         params.insert(key, iter.value());
 603         }
 604         prefs.SetDictionaryPreference("Lxa_Parameters", params);
 605 }
 606
 607 CLexicon::~CLexicon()
 608 {
 609         // The mini-lexicon destructor requires a valid lexicon to
 610         // work with, and in particular the preceding mini-lexica
 611         // must still be valid.  It would be nice to just use the
 612         // Q3PtrVector destructor, but it deletes its items in the
 613         // wrong order (first-to-last instead of last-to-first).
 614         for (int i = m_pMiniLexica->size() - 1; i >= 0; --i)
 615                 m_pMiniLexica->remove(static_cast<unsigned int>(i));
 616
 617         delete m_pMiniLexica;
 618         delete m_pCorpusWords;
 619         delete m_pCompounds;
 620         delete m_pLinkers;
 621         delete m_pInFilter;
 622         delete m_pOutFilter;
 623         delete m_pDLHistory;
 624         delete m_pSEDWords;
 625         delete m_HMM;
 626         delete DCNsylTrainCorpus;
 627         delete DCNsylTestCorpus;
 628 }
 629
 630
 631 void CLexicon::AddToScreen( QString text )
 632 {
 633   m_pDoc->m_commandLine->setText( text );
 634 }
 635
 636
 637 void CLexicon::ClearScreen()
 638 {
 639   m_pDoc->m_commandLine->setText ("");
 640 }
 641
 642
 643 void CLexicon::FindPrefixes( bool AutoLayer )
 644 {
 645   CMiniLexicon* mini;
 646   int index = 0;
 647
 648  //unused variable:  const int LayerCount = GetIntParameter( "Main\\MaximumNumberOfMiniLexica", 1 );
 649
 650   if( m_ActiveMini >= 0 )
 651   {
 652     (*m_pMiniLexica)[m_ActiveMini]->LogFile(QString("MINI-LEXICON %1").arg(m_ActiveMini+1));
 653
 654     if( !(*m_pMiniLexica)[m_ActiveMini]->FindPrefixes() )
 655     {
 656       // The active mini may already have a suffix analysis
 657       if( (*m_pMiniLexica)[m_ActiveMini]->GetStems()->GetCount() )
 658       {
 659         index = NewMiniLexicon();
 660         mini = GetMiniLexicon( index );
 661         if( mini && mini->SetAffixLocation( STEM_INITIAL ) )
 662           mini->AddToWordCollection( GetMiniLexicon( m_ActiveMini )->GetWords() );
 663       }
 664       else
 665       {
 666         mini = (*m_pMiniLexica)[m_ActiveMini];
 667         mini->SetAffixLocation( STEM_INITIAL );
 668       }
 669
 670       mini->FindPrefixes();
 671
 672       SetActiveMiniIndex( index );
 673     }
 674
 675     if( AutoLayer )
 676     {
 677       while (1)
 678       {
 679         index = NewMiniLexicon();
 680         mini = GetMiniLexicon( index );
 681         mini->AddToWordCollection( GetMiniLexicon( index-1 )->GetWords() );
 682         mini->AddToWordCollection( GetMiniLexicon( index-1 )->GetStems() );
 683
 684 /*        if( LogFileOn() )
 685         {
 686           *GetLogFileStream() << endl << endl << "================================================================" << endl
 687                         << QString("MINI-LEXICON %1").arg(index+1) << endl
 688                         << "================================================================" << endl << endl;
 689         }
 690 */
 691         mini->FindPrefixes();
 692
 693         if( mini && mini->GetSignatures()->GetSize() < 1 )
 694         {
 695           DeleteMiniLexicon( index );
 696           break;
 697         }
 698       }
 699     }
 700   }
 701 }
 702
 703
 704 void CLexicon::FindSuffixes( bool AutoLayer )
 705 {
 706         if (m_ActiveMini < 0)
 707                 // No mini-lexicon selected
 708                 return;
 709
 710         // XXX. Log which mini-lexicon this is
 711
 712         CMiniLexicon& active_mini = *GetMiniLexicon(m_ActiveMini);
 713
 714         if (active_mini.FindSuffixes() == 0) {
 715                 int index;
 716                 // The active mini may already have a prefix analysis
 717                 if (active_mini.GetStems()->GetCount() != 0) {
 718                         index = NewMiniLexicon();
 719                         CMiniLexicon* pMini = GetMiniLexicon(index);
 720                         pMini->AddToWordCollection(active_mini.GetWords());
 721                 } else {
 722                         index = m_ActiveMini;
 723                         active_mini.SetAffixLocation( STEM_FINAL );
 724                 }
 725
 726                 CMiniLexicon& mini = *GetMiniLexicon(index);
 727                 mini.FindSuffixes();
 728                 mini.CalculateDescriptionLength();
 729                 SetActiveMiniIndex(index);
 730         }
 731
 732         // avoid infinite loop
 733         // XXX. Add a better termination condition.
 734         const unsigned int layer_max = 100;
 735
 736         if (AutoLayer)
 737                 for (unsigned int layer = layer_max; layer != 0; --layer) {
 738                         int index = NewMiniLexicon();
 739                         CMiniLexicon& mini = *GetMiniLexicon(index);
 740                         CMiniLexicon& prev = *GetMiniLexicon(index - 1);
 741
 742                         mini.AddToWordCollection(prev.GetWords());
 743                         mini.AddToWordCollection(prev.GetStems());
 744
 745                         // XXX. Log which mini-lexicon this is.
 746
 747                         mini.FindSuffixes();
 748
 749                         if (mini.GetSignatures()->GetSize() == 0) {
 750                                 DeleteMiniLexicon(index);
 751                                 break;
 752                         }
 753                 }
 754 }
 755
 756 int CLexicon::Tokenize( QStringList& lines, StringToInt& tokens )
 757 {
 758         QString token;
 759         int token_count = 0;
 760
 761         foreach (QString line_text, lines) {
 762                 QTextStream line(&line_text, QIODevice::ReadOnly);
 763                 while (!line.atEnd()) {
 764                         line >> token;
 765                         // If the tokenization becomes more sophisticated than 'break at space', this is where it should be defined
 766                         tokens[token]++;
 767                         token_count++;
 768                 }
 769         }
 770         return token_count;
 771 }
 772
 773 int CLexicon::ReadDX1File(QString FileName, int NumberOfWords )
 774 {
 775         QString         line;
 776         CParse          prsLine;
 777 //unused variable:
 778 //  int                 token_count = 0;
 779         CSS                     Spelling;
 780 //unused variable:
 781 //      int                     corpuscount = 0;
 782         CStem*          pWord;
 783         CWordCollection*        Words;
 784         CCorpusWord* pCorpusWord;
 785
 786         linguistica::ui::status_user_agent& status = status_display();
 787         status.major_operation = "Reading dictionary word list.";
 788         status.progress.clear();
 789
 790         // reset lexicon!
 791         ClearAll();
 792
 793         // Add the base mini lexicon
 794         CMiniLexicon* mini = new CMiniLexicon( this, 0 );
 795         m_ActiveMini = 0;
 796         m_pMiniLexica->insert( 0, mini );
 797         m_pMiniLexica->setAutoDelete( true );
 798         Words = mini->GetWords();
 799
 800         Q_ASSERT( !FileName.isEmpty() );
 801         QFile File( FileName );
 802         if (!File.open(QIODevice::ReadOnly)) {
 803                 status.major_operation.clear();
 804                 return 1;
 805         }
 806
 807         Q3TextStream stream(&File);
 808
 809         // Unicode or ASCII encoding? Depends on corpus file.
 810         stream.setEncoding ( Q3TextStream::Locale );
 811
 812         if (NumberOfWords < 0)
 813                 NumberOfWords = 1000000;
 814         status.progress.set_denominator(NumberOfWords);
 815
 816         int wordcount;
 817         for (wordcount = 1; wordcount < NumberOfWords + 1; ++wordcount) {
 818                 status.progress = wordcount;
 819                 if (stream.atEnd())
 820                         break;
 821                 QString line = stream.readLine().simplifyWhiteSpace();
 822
 823                 // Add line to corpus
 824                 m_Corpus.append( line );
 825
 826                 prsLine.Collapse(CSS (line) );
 827
 828                 // Add to mini word list
 829                 pWord = *Words <<   prsLine[1].Display();
 830                 //pWord->SetCorpusCount( prsLine[2] );
 831                 pWord->SetCorpusCount ( 1 );
 832
 833                 // Add corpus word
 834                 pCorpusWord = *m_pCorpusWords << pWord->GetKey();
 835                 m_CorpusMap.insert( prsLine[1].Display(), pCorpusWord );
 836                 m_pCorpusWords->IncrementCorpusCount( pWord->GetCorpusCount() - 1 ); // already incremented one in operator<<
 837                 pCorpusWord->IncrementCorpusCount( pWord->GetCorpusCount() - 1 );
 838
 839                 // Remove the two non phonology pieces and set
 840                 // the phonology tier
 841                 prsLine.RemovePiece(1);
 842                 prsLine.RemovePiece(1);
 843                 pWord->SetPhonology_Tier1 ( &prsLine );
 844         }
 845         status.progress.clear();
 846         status.major_operation.clear();
 847         return wordcount;
 848 }
 849
 850 int CLexicon::ReadCorpus(QString FileName, int NumberOfWords)
 851 {
 852         Q_ASSERT(!FileName.isEmpty());
 853
 854         QTime t;
 855         t.start();
 856
 857         if (FileName.right(4) == ".dx1")
 858                 return ReadDX1File(FileName, NumberOfWords);
 859
 860         linguistica::ui::status_user_agent& status = status_display();
 861         status.major_operation = "Reading corpus";
 862         status.progress.clear();
 863
 864         m_tokenCount = 0;
 865
 866         const int MinStemLength = GetIntParameter( "Main\\MinimumStemLength", 3 );
 867         // XXX. use Lower_Case preference?
 868
 869         // Remove everything from Lexicon
 870         m_pCorpusWords->Empty();
 871         ClearAll();
 872
 873         // Add the base mini lexicon
 874         std::auto_ptr<CMiniLexicon> new_mini(new CMiniLexicon(this, 0));
 875         CMiniLexicon* mini = new_mini.get();
 876         m_ActiveMini = 0;
 877         Q_ASSERT(m_pMiniLexica->autoDelete());
 878         m_pMiniLexica->insert(0, new_mini.release());
 879         CWordCollection* Words = mini->GetWords();
 880
 881         // Set up filters
 882         m_pInFilter = 0;
 883         m_pOutFilter = 0;
 884         QStringList items;
 885         m_pDoc->GetPreferences()->GetStringListPreference(
 886                         "Character_Combinations", &items);
 887         SetFilters(&items);
 888
 889         // Read file.
 890         {
 891                 Q_ASSERT(!FileName.isEmpty());
 892                 QFile File(FileName);
 893                 if (!File.open(QIODevice::ReadOnly)) {
 894                         status.major_operation.clear();
 895                         return -1;
 896                 }
 897
 898                 QTextStream stream(&File);
 899                 stream.setCodec(QTextCodec::codecForName("UTF-8"));
 900                 Q_ASSERT(stream.autoDetectUnicode());
 901
 902                 status.major_operation = "Reading word tokens from corpus...";
 903                 status.progress.clear();
 904                 status.progress.set_denominator(NumberOfWords);
 905                 while (!stream.atEnd() && m_tokenCount <= NumberOfWords) {
 906                         status.progress = m_tokenCount;
 907                         QString line = stream.readLine().trimmed();
 908
 909                         // Deal with line-final dashes
 910                         if (line.endsWith("-")) {
 911                                 QString action = m_pDoc->GetPreferences()
 912                                         ->GetPreference("Line_Final_Dash");
 913                                 if (action.isEmpty())
 914                                         action = "Join_With_Dash";
 915
 916                                 if (action == "Do_Not_Join")
 917                                         // Do nothing:
 918                                         // dashes should be handled by scrubbing.
 919                                         ;
 920                                 else if (!stream.atEnd()) {
 921                                         QString next;
 922                                         stream >> next;
 923
 924                                         if (action == "Join_Without_Dash")
 925                                                 line.truncate(line.length() - 1);
 926
 927                                         line += next;
 928                                 }
 929                         }
 930
 931                         m_tokenCount += line.count(' ') + 1;
 932                         m_Corpus.append(line);
 933                 }
 934                 status.progress.clear();
 935                 status.major_operation.clear();
 936         }
 937
 938         // Tokenize
 939         QMap<QString, int> types;
 940         Tokenize(m_Corpus, types);
 941
 942         // Prepare scrubbing
 943         QStringList scrubRules;
 944         m_pDoc->GetPreferences()->GetStringListPreference(
 945                 "Scrub_Replacements", &scrubRules);
 946
 947         QMap<QString, QString> replacements;
 948         QStringList regExps;
 949         QRegExp preceding, internal, following;
 950         const QString arrow_str = " --> ";
 951         for (QStringList::const_iterator iter = scrubRules.begin();
 952                         iter != scrubRules.end(); ++iter) {
 953                 const QString rule = *iter;
 954                 int arrow = rule.lastIndexOf(arrow_str);
 955
 956                 if (arrow != -1) {
 957                         Q_ASSERT(arrow >= 0);
 958                         Q_ASSERT(rule.lastIndexOf(arrow_str, arrow - 1) == -1);
 959
 960                         const QString lhs = rule.left(arrow);
 961                         const QString rhs = rule.mid(arrow + arrow_str.size());
 962                         regExps.append(lhs);
 963                         replacements.replace(lhs, rhs);
 964                         continue;
 965                 }
 966
 967                 regExps.append(rule);
 968                 if (rule == SCR_REMOVE_PRECEDING_PUNCT) {
 969                         ++iter;
 970                         preceding = QRegExp(QString("^[%1]+")
 971                                         .arg(escapes(*iter)));
 972                 } else if (rule == SCR_REMOVE_INTERNAL_PUNCT) {
 973                         ++iter;
 974                         internal = QRegExp(QString("(\\S)[%1]+(\\S)")
 975                                         .arg(escapes(*iter)));
 976                 } else if (rule == SCR_REMOVE_FOLLOWING_PUNCT) {
 977                         ++iter;
 978                         following = QRegExp(QString("[%1]+$")
 979                                         .arg(escapes(*iter)));
 980                 }
 981         }
 982
 983         status.major_operation = "Processing word types...";
 984         status.progress.clear();
 985         status.progress.set_denominator(types.size());
 986
 987         // Process types individually
 988         int words = 0;
 989         int maxNumberOfRoots = 2;
 990         QMap<QChar,int> allChars;
 991         QList<CStem*> dashCompounds, dashComponents;
 992         for (QMap<QString, int>::const_iterator iter = types.begin();
 993                         iter != types.end(); ++iter, ++words) {
 994                 QString word = iter.key();
 995                 if (word.isEmpty())
 996                         continue;
 997
 998                 status.progress = words;
 999
1000                 // Scrub word
1001                 const QRegExp numeral("[0-9]");
1002                 foreach (QString regexp, regExps) {
1003                         if (regexp == SCR_MAKE_LOWER_CASE)
1004                                 word = word.lower();
1005                         else if (regexp == SCR_REMOVE_NUMBERS)
1006                                 word.replace(numeral, "");
1007                         else if (regexp == SCR_REMOVE_PRECEDING_PUNCT)
1008                                 word.replace(preceding, "");
1009                         else if (regexp == SCR_REMOVE_INTERNAL_PUNCT)
1010                                 word.replace(internal, "\\1\\2");
1011                         else if (regexp == SCR_REMOVE_FOLLOWING_PUNCT)
1012                                 word.replace(following, "");
1013                         else
1014                                 word.replace(QRegExp(regexp),
1015                                         replacements[regexp]);
1016                 }
1017
1018                 // Combine n-graph combos into single character
1019                 word = Filter(m_pInFilter, word);
1020
1021                 if (word.isEmpty())
1022                         continue;
1023
1024                 // Process words with internal dashes according to user preferences
1025                 int dashPos = word.indexOf('-');
1026                 const bool hasDash = (dashPos != -1);
1027
1028                 const QString wordInternalDash = m_pDoc->GetPreferences()
1029                                 ->GetPreference("Word_Internal_Dash");
1030                 const QString wordWithDash = m_pDoc->GetPreferences()
1031                                 ->GetPreference("Word_With_Dash");
1032
1033                 bool pieceTooShort = false;
1034                 if (hasDash) {
1035                         QStringList pieces = QStringList::split('-', word, true);
1036                         foreach (QString piece, pieces) {
1037                                 if (piece.size() < MinStemLength) {
1038                                         pieceTooShort = true;
1039                                         break;
1040                                 }
1041                         }
1042                 }
1043
1044                 const bool split_at_dash =
1045                         (wordWithDash == "Include_Substrings_Only" ||
1046                         wordWithDash == "Include_Full_Word_And_Substrings");
1047                 if (hasDash && !pieceTooShort && split_at_dash) {
1048                         int dashCount = word.count('-');
1049                         if (dashCount > maxNumberOfRoots)
1050                                 maxNumberOfRoots = dashCount + 1;
1051
1052                         QString components = word;
1053
1054                         for (; !components.isEmpty();
1055                                         dashPos = components.indexOf('-')) {
1056                                 QString component;
1057
1058                                 if (dashPos != -1) {
1059                                         component = components.left(dashPos);
1060                                         components = components.mid(dashPos + 1);
1061                                 } else {
1062                                         component = components;
1063                                         components = "";
1064                                 }
1065
1066                                 if (!component.isEmpty()) {
1067                                         dashPos = components.indexOf('-');
1068                                         continue;
1069                                 }
1070
1071                                 foreach (QChar ch, component)
1072                                         ++allChars[ch];
1073
1074                                 CStem stem(component, mini);
1075                                 CStem* pWord = *Words << stem;
1076                                 // operator<< already incremented the corpus
1077                                 // counts by 1, but that wasn’t right --- so
1078                                 // we adjust.
1079                                 Words->IncrementCorpusCount(iter.value() - 1);
1080                                 pWord->IncrementCorpusCount(iter.value() - 1);
1081
1082                                 pWord->SetWordType(FindType(component));
1083                                 pWord->IncrementCompoundCount();
1084                                 dashComponents.append(pWord);
1085                                 *m_pSEDWords << stem;
1086                         }
1087                 }
1088
1089                 if (!hasDash || wordWithDash != "Include_Substrings_Only") {
1090                         if (hasDash && wordInternalDash == "Remove")
1091                                 word = word.replace('-', "");
1092
1093                         foreach (QChar ch, word)
1094                                 ++allChars[ch];
1095
1096                         CStem stem(word, mini);
1097                         CStem* pWord = *Words << stem;
1098                         // operator<< already incremented the corpus
1099                         // counts by 1, but that wasn’t right --- so
1100                         // we adjust.
1101                         Words->IncrementCorpusCount(iter.value() - 1);
1102                         pWord->IncrementCorpusCount(iter.value() - 1);
1103
1104                         pWord->SetWordType(FindType(word));
1105                         if (hasDash && wordInternalDash != "Remove" &&
1106                                         !pieceTooShort)
1107                                 dashCompounds.append(pWord);
1108                         *m_pSEDWords << stem;
1109
1110                         CCorpusWord* pCorpusWord =
1111                                 *m_pCorpusWords << pWord->GetKey();
1112                         // operator<< already incremented the corpus
1113                         // counts by 1, but that wasn’t right.
1114                         m_pCorpusWords->IncrementCorpusCount(iter.value() - 1);
1115                         pCorpusWord->IncrementCorpusCount(iter.value() - 1);
1116
1117                         pCorpusWord->SetMorpheme(1, pWord);
1118                         m_CorpusMap.insert(iter.key(), pCorpusWord);
1119                 }
1120         }
1121         status.progress.clear();
1122
1123         // Connect compounds to word components and create parses
1124         if (!dashCompounds.isEmpty() && !dashComponents.isEmpty()) {
1125                 FromStemsFindFlatCompounds(&dashCompounds,
1126                         &dashComponents, "-", maxNumberOfRoots);
1127 //              m_pCompounds->FindMostProbableParse();
1128         }
1129
1130         m_NumberOfCharacterTypes = allChars.count();
1131
1132         Words->GetTrie()->Alphabetize();
1133         // phonology: move this eventually to the menu -- JG
1134         mini->GetWords()->DoPhonology();
1135
1136         status.major_operation.clear();
1137
1138         // XXX. necessary?
1139         status.progress.clear();
1140         status.details.clear();
1141
1142         Words->SetSortStyle(KEY);
1143
1144         // Add description length to history
1145         const QString mini_name("Mini-Lexicon 1");
1146         const QString remark("Before analysis; words only");
1147         m_pDLHistory->append(mini_name, remark, mini);
1148
1149         std::cout << "ReadCorpus:: Time elapsed: " <<
1150                 t.elapsed() << "ms." << std::endl;
1151         return 0;
1152 }
1153
1154 int CLexicon::RereadCorpus(QString FileName, int NumberOfWords)
1155 {
1156         Q_ASSERT(!FileName.isEmpty());
1157         ClearAll();
1158         return ReadCorpus(FileName, NumberOfWords);
1159 }
1160
1161
1162
1163 void CLexicon::ClearAll()
1164 {
1165         CMiniLexicon* mini;
1166
1167         // Clear all MiniLexica
1168         for( int i = m_pMiniLexica->size()-1; i >= 0; i-- )
1169         {
1170                 mini = m_pMiniLexica->take(i);
1171                 if( mini )
1172                 {
1173                         mini->ClearAll();
1174                         delete mini;
1175                 }
1176         }
1177
1178         m_AllPrefixes   .clear();
1179         m_AllPrefixSigs .clear();
1180         m_AllStems              .clear();
1181         m_AllSuffixes   .clear();
1182         m_AllSuffixSigs .clear();
1183         m_AllWords              .clear();
1184         m_pCompounds    ->Empty(); delete m_pCompounds;
1185         m_pCompounds    = new CCompoundCollection( this );
1186         m_pLinkers              ->Empty(); delete m_pLinkers;
1187         m_pLinkers              = new CLinkerCollection( this );
1188         m_Corpus                .clear();
1189         m_CorpusMap             .clear();
1190         m_pCorpusWords  ->Empty();
1191
1192         delete m_pDLHistory;
1193         m_pDLHistory = new CDLHistory(status_display());
1194
1195         delete m_HMM;
1196         m_HMM = 0;
1197 }
1198
1199
1200 CCorpusWord*  CLexicon::FindAWord( CStem* pStem, CSuffix* pSuffix )
1201 {
1202         QString Word = pStem->Display();
1203
1204         if ( !pSuffix->GetKey().IsNULL() )
1205         {
1206                 Word += pSuffix->Display();
1207         }
1208
1209         return *m_pCorpusWords ^= Word;
1210 }
1211
1212
1213 int CLexicon::ReadProjectFile( QString FileName )
1214 {
1215         Q_ASSERT( !FileName.isEmpty() );
1216         QFile   file( FileName );
1217
1218         QString buffer;
1219
1220         int     usedTokens,
1221                         types,
1222                         minis;
1223
1224         if( file.exists() && file.open( QIODevice::ReadOnly ) )
1225         {
1226                 Q3TextStream inf(&file);
1227                 inf.setEncoding ( Q3TextStream::Locale );
1228
1229                 buffer = inf.readLine();
1230                 Q_ASSERT( buffer[0] == '#' );
1231
1232                 buffer = inf.readLine();
1233                 Q_ASSERT( buffer[0] == '#' );
1234
1235                 inf >> m_tokenCount;
1236
1237                 buffer = inf.readLine();        // end of read tokens line
1238                 Q_ASSERT( buffer.length() == 0 );
1239
1240                 buffer = inf.readLine();
1241                 Q_ASSERT( buffer[0] == '#' );
1242
1243                 inf >> usedTokens;
1244
1245                 buffer = inf.readLine();        // end of used tokens line
1246                 Q_ASSERT( buffer.length() == 0 );
1247
1248                 buffer = inf.readLine();
1249                 Q_ASSERT( buffer[0] == '#' );
1250
1251                 inf >> types;
1252
1253                 buffer = inf.readLine();        // end of types line
1254                 Q_ASSERT( buffer.length() == 0 );
1255
1256                 buffer = inf.readLine();
1257                 Q_ASSERT( buffer[0] == '#' );
1258
1259                 inf >> m_NumberOfCharacterTypes;
1260
1261                 buffer = inf.readLine();        // end of characters line
1262                 Q_ASSERT( buffer.length() == 0 );
1263
1264                 buffer = inf.readLine();
1265                 Q_ASSERT( buffer[0] == '#' );
1266
1267                 inf >> minis;
1268
1269                 buffer = inf.readLine();        // end of minis line
1270                 Q_ASSERT( buffer.length() == 0 );
1271
1272                 file.close();
1273         }
1274
1275         return minis;
1276 }
1277
1278
1279 void CLexicon::OutputStats( QString FileName )
1280 {
1281         Q_ASSERT( !FileName.isEmpty() );
1282         QFile file( FileName );
1283
1284         int i;
1285
1286         int slash = FileName.findRev( "\\" );
1287         if( slash < 0 ) slash = FileName.findRev( "/" );
1288         int dot = FileName.findRev( ".prj" );
1289
1290         if( file.open( QIODevice::WriteOnly ) )
1291         {
1292                 Q3TextStream    outf( &file );
1293                 outf.setEncoding ( Q3TextStream::Unicode );
1294
1295                 outf <<
1296                         "# LEXICON (\'" + FileName.mid( slash+1, dot-3-slash ) + "\')" << endl <<
1297                         "# Number of word tokens read: \n\t" << GetTokenCount() << endl <<
1298                         "# Number of word tokens used: \n\t" << m_pCorpusWords->GetCorpusCount() << endl <<
1299                         "# Number of word types: \n\t" << m_pCorpusWords->GetCount() << endl <<
1300                         "# Number of character types: \n\t" << m_NumberOfCharacterTypes << endl <<
1301                         "# Number of mini-lexica: \n\t" << GetMiniCount() << endl << endl;
1302
1303                 for( i = 0; i < GetMiniSize(); i++ )
1304                 {
1305                         CMiniLexicon* mini = GetMiniLexicon(i);
1306                         if( !mini ) continue;
1307
1308                         outf << "# ------------------------" << endl <<
1309                                 QString( "# MINI-LEXICON %1" ).arg( i+1 ) << endl <<
1310                                 "# Number of words: \n\t" << mini->GetWords()->GetCount() << endl <<
1311                                 "# Number of stems: \n\t" << mini->GetStems()->GetCount() << endl;
1312
1313                         if( mini->GetSuffixes() )
1314                         {
1315                                 outf << "# Number of regular suffixes: \n\t" << mini->GetSuffixes()->GetCount() << endl <<
1316                                         "# Number of signatures with regular suffixes: \n\t" << mini->GetSignatures()->GetCount() << endl << endl;
1317                         }
1318                         if( mini->GetPrefixes() )
1319                         {
1320                                 outf << "# Number of regular prefixes: \n\t  " << mini->GetPrefixes()->GetCount() << endl <<
1321                                         "# Number of signatures with regular prefixes: \n\t" << mini->GetSignatures()->GetCount() << endl << endl;
1322                         }
1323                 }
1324
1325                 file.close();
1326         }
1327 }
1328
1329
1330 void CLexicon::SetFilters( QStringList* items )
1331 {
1332         // Note: this function is private because we don't want to change
1333         // the filters after words have already been read into the lexicon
1334         // if we do, then they may filter out to display and log incorrectly
1335         if( m_pInFilter ) delete m_pInFilter;
1336         if( m_pOutFilter ) delete m_pOutFilter;
1337
1338         m_pInFilter = new StringToString;
1339         m_pOutFilter = new StringToString;
1340
1341         int pound, i = 0;
1342         QString item;
1343         for ( QStringList::Iterator it = items->begin(); it != items->end(); ++it )
1344         {
1345                 // Remove comments
1346                 pound = (*it).find('#');
1347                 if( pound >= 0 ) item = (*it).left( pound ).stripWhiteSpace();
1348                 else item = *it;
1349
1350                 // Do nothing if the remaining string has spaces
1351                 // or nothing
1352                 if( item.length() == 0 || item.find(' ') >= 0 ) break;
1353
1354                 // Insert into both filters
1355                 QString character = QChar( FILTER_BASE + i );
1356                 m_pInFilter->insert( item, character );
1357                 m_pOutFilter->insert( character, item );
1358                 i++;
1359         }
1360 }
1361
1362 CMiniLexicon* CLexicon::GetMiniLexicon(int i)
1363 {
1364         if (i == -1)
1365                 return 0;
1366
1367         Q_ASSERT(i >= 0 &&  i < GetMiniSize());
1368         return (*m_pMiniLexica)[i];
1369 }
1370
1371 void CLexicon::SetActiveMiniIndex( int i )
1372 {
1373         int j;
1374
1375         if( i < 0 || GetMiniLexicon(i) ) m_ActiveMini = i;
1376         else
1377         {
1378                 for( j = i-1; j >= 0; j-- )
1379                 {
1380                         if( GetMiniLexicon(j) )
1381                         {
1382                                 m_ActiveMini = j;
1383                                 return;
1384                         }
1385                 }
1386                 for( j = i+1; j < GetMiniSize(); j++ )
1387                 {
1388                         if( GetMiniLexicon(j) )
1389                         {
1390                                 m_ActiveMini = j;
1391                                 return;
1392                         }
1393                 }
1394         }
1395 }
1396
1397 linguistica::ui::status_user_agent& CLexicon::status_display()
1398         { return m_pDoc->status_display(); }
1399
1400 int CLexicon::GetIntParameter(QString param, int iDefault)
1401 {
1402         QMap<QString, QString> m;
1403         m_pDoc->GetPreferences()->GetDictionaryPreference(
1404                                 "Lxa_Parameters", &m);
1405
1406         const QMap<QString, QString>::const_iterator iter =
1407                 m.constFind(param);
1408         if (iter == m.constEnd())
1409                 return iDefault;
1410
1411         // see CLPreferences::GetIntPreference()
1412         bool ok;
1413         const int rv = iter.value().toInt(&ok);
1414         if (!ok)
1415                 return iDefault;
1416         return rv;
1417 }
1418
1419 QString CLexicon::GetStringParameter(QString param)
1420 {
1421         QMap<QString, QString> m;
1422         m_pDoc->GetPreferences()->GetDictionaryPreference(
1423                                 "Lxa_Parameters", &m);
1424
1425         const QMap<QString, QString>::const_iterator iter =
1426                 m.constFind(param);
1427         if (iter == m.constEnd())
1428                 return QString();
1429
1430         return iter.value();
1431 }
1432
1433 void CLexicon::MakeBrokenCorpus( QString outputFileName )
1434 {
1435         Q_ASSERT( !outputFileName.isEmpty() );
1436         QFile outFile( outputFileName );
1437
1438         CCorpusWord*    pCorpusWord;
1439
1440         if( outFile.open( QIODevice::WriteOnly ) )
1441         {
1442                 Q3TextStream    out( &outFile );
1443                 out.setEncoding( Q3TextStream::Unicode );
1444
1445                 for( QStringList::Iterator lineIt = m_Corpus.begin(); lineIt != m_Corpus.end(); ++lineIt )
1446                 {
1447                         QStringList line = QStringList::split( " ", *lineIt );
1448
1449                         for( QStringList::Iterator wordIt = line.begin(); wordIt != line.end(); ++wordIt )
1450                         {
1451                                 pCorpusWord = m_CorpusMap.find( *wordIt );
1452
1453                                 if( pCorpusWord ) out << pCorpusWord->Display( '+', m_pOutFilter ) << " ";
1454                                 else out << *wordIt;
1455                         }
1456
1457                         out << endl;
1458                 }
1459         }
1460 }
1461
1462
1463
1464
1465 int CLexicon::NewMiniLexicon()
1466 {
1467   int pos;
1468
1469   for( pos = m_pMiniLexica->size()-1; pos >= 0; pos-- )
1470   {
1471           if( (*m_pMiniLexica)[pos] )
1472           {
1473                   break;
1474           }
1475   }
1476   pos++;
1477
1478   CMiniLexicon* mini = new CMiniLexicon( this, pos );
1479
1480         if (pos >= 0 && static_cast<unsigned int>(pos) >=
1481                         m_pMiniLexica->size())
1482                 m_pMiniLexica->resize(m_pMiniLexica->size()*2);
1483
1484   m_pMiniLexica->insert( pos, mini );
1485
1486   return pos;
1487 }
1488
1489
1490 void CLexicon::DeleteMiniLexicon( int pos )
1491 {
1492         int next, last;
1493
1494         if( pos == 0 )
1495         {
1496                  QMessageBox::information( NULL, QString( "Linguistica" ), QString( "Sorry, you cannot delete the first mini-lexicon. If you want \nto clear all data, use \'Clear Lexicon\' in the \'Edit\' menu." ) );
1497                  return;
1498         }
1499
1500         m_pMiniLexica->remove(pos);
1501
1502         for( last = m_pMiniLexica->size()-1; last >= 0; last-- )
1503         {
1504                 if( (*m_pMiniLexica)[last] )
1505                 {
1506                         break;
1507                 }
1508         }
1509
1510         if( last < (static_cast <int> (m_pMiniLexica->size()))/2 - 1 ) m_pMiniLexica->resize( m_pMiniLexica->size()/2 );
1511
1512         if( pos-1 >= 0 && (*m_pMiniLexica)[pos-1] ) SetActiveMiniIndex( pos - 1 );
1513         else
1514         {
1515                 for( next = pos; next <= last; next++ )
1516                 {
1517                         if( (*m_pMiniLexica)[next] )
1518                         {
1519                                 SetActiveMiniIndex( next );
1520                         }
1521                 }
1522         }
1523 }
1524
1525 void CLexicon::ClearMiniLexicon( int pos )
1526 {
1527   (*m_pMiniLexica)[pos]->ClearAll();
1528 }
1529
1530 int CLexicon::GetCorpusCount() { return m_pCorpusWords->GetCorpusCount(); }
1531
1532 // All Stems
1533 ////////////////////////////////////////////////////////////////////
1534
1535 QList<CStem*>* CLexicon::GetStemSet( const CStringSurrogate& stem )
1536 {
1537         return m_AllStems.find( stem.Display() );
1538 }
1539
1540
1541 bool CLexicon::InsertStem( CStem* stem )
1542 {
1543         if( !stem ) return FALSE;
1544
1545         // Get or create the set
1546         QList<CStem*>* set = m_AllStems.find( stem->Display() );
1547         if( !set )
1548         {
1549                 // Make sure the dictionary is large enough
1550                 if( m_AllStems.count() >= m_AllStems.size() )
1551                 {
1552                         m_AllStems.resize( getNextPrimeStep( m_AllStems.size() ) );
1553                 }
1554
1555                 set = new QList<CStem*>();
1556                 m_AllStems.insert( stem->Display(), set );
1557         }
1558
1559         // Do not insert duplicates
1560         if( set->indexOf( stem ) >= 0 ) return FALSE;
1561
1562         // Insert the stem
1563         set->prepend( stem );
1564
1565         return TRUE;
1566 }
1567
1568
1569 bool CLexicon::RemoveStem( CStem* stem )
1570 {
1571         if( !stem ) return FALSE;
1572
1573         // Get the set
1574         QList<CStem*>* set = m_AllStems.find( stem->Display() );
1575         if( !set ) return FALSE;
1576
1577         // Remove the stem
1578         if( !set->remove( stem ) ) return FALSE;
1579
1580         // Remove the set also if there are no more stems
1581         if( set->isEmpty() )
1582         {
1583                 m_AllStems.remove( stem->Display() );
1584
1585                 // Shrink the dictionary if it's too large
1586                 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllStems.size() ) );
1587                 if( static_cast <int> ( m_AllStems.count() ) < smaller_size )
1588                 {
1589                         m_AllStems.resize( smaller_size );
1590                 }
1591         }
1592
1593         return TRUE;
1594 }
1595
1596
1597 // All Suffixes
1598 ////////////////////////////////////////////////////////////////////
1599
1600 QList<CSuffix*>* CLexicon::GetSuffixSet( const CStringSurrogate& suffix )
1601 {
1602         return m_AllSuffixes.find( suffix.Display() );
1603 }
1604
1605
1606 bool CLexicon::InsertSuffix( CSuffix* suffix )
1607 {
1608         if( !suffix ) return FALSE;
1609
1610         // Get or create the set
1611         QList<CSuffix*>* set = m_AllSuffixes.find( suffix->Display() );
1612         if( !set )
1613         {
1614                 // Make sure the dictionary is large enough
1615                 if( m_AllSuffixes.count() >= m_AllSuffixes.size() )
1616                 {
1617                         m_AllSuffixes.resize( getNextPrimeStep( m_AllSuffixes.size() ) );
1618                 }
1619
1620                 set = new QList<CSuffix*>();
1621                 m_AllSuffixes.insert( suffix->Display(), set );
1622         }
1623
1624         // Do not insert duplicates
1625         if( set->indexOf( suffix ) >= 0 ) return FALSE;
1626
1627         // Insert the suffix
1628         set->prepend( suffix );
1629
1630         return TRUE;
1631 }
1632
1633
1634 bool CLexicon::RemoveSuffix( CSuffix* suffix )
1635 {
1636         if( !suffix ) return FALSE;
1637
1638         // Get the set
1639         QList<CSuffix*>* set = m_AllSuffixes.find( suffix->Display() );
1640         if( !set ) return FALSE;
1641
1642         // Remove the suffix
1643         if( !set->remove( suffix ) ) return FALSE;
1644
1645         // Remove the set also if there are no more suffixes
1646         if( set->isEmpty() )
1647         {
1648                 m_AllSuffixes.remove( suffix->Display() );
1649
1650                 // Shrink the dictionary if it's too large
1651                 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllSuffixes.size() ) );
1652                 if( static_cast <int> ( m_AllSuffixes.count() ) < smaller_size )
1653                 {
1654                         m_AllSuffixes.resize( smaller_size );
1655                 }
1656         }
1657
1658         return TRUE;
1659 }
1660
1661
1662 // All Suffix Signatures
1663 ////////////////////////////////////////////////////////////////////
1664
1665 QList<CSignature*>* CLexicon::GetSuffixSigSet( const CStringSurrogate& sig )
1666 {
1667         return m_AllSuffixSigs.find( sig.Display() );
1668 }
1669
1670
1671 bool CLexicon::InsertSuffixSig( CSignature* sig )
1672 {
1673         if( !sig ) return FALSE;
1674
1675         // Get or create the set
1676         QList<CSignature*>* set = m_AllSuffixSigs.find( sig->Display() );
1677         if( !set )
1678         {
1679                 // Make sure the dictionary is large enough
1680                 if( m_AllSuffixSigs.count() >= m_AllSuffixSigs.size() )
1681                 {
1682                         m_AllSuffixSigs.resize( getNextPrimeStep( m_AllSuffixSigs.size() ) );
1683                 }
1684
1685                 set = new QList<CSignature*>();
1686                 m_AllSuffixSigs.insert( sig->Display(), set );
1687         }
1688
1689         // Do not insert duplicates
1690         if( set->indexOf( sig ) >= 0 ) return FALSE;
1691
1692         // Insert the signature
1693         set->prepend( sig );
1694
1695         return TRUE;
1696 }
1697
1698
1699 bool CLexicon::RemoveSuffixSig( CSignature* sig )
1700 {
1701         if( !sig ) return FALSE;
1702
1703         // Get the set
1704         QList<CSignature*>* set = m_AllSuffixSigs.find( sig->Display() );
1705         if( !set ) return FALSE;
1706
1707         // Remove the signature
1708         if( !set->remove( sig ) ) return FALSE;
1709
1710         // Remove the set also if there are no more signatures
1711         if( set->isEmpty() )
1712         {
1713                 m_AllSuffixSigs.remove( sig->Display() );
1714
1715                 // Shrink the dictionary if it's too large
1716                 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllSuffixSigs.size() ) );
1717                 if( static_cast <int> (m_AllSuffixSigs.count()) < smaller_size )
1718                 {
1719                         m_AllSuffixSigs.resize( smaller_size );
1720                 }
1721         }
1722
1723         return TRUE;
1724 }
1725
1726
1727 // All Prefixes
1728 ////////////////////////////////////////////////////////////////////
1729
1730 QList<CPrefix*>* CLexicon::GetPrefixSet( const CStringSurrogate& prefix )
1731 {
1732         return m_AllPrefixes.find( prefix.Display() );
1733 }
1734
1735
1736 bool CLexicon::InsertPrefix( CPrefix* prefix )
1737 {
1738         if( !prefix ) return FALSE;
1739
1740         // Get or create the set
1741         QList<CPrefix*>* set = m_AllPrefixes.find( prefix->Display() );
1742         if( !set )
1743         {
1744                 // Make sure the dictionary is large enough
1745                 if( m_AllPrefixes.count() >= m_AllPrefixes.size() )
1746                 {
1747                         m_AllPrefixes.resize( getNextPrimeStep( m_AllPrefixes.size() ) );
1748                 }
1749
1750                 set = new QList<CPrefix*>();
1751                 m_AllPrefixes.insert( prefix->Display(), set );
1752         }
1753
1754         // Do not insert duplicates
1755         if( set->indexOf( prefix ) >= 0 ) return FALSE;
1756
1757         // Insert the prefix
1758         set->prepend( prefix );
1759
1760         return TRUE;
1761 }
1762
1763
1764 bool CLexicon::RemovePrefix( CPrefix* prefix )
1765 {
1766         if( !prefix ) return FALSE;
1767
1768         // Get the set
1769         QList<CPrefix*>* set = m_AllPrefixes.find( prefix->Display() );
1770         if( !set ) return FALSE;
1771
1772         // Remove the prefix
1773         if( !set->remove( prefix ) ) return FALSE;
1774
1775         // Remove the set also if there are no more prefixes
1776         if( set->isEmpty() )
1777         {
1778                 m_AllPrefixes.remove( prefix->Display() );
1779
1780                 // Shrink the dictionary if it's too large
1781                 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllPrefixes.size() ) );
1782                 if(static_cast <int> ( m_AllPrefixes.count()) < smaller_size )
1783                 {
1784                         m_AllPrefixes.resize( smaller_size );
1785                 }
1786         }
1787
1788         return TRUE;
1789 }
1790
1791
1792 // All Prefix Signatures
1793 ////////////////////////////////////////////////////////////////////
1794
1795 QList<CSignature*>* CLexicon::GetPrefixSigSet( const CStringSurrogate& sig )
1796 {
1797         return m_AllPrefixSigs.find( sig.Display() );
1798 }
1799
1800
1801 bool CLexicon::InsertPrefixSig( CSignature* sig )
1802 {
1803         if( !sig ) return FALSE;
1804
1805         // Get or create the set
1806         QList<CSignature*>* set = m_AllPrefixSigs.find( sig->Display() );
1807         if( !set )
1808         {
1809                 // Make sure the dictionary is large enough
1810                 if( m_AllPrefixSigs.count() >= m_AllPrefixSigs.size() )
1811                 {
1812                         m_AllPrefixSigs.resize( getNextPrimeStep( m_AllPrefixSigs.size() ) );
1813                 }
1814
1815                 set = new QList<CSignature*>();
1816                 m_AllPrefixSigs.insert( sig->Display(), set );
1817         }
1818
1819         // Do not insert duplicates
1820         if( set->indexOf( sig ) >= 0 ) return FALSE;
1821
1822         // Insert the signature
1823         set->prepend( sig );
1824
1825         return TRUE;
1826 }
1827
1828
1829 bool CLexicon::RemovePrefixSig( CSignature* sig )
1830 {
1831         if( !sig ) return FALSE;
1832
1833         // Get the set
1834         QList<CSignature*>* set = m_AllPrefixSigs.find( sig->Display() );
1835         if( !set ) return FALSE;
1836
1837         // Remove the signature
1838         if( !set->remove( sig ) ) return FALSE;
1839
1840         // Remove the set also if there are no more signatures
1841         if( set->isEmpty() )
1842         {
1843                 m_AllPrefixSigs.remove( sig->Display() );
1844
1845                 // Shrink the dictionary if it's too large
1846                 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllPrefixSigs.size() ) );
1847                 if( static_cast <int> (m_AllPrefixSigs.count()) < smaller_size )
1848                 {
1849                         m_AllPrefixSigs.resize( smaller_size );
1850                 }
1851         }
1852
1853         return TRUE;
1854 }
1855
1856
1857 // All Words
1858 ////////////////////////////////////////////////////////////////////
1859
1860 QList<CStem*>* CLexicon::GetWordSet( const CStringSurrogate& word )
1861 {
1862         return m_AllWords.find( word.Display() );
1863 }
1864
1865
1866 bool CLexicon::InsertWord( CStem* word )
1867 {
1868         if( !word ) return FALSE;
1869
1870         // Get or create the set
1871         QList<CStem*>* set = m_AllWords.find( word->Display() );
1872         if( !set )
1873         {
1874                 // Make sure the dictionary is large enough
1875                 if( m_AllWords.count() >= m_AllWords.size() )
1876                 {
1877                         m_AllWords.resize( getNextPrimeStep( m_AllWords.size() ) );
1878                 }
1879
1880                 set = new QList<CStem*>();
1881                 m_AllWords.insert( word->Display(), set );
1882         }
1883
1884         // Do not insert duplicates
1885         if( set->indexOf( word ) >= 0 ) return FALSE;
1886
1887         // Insert the stem
1888         set->prepend( word );
1889
1890         return TRUE;
1891 }
1892
1893
1894 bool CLexicon::RemoveWord( CStem* word )
1895 {
1896         if( !word ) return FALSE;
1897
1898         // Get the set
1899         QList<CStem*>* set = m_AllWords.find( word->Display() );
1900         if( !set ) return FALSE;
1901
1902         // Remove the stem
1903         if( !set->remove( word ) ) return FALSE;
1904
1905         // Remove the set also if there are no more stems
1906         if( set->isEmpty() )
1907         {
1908                 m_AllWords.remove( word->Display() );
1909
1910                 // Shrink the dictionary if it's too large
1911                 int smaller_size = getPrevPrimeStep( getPrevPrimeStep( m_AllWords.size() ) );
1912                 if( static_cast <int> (m_AllWords.count()) < smaller_size )
1913                 {
1914                         m_AllWords.resize( smaller_size );
1915                 }
1916         }
1917
1918         return TRUE;
1919 }
1920
1921
1922 bool CLexicon::LogFileOn()
1923 {
1924         return m_pDoc->LogFileOn();
1925 }
1926
1927
1928 QTextStream* CLexicon::GetLogFileStream()
1929 {
1930         return m_pDoc->GetLogFileStream();
1931 }
1932
1933 StateEmitHMM* CLexicon::GetHMM()
1934 {
1935         return m_HMM;
1936 }
1937
1938 StateEmitHMM* CLexicon::CreateNewHMM()
1939 {
1940         if (m_HMM) { delete m_HMM;}
1941         m_HMM = new StateEmitHMM (this);
1942         return m_HMM;
1943 }
1944 void CLexicon::UpdateCompound( QString compound )
1945 {
1946         if( m_CompoundUpdates.find( compound ) == m_CompoundUpdates.end() )
1947         {
1948                 m_CompoundUpdates.append( compound );
1949         }
1950 }
1951
1952
1953 void CLexicon::UpdateWord( CStem* pWord )
1954 {
1955         if( m_WordUpdates.indexOf( pWord ) < 0 )
1956         {
1957                 m_WordUpdates.append( pWord );
1958         }
1959 }
1960
1961
1962 void CLexicon::DoWordUpdates()
1963 {
1964         CStem* pStem,
1965                  * qStem,
1966                  * rStem;
1967
1968         CPrefix* pPrefix;
1969         CSuffix* pSuffix;
1970         CCompound* pCompound;
1971
1972         CCorpusWord*            pCorpusWord;
1973         int                     myMiniIndex, start, end;
1974         bool                    startValid, endValid;
1975         int*                    wPieces, * sPieces;
1976         QList<CStem*>*        myWords;
1977         QList<CStem*>         wordQueue;
1978
1979     m_pCorpusWords->SetUpdateFlags( FALSE );
1980
1981     while( !m_WordUpdates.isEmpty() )
1982         {
1983                 pStem = m_WordUpdates.takeAt(0);
1984
1985                 // Check if this word has already been updated
1986                 pCorpusWord = *m_pCorpusWords ^= pStem->GetKey();
1987                 if( !pCorpusWord || pCorpusWord->IsUpdated() ) continue;
1988
1989                 // If it's also a compound, override
1990                 pCompound = *m_pCompounds ^= pStem->GetKey();
1991                 if( pCompound )
1992                 {
1993                         pCorpusWord->SetUpdated( TRUE );
1994                         continue;
1995                 }
1996
1997                 wordQueue.append( pStem );
1998                 while( !wordQueue.isEmpty() )
1999                 {
2000                         qStem = wordQueue.takeAt(0);
2001
2002                         if( pStem->GetMyMini()->GetPrefixes() )
2003                         {
2004                                 pPrefix = *pStem->GetMyMini()->GetPrefixes() ^= pStem->GetPrefix();
2005                         }
2006                         else pPrefix = NULL;
2007
2008                         if( pStem->GetMyMini()->GetSuffixes() )
2009                         {
2010                                 pSuffix = *pStem->GetMyMini()->GetSuffixes() ^= pStem->GetSuffix();
2011                         }
2012                         else pSuffix = NULL;
2013
2014                         if( pStem->GetMyMini()->GetStems() )
2015                         {
2016                                 CSS cssStem = pStem->GetStem();
2017                                 rStem = *pStem->GetMyMini()->GetStems() ^= cssStem;
2018                         }
2019                         else rStem = NULL;
2020
2021             // Update this stem in the corpus if it exists
2022                         // as a word
2023                         pCorpusWord = *m_pCorpusWords ^= qStem->GetKey();
2024                         if( pCorpusWord )
2025                         {
2026                                 if( pCorpusWord->IsUpdated() ) continue;
2027                                 pCorpusWord->SetUpdated();
2028
2029                                 wPieces = pCorpusWord->GetPieces();
2030                                 sPieces = pStem->GetPieces();
2031
2032                                 start = pCorpusWord->Display().find( pStem->Display() );
2033                                 end = start + pStem->GetKeyLength();
2034
2035                                 startValid = FALSE;
2036                                 endValid = FALSE;
2037                                 for (int i = 0; i <= pCorpusWord->Size(); ++i) {
2038                                         if( start == wPieces[i] ) startValid = TRUE;
2039                                         if( end == wPieces[i] ) endValid = TRUE;
2040                                 }
2041
2042                                 Q_ASSERT( startValid && endValid );
2043                                 if( startValid && endValid )
2044                                 {
2045                                         for (int i = 0, j = 0;
2046                                                         i <= pCorpusWord->Size() &&
2047                                                         j <= pStem->Size(); ++i) {
2048                                                 if( wPieces[i] > start )
2049                                                 {
2050                                                         j++;
2051                                                         if( j > pStem->Size() ) break;
2052
2053                                                         if( wPieces[i] < sPieces[j] )
2054                                                         {
2055                                                                 pCorpusWord->MergePieces(i);
2056                                                                 break;
2057                                                         }
2058                                                         else if( sPieces[j] < wPieces[i] )
2059                                                         {
2060                                                                 pCorpusWord->CutRightBeforeHere( sPieces[j] );
2061                                                                 if( pSuffix )
2062                                                                 {
2063                                                                         pCorpusWord->SetMorpheme( i, rStem );
2064                                                                         pCorpusWord->SetMorpheme( i+1, pSuffix );
2065                                                                 }
2066                                                                 else if( pPrefix )
2067                                                                 {
2068                                                                         pCorpusWord->SetMorpheme( i, pPrefix );
2069                                                                         pCorpusWord->SetMorpheme( i+1, rStem );
2070                                                                 }
2071                                                                 break;
2072                                                         }
2073                                                 }
2074                                         }
2075                                 }
2076                         }
2077
2078                         // Add "parent" "words"
2079                         myMiniIndex = qStem->GetMyMini()->GetIndex();
2080                         Q_ASSERT( myMiniIndex >= 0 );
2081                         Q_ASSERT( myMiniIndex < static_cast <int> ( m_pMiniLexica->size()) );
2082
2083                         for (unsigned int i = 0; i < m_pMiniLexica->size(); ++i) {
2084                                 if (myMiniIndex >= 0 &&
2085                                                 static_cast<unsigned int>(myMiniIndex) == i)
2086                                         continue;
2087                                 if( (*m_pMiniLexica)[i] == NULL ) continue;
2088
2089                                 rStem = *(*m_pMiniLexica)[i]->GetStems() ^= qStem->GetKey();
2090
2091                                 if( !rStem ) continue;
2092
2093                                 myWords = rStem->GetWordPtrList();
2094
2095                                 //for( rStem = myWords->first(); rStem; rStem = myWords->next() )
2096                                 for (int z = 0; z < myWords->size(); z++)
2097                                 {       rStem = myWords->at (z);
2098                                         wordQueue.append( rStem );
2099                                 }
2100                         }
2101                 }
2102         }
2103
2104         QString compound;
2105
2106         // Update from compounds list
2107         for( QStringList::Iterator it = m_CompoundUpdates.begin(); it != m_CompoundUpdates.end(); ++it )
2108         {
2109                 compound = (*it);
2110                 pCompound = *m_pCompounds ^= CSS( compound );
2111                 pCorpusWord = *m_pCorpusWords ^= CSS( compound );
2112
2113
2114                 if( !pCorpusWord )
2115                 {
2116                         // This happens when the last component is a stem
2117                         // found in the suffix analysis of one of the
2118                         // mini-lexica or the first component is a stem
2119                         // found in the prefix analysis of one of the
2120                         // mini-lexica
2121
2122                         wordQueue.clear();
2123
2124                         // Add all stems containing this compound to the queue
2125                         for (unsigned int i = 0; i < m_pMiniLexica->size(); ++i) {
2126                                 if( (*m_pMiniLexica)[i] == NULL ) continue;
2127
2128                                 rStem = *(*m_pMiniLexica)[i]->GetStems() ^= compound;
2129
2130                                 if( !rStem ) continue;
2131
2132                                 myWords = rStem->GetWordPtrList();
2133
2134                                 //for( rStem = myWords->first(); rStem; rStem = myWords->next() )
2135                                 for (int y = 0; y < myWords->size(); y++)
2136                                 {       rStem = myWords->at(y);
2137                                         wordQueue.append( rStem );
2138                                 }
2139                         }
2140
2141                         // Update all ancestor corpus words
2142                         while( !wordQueue.isEmpty() )
2143                         {
2144                                 pStem = wordQueue.takeAt(0);
2145
2146                                 // Update this stem in the corpus if it exists
2147                                 // as a word
2148                                 pCorpusWord = *m_pCorpusWords ^= pStem->GetKey();
2149                                 if( pCorpusWord )
2150                                 {
2151                                         if( pCorpusWord->IsUpdated() ) continue;
2152                                         pCorpusWord->SetUpdated();
2153
2154                                         start = pCorpusWord->Display().find( compound );
2155                                         end = start + compound.length();
2156
2157                                         wPieces = pCorpusWord->GetPieces();
2158
2159                                         startValid = FALSE;
2160                                         endValid = FALSE;
2161                                         for (int i = 0; i <= pCorpusWord->Size(); ++i) {
2162                                                 if( start == wPieces[i] ) startValid = TRUE;
2163                                                 if( end == wPieces[i] ) endValid = TRUE;
2164                                         }
2165
2166                                         // We need to match the cuts in the compound
2167                                         // to the substring of the corpus word or remove
2168                                         // cuts in the substring if pCompound doesn't
2169                                         // exist. Also, we need to add parents of each
2170                                         // word to the queue and do those too.
2171                                         if( startValid && endValid )
2172                                         {
2173                                                 if( pCompound )
2174                                                 {
2175                                                         sPieces = pCompound->GetPieces();
2176                                                         for (int i = 0, j = 0;
2177                                                                 i <= pCorpusWord->Size() && j <= pCompound->Size(); i++ )
2178                                                         {
2179                                                                 if( wPieces[i] > start )
2180                                                                 {
2181                                                                         j++;
2182                                                                         if( j > pCompound->Size() ) break;
2183
2184                                                                         if( wPieces[i] < sPieces[j] )
2185                                                                         {
2186                                                                                 pCorpusWord->MergePieces(i);
2187                                     }
2188                                                                         else if( sPieces[j] < wPieces[i] )
2189                                                                         {
2190
2191                                                                                 pCorpusWord->CutRightBeforeHere( sPieces[j] );
2192                                     }
2193                                                                 }
2194                                                         }
2195
2196                                                         // Now set the morphemes of the corpus word
2197                                                         // equal to those of the compound
2198                                                         for (int i = 1; i <= pCompound->Size(); ++i) {
2199                                                                 if( pCompound->GetComponent(i) ) pCorpusWord->SetMorpheme( i, pCompound->GetComponent(i)->at(0) );
2200                                                                 else if( pCompound->GetLinker(i) ) pCorpusWord->SetMorpheme( i, pCompound->GetLinker(i) );
2201                                                         }
2202                                                 }
2203                                                 else
2204                                                 {
2205                                                         int first = -1;
2206                                                         for (int i = 0; i <= pCorpusWord->Size(); ++i) {
2207                                                                 if( wPieces[i] > start && wPieces[i] < end )
2208                                                                 {
2209                                                                         pCorpusWord->MergePieces(i);
2210 //qDebug( pCorpusWord->Display('.') );
2211                                                                         if( first < 0 ) first = i;
2212                                                                 }
2213                                                         }
2214
2215                                                         // Now set the morpheme of the corpus word
2216                                                         // equal to the stem or word the old compound
2217                                                         // came from
2218                                                         StemSet* pStemSet = GetAllStems()->find( compound );
2219                                                         if( !pStemSet ) pStemSet = GetAllWords()->find( compound );
2220                                                         if( pStemSet ) pCorpusWord->SetMorpheme( first, pStemSet->at(0) );
2221                                                 }
2222                                         }
2223                                 }
2224                         }
2225
2226                         continue;
2227                 }
2228
2229                 if( !pCompound )
2230                 {
2231                         // Compound deleted, remove all cuts
2232                         pCorpusWord->SimplifyParseStructure();
2233                 }
2234                 else
2235                 {
2236                         // Make cuts match compound
2237                         pCorpusWord->CopyParseStructure( pCompound );
2238                 }
2239 //if( pCompound ) qDebug( pCompound->Display( '.' ) );
2240         }
2241
2242         m_CompoundUpdates.clear();
2243 }