Remove the (undocumented) query-planner control that prevents
[sqlite.git] / ext / fts5 / fts5_unicode2.c
blob1ef56f61567cc72934b32113546f4d9f4c2d17d2
/*
** 2012 May 25
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
*/

/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/
19 #include <assert.h>
/*
** Return true if the argument corresponds to a unicode codepoint
** classified as either a letter or a number. Otherwise false.
**
** The results are undefined if the value passed to this function
** is less than zero.
**
** Codepoints of (1<<22) or greater cannot be represented in the lookup
** table below; for those this function returns 1.
*/
int sqlite3Fts5UnicodeIsalnum(int c){
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  **
  ** The entries are sorted in ascending order so that the array can be
  ** binary searched.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
    0x380400F0,
  };

  /* Bitmap covering codepoints 0..127, 32 codepoints per word. A set bit
  ** means the corresponding codepoint is NOT a letter or a number. */
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  if( (unsigned int)c<128 ){
    /* Use an unsigned 1 so that testing bit 31 (c = 31, 63, 95, 127) does
    ** not shift into the sign bit of a signed int, which is undefined. */
    return ( (aAscii[c >> 5] & (1u << (c & 0x001F)))==0 );
  }else if( (unsigned int)c<(1<<22) ){
    /* Setting the low 10 bits of the key makes it compare greater than or
    ** equal to any table entry that starts at codepoint c, whatever the
    ** size stored in that entry. */
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;

    /* Binary search for the last entry that starts at or before c. */
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );

    /* c is alphanumeric if it lies at or beyond the end of that range. */
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}
/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
**
** Codepoints that fall inside a table range whose replacement is '\0'
** cause 0 to be returned. Codepoints that are not covered by any table
** range are returned unchanged.
*/
static int fts5_remove_diacritic(int c){
  /* Each entry packs a range of codepoints as ((F<<3) | S), where F is the
  ** first codepoint of the range and S (0..7) is the distance from F to
  ** the last codepoint of the range. Entries are sorted in ascending
  ** order. aChar[i] is the replacement for every codepoint in the range
  ** described by aDia[i].
  **
  ** Both tables are read-only, so they are declared static const rather
  ** than being rebuilt on the stack each time this function is called. */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928,
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234,
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504,
     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529,
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
    62924, 63050, 63082, 63274, 63390,
  };
  static const char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',
    'e',  'i',  'o',  'u',  'y',
  };

  /* Setting the low 3 bits of the key makes it compare greater than or
  ** equal to any entry whose range starts at c, whatever span it stores. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;

  /* Binary search for the last range that starts at or before c. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  /* If c lies beyond the end of that range it has no mapping. */
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
}
/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
**
** Only codepoints in the range 768..817 are ever reported as diacritics.
** The value returned for a diacritic is non-zero but not necessarily 1
** (it is the matching bit of the mask), so callers should test it for
** truth rather than compare it against 1.
*/
int sqlite3Fts5UnicodeIsdiacritic(int c){
  /* Bit N of mask0 is set if codepoint 768+N is a diacritic; bit N of
  ** mask1 is set if codepoint 800+N is a diacritic. */
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;
  /* Shift an unsigned 1: the shift count reaches 31 when c==799, and
  ** (1 << 31) on a signed int is undefined behaviour. */
  return (c < 768+32) ?
      (mask0 & (1u << (c-768))) :
      (mask1 & (1u << (c-768-32)));
}
/*
** Interpret the argument as a unicode codepoint. If the codepoint
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** If bRemoveDiacritic is non-zero and c is in the range 128..65535, the
** (possibly folded) codepoint is also passed through
** fts5_remove_diacritic() and the result of that call is returned.
**
** Codepoints below 128 are handled with a direct 'A'..'Z' test, and
** codepoints 66560..66599 are folded by adding 40. All other codepoints
** of 65536 or greater are returned unchanged.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
  ** every second codepoint in the range, starting with codepoint iCode.
  **
  ** The 7 most significant bits in flags are an index into the aiOff[]
  ** array. If a specific codepoint C does require folding, then its lower
  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char flags;
    unsigned char nRange;
  } aEntry[] = {
    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
    {65313, 14, 26},
  };
  /* Offsets added (modulo 65536) to an upper case codepoint to obtain its
  ** lower case form. Indexed by (flags>>1) from the table above. */
  static const unsigned short aiOff[] = {
   1,     2,     8,     15,    16,    26,    28,    32,
   37,    38,    40,    48,    63,    64,    69,    71,
   79,    80,    116,   202,   203,   205,   206,   207,
   209,   210,   211,   213,   214,   217,   218,   219,
   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721,
   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
   65514, 65521, 65527, 65528, 65529,
  };

  int ret = c;

  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    const struct TableEntry *p;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    /* Binary search for the last rule whose range starts at or before c. */
    assert( c>aEntry[0].iCode );
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }

    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    /* The rule applies if c is inside the range and, for "every second
    ** codepoint" rules (low bit of flags set), c has the same parity as
    ** the first codepoint of the range. */
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }

    if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
  }

  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}