For the amalgamation-tarball, enable FTS5 and JSON1 by default and
[sqlite.git] / ext / fts3 / fts3_unicode2.c
blobda7251ed0cb015dd2056f2979c4fbec4070cffc9
1 /*
2 ** 2012 May 25
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
15 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
18 #ifndef SQLITE_DISABLE_FTS3_UNICODE
19 #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
21 #include <assert.h>
/*
** Return true (1) if the argument corresponds to a unicode codepoint
** classified as either a letter or a number. Otherwise return false (0).
**
** The results are documented as undefined if the value passed to this
** function is less than zero. As implemented, any value that is not in
** the range 0..(1<<22)-1 once cast to unsigned (which includes all
** negative values) falls through to the final "return 1".
*/
int sqlite3FtsUnicodeIsalnum(int c){
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  **
  ** The array is sorted in ascending order so that it can be binary
  ** searched below.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
    0x380400F0,
  };

  /* Bitmap covering codepoints 0..127, 32 codepoints per word. Bit (c&31)
  ** of aAscii[c>>5] is set if ASCII codepoint c is NOT a letter or digit.
  */
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  if( (unsigned int)c<128 ){
    /* ASCII fast path: a clear bit in the bitmap means "alphanumeric". */
    return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
  }else if( (unsigned int)c<(1<<22) ){
    /* Setting the low 10 bits of the key makes every entry whose first
    ** codepoint is <= c compare less-than-or-equal to the key, regardless
    ** of the range size stored in that entry. */
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;

    /* Binary search for the last entry that is <= key. */
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );

    /* c is alphanumeric iff it lies at or beyond the end of the
    ** non-alphanumeric range that starts at or before it. */
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}
/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
**
** Codepoints that are not covered by any range in the table below are
** returned unchanged. Ranges whose replacement character is '\0' cause
** 0 to be returned.
*/
static int remove_diacritic(int c){
  /* Each aDia[] entry encodes a range of codepoints as
  ** ((first codepoint)<<3 | extra), covering first..first+extra inclusive.
  ** aChar[i] is the replacement character for the range in aDia[i]. Both
  ** tables are read-only, so they are declared static const rather than
  ** being rebuilt on the stack on every call.
  */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928,
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234,
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504,
     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529,
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
    62924, 63050, 63082, 63274, 63390,
  };
  static const char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',
    'e',  'i',  'o',  'u',  'y',
  };

  /* Setting the low 3 bits of the key makes every entry whose first
  ** codepoint is <= c compare less-than-or-equal to the key. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;

  /* Binary search for the last entry that is <= key. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  /* If c lies past the end of the matched range it has no mapping. */
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
}
/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
**
** Only codepoints 768..817 are considered. mask0 holds one bit per
** codepoint for 768..799 and mask1 one bit per codepoint for 800..817;
** a set bit means "is a diacritic". The return value is the selected mask
** bit itself (zero or some non-zero value), not necessarily 1.
*/
int sqlite3FtsUnicodeIsdiacritic(int c){
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;
  /* Shift an unsigned 1: the shift count reaches 31 when c==799, and
  ** shifting a signed 1 into the sign bit is undefined behaviour. */
  return (c < 768+32) ?
      (mask0 & ((unsigned int)1 << (c-768))) :
      (mask1 & ((unsigned int)1 << (c-768-32)));
}
/*
** Interpret the argument as a unicode codepoint. If the codepoint
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** If bRemoveDiacritic is non-zero and c is in the range 128..65535, the
** (possibly folded) result is additionally passed through
** remove_diacritic() before being returned.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
  ** every second codepoint in the range, starting with codepoint iCode.
  **
  ** The 7 most significant bits in flags are an index into the aiOff[]
  ** array. If a specific codepoint C does require folding, then its lower
  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char flags;
    unsigned char nRange;
  } aEntry[] = {
    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
    {65313, 14, 26},
  };

  /* Offsets added (modulo 65536) to a codepoint to fold it. Indexed by
  ** (TableEntry.flags >> 1). */
  static const unsigned short aiOff[] = {
   1,     2,     8,     15,    16,    26,    28,    32,
   37,    38,    40,    48,    63,    64,    69,    71,
   79,    80,    116,   202,   203,   205,   206,   207,
   209,   210,   211,   213,   214,   217,   218,   219,
   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721,
   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
   65514, 65521, 65527, 65528, 65529,
  };

  int ret = c;

  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    /* ASCII fast path: only 'A'..'Z' need folding. */
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    const struct TableEntry *p;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    /* Binary search for the last rule whose iCode is <= c. */
    assert( c>aEntry[0].iCode );
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }

    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    /* Apply the rule if c is inside its range and, when the low flag bit
    ** is set, c has the same parity as the first codepoint of the range. */
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }

    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
  }

  /* Codepoints 66560..66599 fold by adding a fixed offset of 40. */
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
363 #endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
364 #endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */