Update Scintilla to version 3.7.1
[geany-mirror.git] / scintilla / src / CaseConvert.cxx
blob4fb7559032ee7beb162d0ef4e006d5493076a3d1
1 // Scintilla source code edit control
2 // Encoding: UTF-8
3 /** @file CaseConvert.cxx
4 ** Case fold characters and convert them to upper or lower case.
5 ** Tables automatically regenerated by scripts/GenerateCaseConvert.py
6 ** Should only be rarely regenerated for new versions of Unicode.
7 **/
8 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
9 // The License.txt file describes the conditions under which this software may be distributed.
11 #include <cstring>
13 #include <stdexcept>
14 #include <string>
15 #include <vector>
16 #include <algorithm>
18 #include "StringCopy.h"
19 #include "CaseConvert.h"
20 #include "UniConversion.h"
21 #include "UnicodeFromUTF8.h"
23 #ifdef SCI_NAMESPACE
24 using namespace Scintilla;
25 #endif
27 namespace {
28 // Use an unnamed namespace to protect the declarations from name conflicts
// Unicode code points are ordered by groups and follow patterns.
// Most characters (pitch==1) are in ranges for a particular alphabet and their
// upper case forms are a fixed distance away.
// Another pattern (pitch==2) is where each lower case letter is preceded by
// the upper case form. These are also grouped into ranges.
// Each row is (lower, upper, range length, range pitch) and stands for the
// 'range length' pairs (lower + k*pitch, upper + k*pitch) with k = 0 .. length-1.
// The table is only ever read so it is const.

const int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,44,2,
1377,1329,38,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,12,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42913,42912,5,2,
65345,65313,26,1,
66600,66560,40,1,

//--Autogenerated -- end of section automatically generated
};
// Code points that are symmetric but don't fit into a range of similar characters
// are listed here.
// Each row is one (lower, upper) pair of code points.
// The table is only ever read so it is const.

const int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
608,403,
611,404,
613,42893,
614,42922,
616,407,
617,406,
619,11362,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
643,425,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1016,1015,
1019,1018,
1231,1216,
7545,42877,
7549,11363,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,

//--Autogenerated -- end of section automatically generated
};
// Characters that have complex case conversions are listed here.
// This includes cases where more than one character is needed for a conversion,
// folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
// lower(upper(x)) != x.
// Each record is four UTF-8 fields, every one terminated by '|': the original
// character, its folded form, its upper case form and its lower case form.
// An empty field means that no entry is added for that kind of conversion.
// The pointer itself is const as it is never reseated.

const char * const complexCaseConversions =
// Original | Folded | Upper | Lower |
//++Autogenerated -- start of section automatically generated
//**2 \(\*\n\)
"\xc2\xb5|\xce\xbc|\xce\x9c||"
"\xc3\x9f|ss|SS||"
"\xc4\xb0|i\xcc\x87||i\xcc\x87|"
"\xc4\xb1||I||"
"\xc5\x89|\xca\xbcn|\xca\xbcN||"
"\xc5\xbf|s|S||"
"\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|"
"\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|"
"\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|"
"\xc7\xb0|j\xcc\x8c|J\xcc\x8c||"
"\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|"
"\xcd\x85|\xce\xb9|\xce\x99||"
"\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xcf\x82|\xcf\x83|\xce\xa3||"
"\xcf\x90|\xce\xb2|\xce\x92||"
"\xcf\x91|\xce\xb8|\xce\x98||"
"\xcf\x95|\xcf\x86|\xce\xa6||"
"\xcf\x96|\xcf\x80|\xce\xa0||"
"\xcf\xb0|\xce\xba|\xce\x9a||"
"\xcf\xb1|\xcf\x81|\xce\xa1||"
"\xcf\xb4|\xce\xb8||\xce\xb8|"
"\xcf\xb5|\xce\xb5|\xce\x95||"
"\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||"
"\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||"
"\xe1\xba\x97|t\xcc\x88|T\xcc\x88||"
"\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||"
"\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||"
"\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||"
"\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||"
"\xe1\xba\x9e|ss||\xc3\x9f|"
"\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||"
"\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||"
"\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||"
"\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||"
"\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||"
"\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||"
"\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||"
"\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||"
"\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||"
"\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||"
"\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||"
"\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||"
"\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|"
"\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|"
"\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|"
"\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|"
"\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|"
"\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|"
"\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|"
"\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|"
"\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||"
"\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||"
"\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||"
"\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||"
"\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||"
"\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||"
"\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||"
"\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||"
"\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|"
"\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|"
"\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|"
"\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|"
"\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|"
"\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|"
"\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|"
"\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|"
"\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||"
"\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||"
"\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||"
"\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||"
"\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||"
"\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||"
"\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||"
"\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||"
"\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|"
"\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|"
"\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|"
"\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|"
"\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|"
"\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|"
"\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|"
"\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|"
"\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||"
"\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||"
"\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||"
"\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||"
"\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||"
"\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|"
"\xe1\xbe\xbe|\xce\xb9|\xce\x99||"
"\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||"
"\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||"
"\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||"
"\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||"
"\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||"
"\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|"
"\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||"
"\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||"
"\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||"
"\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||"
"\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||"
"\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||"
"\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||"
"\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||"
"\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||"
"\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||"
"\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||"
"\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||"
"\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|"
"\xe2\x84\xa6|\xcf\x89||\xcf\x89|"
"\xe2\x84\xaa|k||k|"
"\xe2\x84\xab|\xc3\xa5||\xc3\xa5|"
"\xef\xac\x80|ff|FF||"
"\xef\xac\x81|fi|FI||"
"\xef\xac\x82|fl|FL||"
"\xef\xac\x83|ffi|FFI||"
"\xef\xac\x84|ffl|FFL||"
"\xef\xac\x85|st|ST||"
"\xef\xac\x86|st|ST||"
"\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||"
"\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||"
"\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||"
"\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||"
"\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||"

//--Autogenerated -- end of section automatically generated
;
368 class CaseConverter : public ICaseConverter {
369 // Maximum length of a case conversion result is 6 bytes in UTF-8
370 enum { maxConversionLength=6 };
371 struct ConversionString {
372 char conversion[maxConversionLength+1];
373 ConversionString() {
374 conversion[0] = '\0';
377 // Conversions are initially store in a vector of structs but then decomposed into
378 // parallel arrays as that is about 10% faster to search.
379 struct CharacterConversion {
380 int character;
381 ConversionString conversion;
382 CharacterConversion(int character_=0, const char *conversion_="") : character(character_) {
383 StringCopy(conversion.conversion, conversion_);
385 bool operator<(const CharacterConversion &other) const {
386 return character < other.character;
389 typedef std::vector<CharacterConversion> CharacterToConversion;
390 CharacterToConversion characterToConversion;
391 // The parallel arrays
392 std::vector<int> characters;
393 std::vector<ConversionString> conversions;
395 public:
396 CaseConverter() {
398 bool Initialised() const {
399 return characters.size() > 0;
401 void Add(int character, const char *conversion) {
402 characterToConversion.push_back(CharacterConversion(character, conversion));
404 const char *Find(int character) {
405 const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
406 if (it == characters.end())
407 return 0;
408 else if (*it == character)
409 return conversions[it - characters.begin()].conversion;
410 else
411 return 0;
413 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) {
414 size_t lenConverted = 0;
415 size_t mixedPos = 0;
416 unsigned char bytes[UTF8MaxBytes + 1];
417 while (mixedPos < lenMixed) {
418 const unsigned char leadByte = static_cast<unsigned char>(mixed[mixedPos]);
419 const char *caseConverted = 0;
420 size_t lenMixedChar = 1;
421 if (UTF8IsAscii(leadByte)) {
422 caseConverted = Find(leadByte);
423 } else {
424 bytes[0] = leadByte;
425 const int widthCharBytes = UTF8BytesOfLead[leadByte];
426 for (int b=1; b<widthCharBytes; b++) {
427 bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
429 int classified = UTF8Classify(bytes, widthCharBytes);
430 if (!(classified & UTF8MaskInvalid)) {
431 // valid UTF-8
432 lenMixedChar = classified & UTF8MaskWidth;
433 int character = UnicodeFromUTF8(bytes);
434 caseConverted = Find(character);
437 if (caseConverted) {
438 // Character has a conversion so copy that conversion in
439 while (*caseConverted) {
440 converted[lenConverted++] = *caseConverted++;
441 if (lenConverted >= sizeConverted)
442 return 0;
444 } else {
445 // Character has no conversion so copy the input to output
446 for (size_t i=0; i<lenMixedChar; i++) {
447 converted[lenConverted++] = mixed[mixedPos+i];
448 if (lenConverted >= sizeConverted)
449 return 0;
452 mixedPos += lenMixedChar;
454 return lenConverted;
456 void FinishedAdding() {
457 std::sort(characterToConversion.begin(), characterToConversion.end());
458 characters.reserve(characterToConversion.size());
459 conversions.reserve(characterToConversion.size());
460 for (CharacterToConversion::iterator it = characterToConversion.begin(); it != characterToConversion.end(); ++it) {
461 characters.push_back(it->character);
462 conversions.push_back(it->conversion);
464 // Empty the original calculated data completely
465 CharacterToConversion().swap(characterToConversion);
// One converter for each kind of conversion. Each starts with no mappings and
// is filled in by SetupConversions for the matching CaseConversion value.
CaseConverter caseConvFold;
CaseConverter caseConvUp;
CaseConverter caseConvLow;
// Encode the code point uch as UTF-8 into putf and NUL terminate it.
// One to four bytes are produced so putf needs room for five bytes.
void UTF8FromUTF32Character(int uch, char *putf) {
	// Choose the lead byte marker and the number of trail bytes from the magnitude
	int leadMarker = 0;
	int trailBytes = 0;
	if (uch < 0x80) {
		// Single byte: value is stored as is
	} else if (uch < 0x800) {
		leadMarker = 0xC0;
		trailBytes = 1;
	} else if (uch < 0x10000) {
		leadMarker = 0xE0;
		trailBytes = 2;
	} else {
		leadMarker = 0xF0;
		trailBytes = 3;
	}
	char *out = putf;
	*out++ = static_cast<char>(leadMarker | (uch >> (6 * trailBytes)));
	// Each trail byte carries the next 6 bits, most significant group first
	for (int shift = 6 * (trailBytes - 1); shift >= 0; shift -= 6) {
		*out++ = static_cast<char>(0x80 | ((uch >> shift) & 0x3f));
	}
	*out = 0;
}
493 void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
494 char lowerUTF8[UTF8MaxBytes+1];
495 UTF8FromUTF32Character(lower, lowerUTF8);
496 char upperUTF8[UTF8MaxBytes+1];
497 UTF8FromUTF32Character(upper, upperUTF8);
499 switch (conversion) {
500 case CaseConversionFold:
501 caseConvFold.Add(upper, lowerUTF8);
502 break;
503 case CaseConversionUpper:
504 caseConvUp.Add(lower, upperUTF8);
505 break;
506 case CaseConversionLower:
507 caseConvLow.Add(upper, lowerUTF8);
508 break;
512 void SetupConversions(enum CaseConversion conversion) {
513 // First initialize for the symmetric ranges
514 for (size_t i=0; i<ELEMENTS(symmetricCaseConversionRanges);) {
515 int lower = symmetricCaseConversionRanges[i++];
516 int upper = symmetricCaseConversionRanges[i++];
517 int length = symmetricCaseConversionRanges[i++];
518 int pitch = symmetricCaseConversionRanges[i++];
519 for (int j=0; j<length*pitch; j+=pitch) {
520 AddSymmetric(conversion, lower+j, upper+j);
523 // Add the symmetric singletons
524 for (size_t i=0; i<ELEMENTS(symmetricCaseConversions);) {
525 int lower = symmetricCaseConversions[i++];
526 int upper = symmetricCaseConversions[i++];
527 AddSymmetric(conversion, lower, upper);
529 // Add the complex cases
530 const char *sComplex = complexCaseConversions;
531 while (*sComplex) {
532 // Longest ligature is 3 character so 5 for safety
533 const size_t lenUTF8 = 5*UTF8MaxBytes+1;
534 char originUTF8[lenUTF8];
535 char foldedUTF8[lenUTF8];
536 char lowerUTF8[lenUTF8];
537 char upperUTF8[lenUTF8];
538 size_t i = 0;
539 while (*sComplex && *sComplex != '|') {
540 originUTF8[i++] = *sComplex;
541 sComplex++;
543 sComplex++;
544 originUTF8[i] = 0;
545 i = 0;
546 while (*sComplex && *sComplex != '|') {
547 foldedUTF8[i++] = *sComplex;
548 sComplex++;
550 sComplex++;
551 foldedUTF8[i] = 0;
552 i = 0;
553 while (*sComplex && *sComplex != '|') {
554 upperUTF8[i++] = *sComplex;
555 sComplex++;
557 sComplex++;
558 upperUTF8[i] = 0;
559 i = 0;
560 while (*sComplex && *sComplex != '|') {
561 lowerUTF8[i++] = *sComplex;
562 sComplex++;
564 sComplex++;
565 lowerUTF8[i] = 0;
567 int character = UnicodeFromUTF8(reinterpret_cast<unsigned char *>(originUTF8));
569 if (conversion == CaseConversionFold && foldedUTF8[0]) {
570 caseConvFold.Add(character, foldedUTF8);
573 if (conversion == CaseConversionUpper && upperUTF8[0]) {
574 caseConvUp.Add(character, upperUTF8);
577 if (conversion == CaseConversionLower && lowerUTF8[0]) {
578 caseConvLow.Add(character, lowerUTF8);
582 switch (conversion) {
583 case CaseConversionFold:
584 caseConvFold.FinishedAdding();
585 break;
586 case CaseConversionUpper:
587 caseConvUp.FinishedAdding();
588 break;
589 case CaseConversionLower:
590 caseConvLow.FinishedAdding();
591 break;
595 CaseConverter *ConverterForConversion(enum CaseConversion conversion) {
596 switch (conversion) {
597 case CaseConversionFold:
598 return &caseConvFold;
599 case CaseConversionUpper:
600 return &caseConvUp;
601 case CaseConversionLower:
602 return &caseConvLow;
604 return 0;
609 #ifdef SCI_NAMESPACE
610 namespace Scintilla {
611 #endif
613 ICaseConverter *ConverterFor(enum CaseConversion conversion) {
614 CaseConverter *pCaseConv = ConverterForConversion(conversion);
615 if (!pCaseConv->Initialised())
616 SetupConversions(conversion);
617 return pCaseConv;
620 const char *CaseConvert(int character, enum CaseConversion conversion) {
621 CaseConverter *pCaseConv = ConverterForConversion(conversion);
622 if (!pCaseConv->Initialised())
623 SetupConversions(conversion);
624 return pCaseConv->Find(character);
627 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
628 CaseConverter *pCaseConv = ConverterForConversion(conversion);
629 if (!pCaseConv->Initialised())
630 SetupConversions(conversion);
631 return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
634 std::string CaseConvertString(const std::string &s, enum CaseConversion conversion) {
635 std::string retMapped(s.length() * maxExpansionCaseConversion, 0);
636 size_t lenMapped = CaseConvertString(&retMapped[0], retMapped.length(), s.c_str(), s.length(),
637 conversion);
638 retMapped.resize(lenMapped);
639 return retMapped;
642 #ifdef SCI_NAMESPACE
644 #endif