Update Scintilla to version 3.4.4
[TortoiseGit.git] / ext / scintilla / src / CaseConvert.cxx
blob5108ddcaff442ddd1c0db4a90cda4c2e260b8c09
1 // Scintilla source code edit control
2 // Encoding: UTF-8
3 /** @file CaseConvert.cxx
4 ** Case fold characters and convert them to upper or lower case.
5 ** Tables automatically regenerated by scripts/GenerateCaseConvert.py
6 ** Should only be rarely regenerated for new versions of Unicode.
7 **/
8 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
9 // The License.txt file describes the conditions under which this software may be distributed.
11 #include <cstring>
13 #include <vector>
14 #include <algorithm>
16 #include "StringCopy.h"
17 #include "CaseConvert.h"
18 #include "UniConversion.h"
19 #include "UnicodeFromUTF8.h"
21 #ifdef SCI_NAMESPACE
22 using namespace Scintilla;
23 #endif
25 namespace {
26 // Use an unnamed namespace to protect the declarations from name conflicts
// Unicode code points are ordered by groups and follow patterns.
// Most characters (pitch==1) are in ranges for a particular alphabet and their
// upper case forms are a fixed distance away.
// Another pattern (pitch==2) is where each lower case letter is preceded by
// the upper case form. These are also grouped into ranges.
//
// Each row holds four values: the first lower case code point, the first
// upper case code point, the number of lower/upper pairs in the range and the
// step (pitch) between successive pairs.
// The table is only ever read, so it is const.

const int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,44,2,
1377,1329,38,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,12,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42913,42912,5,2,
65345,65313,26,1,
66600,66560,40,1,

//--Autogenerated -- end of section automatically generated
};
// Code points that are symmetric but don't fit into a range of similar characters
// are listed here.
//
// Each row is a single pair: the lower case code point followed by its
// upper case code point.
// The table is only ever read, so it is const.

const int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
608,403,
611,404,
613,42893,
614,42922,
616,407,
617,406,
619,11362,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
643,425,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1016,1015,
1019,1018,
1231,1216,
7545,42877,
7549,11363,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,

//--Autogenerated -- end of section automatically generated
};
// Characters that have complex case conversions are listed here.
// This includes cases where more than one character is needed for a conversion,
// folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
// lower(upper(x)) != x.
//
// The table is one long string of records. Each record has four UTF-8 fields,
// each terminated by '|': the original character, its folded form, its upper
// case form and its lower case form. An empty field means there is no entry
// for that conversion.
// Both the text and the pointer are const as the table is only ever read.

const char * const complexCaseConversions =
// Original | Folded | Upper | Lower |
//++Autogenerated -- start of section automatically generated
//**2 \(\*\n\)
"\xc2\xb5|\xce\xbc|\xce\x9c||"
"\xc3\x9f|ss|SS||"
"\xc4\xb0|i\xcc\x87||i\xcc\x87|"
"\xc4\xb1||I||"
"\xc5\x89|\xca\xbcn|\xca\xbcN||"
"\xc5\xbf|s|S||"
"\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|"
"\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|"
"\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|"
"\xc7\xb0|j\xcc\x8c|J\xcc\x8c||"
"\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|"
"\xcd\x85|\xce\xb9|\xce\x99||"
"\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xcf\x82|\xcf\x83|\xce\xa3||"
"\xcf\x90|\xce\xb2|\xce\x92||"
"\xcf\x91|\xce\xb8|\xce\x98||"
"\xcf\x95|\xcf\x86|\xce\xa6||"
"\xcf\x96|\xcf\x80|\xce\xa0||"
"\xcf\xb0|\xce\xba|\xce\x9a||"
"\xcf\xb1|\xcf\x81|\xce\xa1||"
"\xcf\xb4|\xce\xb8||\xce\xb8|"
"\xcf\xb5|\xce\xb5|\xce\x95||"
"\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||"
"\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||"
"\xe1\xba\x97|t\xcc\x88|T\xcc\x88||"
"\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||"
"\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||"
"\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||"
"\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||"
"\xe1\xba\x9e|ss||\xc3\x9f|"
"\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||"
"\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||"
"\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||"
"\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||"
"\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||"
"\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||"
"\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||"
"\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||"
"\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||"
"\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||"
"\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||"
"\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||"
"\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|"
"\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|"
"\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|"
"\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|"
"\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|"
"\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|"
"\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|"
"\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|"
"\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||"
"\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||"
"\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||"
"\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||"
"\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||"
"\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||"
"\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||"
"\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||"
"\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|"
"\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|"
"\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|"
"\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|"
"\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|"
"\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|"
"\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|"
"\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|"
"\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||"
"\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||"
"\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||"
"\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||"
"\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||"
"\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||"
"\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||"
"\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||"
"\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|"
"\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|"
"\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|"
"\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|"
"\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|"
"\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|"
"\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|"
"\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|"
"\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||"
"\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||"
"\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||"
"\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||"
"\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||"
"\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|"
"\xe1\xbe\xbe|\xce\xb9|\xce\x99||"
"\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||"
"\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||"
"\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||"
"\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||"
"\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||"
"\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|"
"\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||"
"\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||"
"\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||"
"\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||"
"\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||"
"\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||"
"\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||"
"\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||"
"\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||"
"\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||"
"\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||"
"\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||"
"\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|"
"\xe2\x84\xa6|\xcf\x89||\xcf\x89|"
"\xe2\x84\xaa|k||k|"
"\xe2\x84\xab|\xc3\xa5||\xc3\xa5|"
"\xef\xac\x80|ff|FF||"
"\xef\xac\x81|fi|FI||"
"\xef\xac\x82|fl|FL||"
"\xef\xac\x83|ffi|FFI||"
"\xef\xac\x84|ffl|FFL||"
"\xef\xac\x85|st|ST||"
"\xef\xac\x86|st|ST||"
"\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||"
"\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||"
"\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||"
"\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||"
"\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||"

//--Autogenerated -- end of section automatically generated
;
366 class CaseConverter : public ICaseConverter {
367 // Maximum length of a case conversion result is 6 bytes in UTF-8
368 enum { maxConversionLength=6 };
369 struct ConversionString {
370 char conversion[maxConversionLength+1];
371 ConversionString() {
372 conversion[0] = '\0';
375 // Conversions are initially store in a vector of structs but then decomposed into
376 // parallel arrays as that is about 10% faster to search.
377 struct CharacterConversion {
378 int character;
379 ConversionString conversion;
380 CharacterConversion(int character_=0, const char *conversion_="") : character(character_) {
381 StringCopy(conversion.conversion, conversion_);
383 bool operator<(const CharacterConversion &other) const {
384 return character < other.character;
387 typedef std::vector<CharacterConversion> CharacterToConversion;
388 CharacterToConversion characterToConversion;
389 // The parallel arrays
390 std::vector<int> characters;
391 std::vector<ConversionString> conversions;
393 public:
394 CaseConverter() {
396 bool Initialised() const {
397 return characters.size() > 0;
399 void Add(int character, const char *conversion) {
400 characterToConversion.push_back(CharacterConversion(character, conversion));
402 const char *Find(int character) {
403 const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
404 if (it == characters.end())
405 return 0;
406 else if (*it == character)
407 return conversions[it - characters.begin()].conversion;
408 else
409 return 0;
411 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) {
412 size_t lenConverted = 0;
413 size_t mixedPos = 0;
414 unsigned char bytes[UTF8MaxBytes + 1];
415 while (mixedPos < lenMixed) {
416 const unsigned char leadByte = static_cast<unsigned char>(mixed[mixedPos]);
417 const char *caseConverted = 0;
418 size_t lenMixedChar = 1;
419 if (UTF8IsAscii(leadByte)) {
420 caseConverted = Find(leadByte);
421 } else {
422 bytes[0] = leadByte;
423 const int widthCharBytes = UTF8BytesOfLead[leadByte];
424 for (int b=1; b<widthCharBytes; b++) {
425 bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
427 int classified = UTF8Classify(bytes, widthCharBytes);
428 if (!(classified & UTF8MaskInvalid)) {
429 // valid UTF-8
430 lenMixedChar = classified & UTF8MaskWidth;
431 int character = UnicodeFromUTF8(bytes);
432 caseConverted = Find(character);
435 if (caseConverted) {
436 // Character has a conversion so copy that conversion in
437 while (*caseConverted) {
438 converted[lenConverted++] = *caseConverted++;
439 if (lenConverted >= sizeConverted)
440 return 0;
442 } else {
443 // Character has no conversion so copy the input to output
444 for (size_t i=0; i<lenMixedChar; i++) {
445 converted[lenConverted++] = mixed[mixedPos+i];
446 if (lenConverted >= sizeConverted)
447 return 0;
450 mixedPos += lenMixedChar;
452 return lenConverted;
454 void FinishedAdding() {
455 std::sort(characterToConversion.begin(), characterToConversion.end());
456 characters.reserve(characterToConversion.size());
457 conversions.reserve(characterToConversion.size());
458 for (CharacterToConversion::iterator it = characterToConversion.begin(); it != characterToConversion.end(); ++it) {
459 characters.push_back(it->character);
460 conversions.push_back(it->conversion);
462 // Empty the original calculated data completely
463 CharacterToConversion().swap(characterToConversion);
// One converter for each kind of case conversion.
// Each starts empty and is filled by SetupConversions the first time that
// kind of conversion is requested.
CaseConverter caseConvFold;
CaseConverter caseConvUp;
CaseConverter caseConvLow;
// Encode the code point uch as UTF-8 into putf and append a NUL terminator.
// Writes between 2 and 5 bytes including the terminator, so putf must have
// room for at least 5 bytes. No validation of uch is performed.
void UTF8FromUTF32Character(int uch, char *putf) {
	char *out = putf;
	if (uch >= 0x10000) {
		// Four byte sequence
		*out++ = static_cast<char>(0xF0 | (uch >> 18));
		*out++ = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
		*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else if (uch >= 0x800) {
		// Three byte sequence
		*out++ = static_cast<char>(0xE0 | (uch >> 12));
		*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else if (uch >= 0x80) {
		// Two byte sequence
		*out++ = static_cast<char>(0xC0 | (uch >> 6));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else {
		// Single byte
		*out++ = static_cast<char>(uch);
	}
	*out = 0;
}
491 void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
492 char lowerUTF8[UTF8MaxBytes+1];
493 UTF8FromUTF32Character(lower, lowerUTF8);
494 char upperUTF8[UTF8MaxBytes+1];
495 UTF8FromUTF32Character(upper, upperUTF8);
497 switch (conversion) {
498 case CaseConversionFold:
499 caseConvFold.Add(upper, lowerUTF8);
500 break;
501 case CaseConversionUpper:
502 caseConvUp.Add(lower, upperUTF8);
503 break;
504 case CaseConversionLower:
505 caseConvLow.Add(upper, lowerUTF8);
506 break;
510 void SetupConversions(enum CaseConversion conversion) {
511 // First initialize for the symmetric ranges
512 for (size_t i=0; i<ELEMENTS(symmetricCaseConversionRanges);) {
513 int lower = symmetricCaseConversionRanges[i++];
514 int upper = symmetricCaseConversionRanges[i++];
515 int length = symmetricCaseConversionRanges[i++];
516 int pitch = symmetricCaseConversionRanges[i++];
517 for (int j=0; j<length*pitch; j+=pitch) {
518 AddSymmetric(conversion, lower+j, upper+j);
521 // Add the symmetric singletons
522 for (size_t i=0; i<ELEMENTS(symmetricCaseConversions);) {
523 int lower = symmetricCaseConversions[i++];
524 int upper = symmetricCaseConversions[i++];
525 AddSymmetric(conversion, lower, upper);
527 // Add the complex cases
528 const char *sComplex = complexCaseConversions;
529 while (*sComplex) {
530 // Longest ligature is 3 character so 5 for safety
531 const size_t lenUTF8 = 5*UTF8MaxBytes+1;
532 char originUTF8[lenUTF8];
533 char foldedUTF8[lenUTF8];
534 char lowerUTF8[lenUTF8];
535 char upperUTF8[lenUTF8];
536 size_t i = 0;
537 while (*sComplex && *sComplex != '|') {
538 originUTF8[i++] = *sComplex;
539 sComplex++;
541 sComplex++;
542 originUTF8[i] = 0;
543 i = 0;
544 while (*sComplex && *sComplex != '|') {
545 foldedUTF8[i++] = *sComplex;
546 sComplex++;
548 sComplex++;
549 foldedUTF8[i] = 0;
550 i = 0;
551 while (*sComplex && *sComplex != '|') {
552 upperUTF8[i++] = *sComplex;
553 sComplex++;
555 sComplex++;
556 upperUTF8[i] = 0;
557 i = 0;
558 while (*sComplex && *sComplex != '|') {
559 lowerUTF8[i++] = *sComplex;
560 sComplex++;
562 sComplex++;
563 lowerUTF8[i] = 0;
565 int character = UnicodeFromUTF8(reinterpret_cast<unsigned char *>(originUTF8));
567 if (conversion == CaseConversionFold && foldedUTF8[0]) {
568 caseConvFold.Add(character, foldedUTF8);
571 if (conversion == CaseConversionUpper && upperUTF8[0]) {
572 caseConvUp.Add(character, upperUTF8);
575 if (conversion == CaseConversionLower && lowerUTF8[0]) {
576 caseConvLow.Add(character, lowerUTF8);
580 switch (conversion) {
581 case CaseConversionFold:
582 caseConvFold.FinishedAdding();
583 break;
584 case CaseConversionUpper:
585 caseConvUp.FinishedAdding();
586 break;
587 case CaseConversionLower:
588 caseConvLow.FinishedAdding();
589 break;
593 CaseConverter *ConverterForConversion(enum CaseConversion conversion) {
594 switch (conversion) {
595 case CaseConversionFold:
596 return &caseConvFold;
597 case CaseConversionUpper:
598 return &caseConvUp;
599 case CaseConversionLower:
600 return &caseConvLow;
602 return 0;
607 #ifdef SCI_NAMESPACE
608 namespace Scintilla {
609 #endif
611 ICaseConverter *ConverterFor(enum CaseConversion conversion) {
612 CaseConverter *pCaseConv = ConverterForConversion(conversion);
613 if (!pCaseConv->Initialised())
614 SetupConversions(conversion);
615 return pCaseConv;
618 const char *CaseConvert(int character, enum CaseConversion conversion) {
619 CaseConverter *pCaseConv = ConverterForConversion(conversion);
620 if (!pCaseConv->Initialised())
621 SetupConversions(conversion);
622 return pCaseConv->Find(character);
625 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
626 CaseConverter *pCaseConv = ConverterForConversion(conversion);
627 if (!pCaseConv->Initialised())
628 SetupConversions(conversion);
629 return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
632 #ifdef SCI_NAMESPACE
634 #endif