Update Scintilla to version 3.6.2
[geany-mirror.git] / scintilla / src / CaseConvert.cxx
blob63a27222bf58d774ee3f11e6199cd197ad493013
1 // Scintilla source code edit control
2 // Encoding: UTF-8
3 /** @file CaseConvert.cxx
4 ** Case fold characters and convert them to upper or lower case.
5 ** Tables automatically regenerated by scripts/GenerateCaseConvert.py
6 ** Should only be rarely regenerated for new versions of Unicode.
7 **/
8 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
9 // The License.txt file describes the conditions under which this software may be distributed.
11 #include <cstring>
13 #include <stdexcept>
14 #include <vector>
15 #include <algorithm>
17 #include "StringCopy.h"
18 #include "CaseConvert.h"
19 #include "UniConversion.h"
20 #include "UnicodeFromUTF8.h"
22 #ifdef SCI_NAMESPACE
23 using namespace Scintilla;
24 #endif
26 namespace {
27 // Use an unnamed namespace to protect the declarations from name conflicts
// Unicode code points are ordered by groups and follow patterns.
// Most characters (pitch==1) are in ranges for a particular alphabet and their
// upper case forms are a fixed distance away.
// Another pattern (pitch==2) is where each lower case letter is preceded by
// the upper case form. These are also grouped into ranges.
//
// Each row holds four values: the first lower case code point, the first
// upper case code point, the number of lower/upper pairs in the range and
// the pitch (the step between successive pairs).
// The table is only ever read, so it is declared const.

const int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,44,2,
1377,1329,38,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,12,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42913,42912,5,2,
65345,65313,26,1,
66600,66560,40,1,

//--Autogenerated -- end of section automatically generated
};
// Code points that are symmetric but don't fit into a range of similar characters
// are listed here.
//
// Each row is one pair: the lower case code point followed by its upper case
// code point. The table is only ever read, so it is declared const.

const int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
608,403,
611,404,
613,42893,
614,42922,
616,407,
617,406,
619,11362,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
643,425,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1016,1015,
1019,1018,
1231,1216,
7545,42877,
7549,11363,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,

//--Autogenerated -- end of section automatically generated
};
// Characters that have complex case conversions are listed here.
// This includes cases where more than one character is needed for a conversion,
// folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
// lower(upper(x)) != x.
//
// The table is a single UTF-8 string of records. Each record has four fields,
// each terminated by '|': the original character, then its folded, upper case
// and lower case forms. An empty field means no entry is added for that
// kind of conversion.
// Neither the text nor the pointer is ever modified, so both are const.

const char *const complexCaseConversions =
// Original | Folded | Upper | Lower |
//++Autogenerated -- start of section automatically generated
//**2 \(\*\n\)
"\xc2\xb5|\xce\xbc|\xce\x9c||"
"\xc3\x9f|ss|SS||"
"\xc4\xb0|i\xcc\x87||i\xcc\x87|"
"\xc4\xb1||I||"
"\xc5\x89|\xca\xbcn|\xca\xbcN||"
"\xc5\xbf|s|S||"
"\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|"
"\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|"
"\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|"
"\xc7\xb0|j\xcc\x8c|J\xcc\x8c||"
"\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|"
"\xcd\x85|\xce\xb9|\xce\x99||"
"\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xcf\x82|\xcf\x83|\xce\xa3||"
"\xcf\x90|\xce\xb2|\xce\x92||"
"\xcf\x91|\xce\xb8|\xce\x98||"
"\xcf\x95|\xcf\x86|\xce\xa6||"
"\xcf\x96|\xcf\x80|\xce\xa0||"
"\xcf\xb0|\xce\xba|\xce\x9a||"
"\xcf\xb1|\xcf\x81|\xce\xa1||"
"\xcf\xb4|\xce\xb8||\xce\xb8|"
"\xcf\xb5|\xce\xb5|\xce\x95||"
"\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||"
"\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||"
"\xe1\xba\x97|t\xcc\x88|T\xcc\x88||"
"\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||"
"\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||"
"\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||"
"\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||"
"\xe1\xba\x9e|ss||\xc3\x9f|"
"\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||"
"\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||"
"\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||"
"\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||"
"\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||"
"\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||"
"\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||"
"\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||"
"\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||"
"\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||"
"\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||"
"\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||"
"\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|"
"\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|"
"\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|"
"\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|"
"\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|"
"\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|"
"\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|"
"\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|"
"\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||"
"\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||"
"\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||"
"\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||"
"\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||"
"\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||"
"\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||"
"\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||"
"\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|"
"\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|"
"\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|"
"\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|"
"\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|"
"\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|"
"\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|"
"\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|"
"\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||"
"\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||"
"\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||"
"\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||"
"\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||"
"\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||"
"\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||"
"\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||"
"\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|"
"\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|"
"\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|"
"\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|"
"\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|"
"\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|"
"\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|"
"\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|"
"\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||"
"\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||"
"\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||"
"\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||"
"\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||"
"\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|"
"\xe1\xbe\xbe|\xce\xb9|\xce\x99||"
"\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||"
"\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||"
"\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||"
"\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||"
"\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||"
"\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|"
"\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||"
"\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||"
"\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||"
"\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||"
"\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||"
"\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||"
"\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||"
"\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||"
"\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||"
"\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||"
"\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||"
"\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||"
"\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|"
"\xe2\x84\xa6|\xcf\x89||\xcf\x89|"
"\xe2\x84\xaa|k||k|"
"\xe2\x84\xab|\xc3\xa5||\xc3\xa5|"
"\xef\xac\x80|ff|FF||"
"\xef\xac\x81|fi|FI||"
"\xef\xac\x82|fl|FL||"
"\xef\xac\x83|ffi|FFI||"
"\xef\xac\x84|ffl|FFL||"
"\xef\xac\x85|st|ST||"
"\xef\xac\x86|st|ST||"
"\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||"
"\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||"
"\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||"
"\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||"
"\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||"

//--Autogenerated -- end of section automatically generated
;
367 class CaseConverter : public ICaseConverter {
368 // Maximum length of a case conversion result is 6 bytes in UTF-8
369 enum { maxConversionLength=6 };
370 struct ConversionString {
371 char conversion[maxConversionLength+1];
372 ConversionString() {
373 conversion[0] = '\0';
376 // Conversions are initially store in a vector of structs but then decomposed into
377 // parallel arrays as that is about 10% faster to search.
378 struct CharacterConversion {
379 int character;
380 ConversionString conversion;
381 CharacterConversion(int character_=0, const char *conversion_="") : character(character_) {
382 StringCopy(conversion.conversion, conversion_);
384 bool operator<(const CharacterConversion &other) const {
385 return character < other.character;
388 typedef std::vector<CharacterConversion> CharacterToConversion;
389 CharacterToConversion characterToConversion;
390 // The parallel arrays
391 std::vector<int> characters;
392 std::vector<ConversionString> conversions;
394 public:
395 CaseConverter() {
397 bool Initialised() const {
398 return characters.size() > 0;
400 void Add(int character, const char *conversion) {
401 characterToConversion.push_back(CharacterConversion(character, conversion));
403 const char *Find(int character) {
404 const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
405 if (it == characters.end())
406 return 0;
407 else if (*it == character)
408 return conversions[it - characters.begin()].conversion;
409 else
410 return 0;
412 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) {
413 size_t lenConverted = 0;
414 size_t mixedPos = 0;
415 unsigned char bytes[UTF8MaxBytes + 1];
416 while (mixedPos < lenMixed) {
417 const unsigned char leadByte = static_cast<unsigned char>(mixed[mixedPos]);
418 const char *caseConverted = 0;
419 size_t lenMixedChar = 1;
420 if (UTF8IsAscii(leadByte)) {
421 caseConverted = Find(leadByte);
422 } else {
423 bytes[0] = leadByte;
424 const int widthCharBytes = UTF8BytesOfLead[leadByte];
425 for (int b=1; b<widthCharBytes; b++) {
426 bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
428 int classified = UTF8Classify(bytes, widthCharBytes);
429 if (!(classified & UTF8MaskInvalid)) {
430 // valid UTF-8
431 lenMixedChar = classified & UTF8MaskWidth;
432 int character = UnicodeFromUTF8(bytes);
433 caseConverted = Find(character);
436 if (caseConverted) {
437 // Character has a conversion so copy that conversion in
438 while (*caseConverted) {
439 converted[lenConverted++] = *caseConverted++;
440 if (lenConverted >= sizeConverted)
441 return 0;
443 } else {
444 // Character has no conversion so copy the input to output
445 for (size_t i=0; i<lenMixedChar; i++) {
446 converted[lenConverted++] = mixed[mixedPos+i];
447 if (lenConverted >= sizeConverted)
448 return 0;
451 mixedPos += lenMixedChar;
453 return lenConverted;
455 void FinishedAdding() {
456 std::sort(characterToConversion.begin(), characterToConversion.end());
457 characters.reserve(characterToConversion.size());
458 conversions.reserve(characterToConversion.size());
459 for (CharacterToConversion::iterator it = characterToConversion.begin(); it != characterToConversion.end(); ++it) {
460 characters.push_back(it->character);
461 conversions.push_back(it->conversion);
463 // Empty the original calculated data completely
464 CharacterToConversion().swap(characterToConversion);
468 CaseConverter caseConvFold;
469 CaseConverter caseConvUp;
470 CaseConverter caseConvLow;
// Encode the code point uch as UTF-8 into putf and terminate it with a NUL.
// Between one and four bytes are written before the terminator, so putf
// must have room for five bytes.
void UTF8FromUTF32Character(int uch, char *putf) {
	const int trailMarker = 0x80;
	const int trailBits = 0x3f;
	char *out = putf;
	if (uch >= 0x10000) {
		// Four byte form
		*out++ = static_cast<char>(0xF0 | (uch >> 18));
		*out++ = static_cast<char>(trailMarker | ((uch >> 12) & trailBits));
		*out++ = static_cast<char>(trailMarker | ((uch >> 6) & trailBits));
		*out++ = static_cast<char>(trailMarker | (uch & trailBits));
	} else if (uch >= 0x800) {
		// Three byte form
		*out++ = static_cast<char>(0xE0 | (uch >> 12));
		*out++ = static_cast<char>(trailMarker | ((uch >> 6) & trailBits));
		*out++ = static_cast<char>(trailMarker | (uch & trailBits));
	} else if (uch >= 0x80) {
		// Two byte form
		*out++ = static_cast<char>(0xC0 | (uch >> 6));
		*out++ = static_cast<char>(trailMarker | (uch & trailBits));
	} else {
		// Single byte
		*out++ = static_cast<char>(uch);
	}
	*out = 0;
}
492 void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
493 char lowerUTF8[UTF8MaxBytes+1];
494 UTF8FromUTF32Character(lower, lowerUTF8);
495 char upperUTF8[UTF8MaxBytes+1];
496 UTF8FromUTF32Character(upper, upperUTF8);
498 switch (conversion) {
499 case CaseConversionFold:
500 caseConvFold.Add(upper, lowerUTF8);
501 break;
502 case CaseConversionUpper:
503 caseConvUp.Add(lower, upperUTF8);
504 break;
505 case CaseConversionLower:
506 caseConvLow.Add(upper, lowerUTF8);
507 break;
511 void SetupConversions(enum CaseConversion conversion) {
512 // First initialize for the symmetric ranges
513 for (size_t i=0; i<ELEMENTS(symmetricCaseConversionRanges);) {
514 int lower = symmetricCaseConversionRanges[i++];
515 int upper = symmetricCaseConversionRanges[i++];
516 int length = symmetricCaseConversionRanges[i++];
517 int pitch = symmetricCaseConversionRanges[i++];
518 for (int j=0; j<length*pitch; j+=pitch) {
519 AddSymmetric(conversion, lower+j, upper+j);
522 // Add the symmetric singletons
523 for (size_t i=0; i<ELEMENTS(symmetricCaseConversions);) {
524 int lower = symmetricCaseConversions[i++];
525 int upper = symmetricCaseConversions[i++];
526 AddSymmetric(conversion, lower, upper);
528 // Add the complex cases
529 const char *sComplex = complexCaseConversions;
530 while (*sComplex) {
531 // Longest ligature is 3 character so 5 for safety
532 const size_t lenUTF8 = 5*UTF8MaxBytes+1;
533 char originUTF8[lenUTF8];
534 char foldedUTF8[lenUTF8];
535 char lowerUTF8[lenUTF8];
536 char upperUTF8[lenUTF8];
537 size_t i = 0;
538 while (*sComplex && *sComplex != '|') {
539 originUTF8[i++] = *sComplex;
540 sComplex++;
542 sComplex++;
543 originUTF8[i] = 0;
544 i = 0;
545 while (*sComplex && *sComplex != '|') {
546 foldedUTF8[i++] = *sComplex;
547 sComplex++;
549 sComplex++;
550 foldedUTF8[i] = 0;
551 i = 0;
552 while (*sComplex && *sComplex != '|') {
553 upperUTF8[i++] = *sComplex;
554 sComplex++;
556 sComplex++;
557 upperUTF8[i] = 0;
558 i = 0;
559 while (*sComplex && *sComplex != '|') {
560 lowerUTF8[i++] = *sComplex;
561 sComplex++;
563 sComplex++;
564 lowerUTF8[i] = 0;
566 int character = UnicodeFromUTF8(reinterpret_cast<unsigned char *>(originUTF8));
568 if (conversion == CaseConversionFold && foldedUTF8[0]) {
569 caseConvFold.Add(character, foldedUTF8);
572 if (conversion == CaseConversionUpper && upperUTF8[0]) {
573 caseConvUp.Add(character, upperUTF8);
576 if (conversion == CaseConversionLower && lowerUTF8[0]) {
577 caseConvLow.Add(character, lowerUTF8);
581 switch (conversion) {
582 case CaseConversionFold:
583 caseConvFold.FinishedAdding();
584 break;
585 case CaseConversionUpper:
586 caseConvUp.FinishedAdding();
587 break;
588 case CaseConversionLower:
589 caseConvLow.FinishedAdding();
590 break;
594 CaseConverter *ConverterForConversion(enum CaseConversion conversion) {
595 switch (conversion) {
596 case CaseConversionFold:
597 return &caseConvFold;
598 case CaseConversionUpper:
599 return &caseConvUp;
600 case CaseConversionLower:
601 return &caseConvLow;
603 return 0;
608 #ifdef SCI_NAMESPACE
609 namespace Scintilla {
610 #endif
612 ICaseConverter *ConverterFor(enum CaseConversion conversion) {
613 CaseConverter *pCaseConv = ConverterForConversion(conversion);
614 if (!pCaseConv->Initialised())
615 SetupConversions(conversion);
616 return pCaseConv;
619 const char *CaseConvert(int character, enum CaseConversion conversion) {
620 CaseConverter *pCaseConv = ConverterForConversion(conversion);
621 if (!pCaseConv->Initialised())
622 SetupConversions(conversion);
623 return pCaseConv->Find(character);
626 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
627 CaseConverter *pCaseConv = ConverterForConversion(conversion);
628 if (!pCaseConv->Initialised())
629 SetupConversions(conversion);
630 return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
633 #ifdef SCI_NAMESPACE
635 #endif