2010-06-21 Atsushi Enomoto <atsushi@ximian.com>
[mcs.git] / class / corlib / Mono.Globalization.Unicode / create-mscompat-collation-table.cs
blob01ce2684c087c77be2a65a4182d8ec4f5cc228c5
1 //
2 // create-mscompat-collation-table.cs : generates Windows-like sortkey tables.
3 //
4 // Author:
5 // Atsushi Enomoto <atsushi@ximian.com>
6 //
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
16 //
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
19 //
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 // There are two kinds of sort keys: those that are computed and those that
31 // are laid out as an indexed array. Computed sort keys are:
33 // - Surrogate
34 // - PrivateUse
36 // Though it is possible to "compute" level 3 weights, they are still dumped
37 // to an array to avoid execution cost.
39 #define Binary
41 using System;
42 using System.IO;
43 using System.Collections;
44 using System.Globalization;
45 using System.Text;
46 using System.Xml;
48 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
50 namespace Mono.Globalization.Unicode
52 internal class MSCompatSortKeyTableGenerator
// Program entry point: builds a generator instance and hands it the
// command-line arguments.
public static void Main (string [] args)
{
	MSCompatSortKeyTableGenerator generator = new MSCompatSortKeyTableGenerator ();
	generator.Run (args);
}
// Decomposition-type codes. Presumably these are the values stored per code
// point in decompType [] -- TODO confirm against the code that fills it.
// NOTE(review): Full (0xE) and Narrow (0xD) are declared out of numeric
// order; the values themselves are all distinct.
59 const int DecompositionWide = 1; // fixed
60 const int DecompositionSub = 2; // fixed
61 const int DecompositionSmall = 3;
62 const int DecompositionIsolated = 4;
63 const int DecompositionInitial = 5;
64 const int DecompositionFinal = 6;
65 const int DecompositionMedial = 7;
66 const int DecompositionNoBreak = 8;
67 const int DecompositionVertical = 9;
68 const int DecompositionFraction = 0xA;
69 const int DecompositionFont = 0xB;
70 const int DecompositionSuper = 0xC; // fixed
71 const int DecompositionFull = 0xE;
72 const int DecompositionNarrow = 0xD;
73 const int DecompositionCircle = 0xF;
74 const int DecompositionSquare = 0x10;
75 const int DecompositionCompat = 0x11;
76 const int DecompositionCanonical = 0x12;
// Output sinks: the generated C# source goes to standard output; the C
// header writer starts as a null sink and is replaced in Run ().
78 TextWriter CSResult = Console.Out;
79 TextWriter CResult = TextWriter.Null;
// map holds one CharMapEntry per UTF-16 code unit (char.MaxValue + 1 slots).
81 byte [] fillIndex = new byte [256]; // by category
82 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
84 char [] specialIgnore = new char [] {
85 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
86 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
89 // FIXME: need more love (as always)
// alphabets and alphaWeights both hold 29 entries; presumably
// alphaWeights [i] belongs to alphabets [i] -- TODO confirm at the use site.
90 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
91 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
92 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
93 '\u0292', '\u01BE', '\u0298'};
94 byte [] alphaWeights = new byte [] {
95 2, 9, 0xA, 0x1A, 0x21,
96 0x23, 0x25, 0x2C, 0x32, 0x35,
97 0x36, 0x48, 0x51, 0x70, 0x7C,
98 0x7E, 0x89, 0x8A, 0x91, 0x99,
99 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
100 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-unit tables, each sized for the whole BMP.
102 bool [] isSmallCapital = new bool [char.MaxValue + 1];
103 bool [] isUppercase = new bool [char.MaxValue + 1];
105 byte [] decompType = new byte [char.MaxValue + 1];
106 int [] decompIndex = new int [char.MaxValue + 1];
107 int [] decompLength = new int [char.MaxValue + 1];
// Assigned at the end of ParseUnidata () from the collected list.
108 int [] decompValues;
109 decimal [] decimalValue = new decimal [char.MaxValue + 1];
111 byte [] diacritical = new byte [char.MaxValue + 1];
// Character-name fragments used to recognise diacritics. This list and
// diacriticWeights below each contain 97 entries, and their section comments
// line up, so they are presumably parallel (diacriticWeights [i] is the
// weight for diacritics [i]) -- TODO confirm at the use site. Keep both
// lists in step when editing either one.
113 string [] diacritics = new string [] {
114 // LATIN, CYRILLIC etc.
115 "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK",
116 "ABKHASIAN",
117 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
118 "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
119 "WITH ACUTE;", "WITH GRAVE;",
121 "WITH DOT ABOVE;", " MIDDLE DOT;",
122 "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
123 "WITH DIALYTIKA;",
124 "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
125 "DIALYTIKA TONOS", "DIALYTIKA AND TONOS",
126 "ABKHASIAN CHE WITH DESCENDER",
127 "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
128 "WITH OGONEK;", "WITH CEDILLA;",
130 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
131 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
132 "STROKE OVERLAY",
133 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
134 " DIAERESIS AND GRAVE;",
135 " BREVE AND ACUTE;",
136 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
137 " MACRON AND ACUTE;",
138 " MACRON AND GRAVE;",
140 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
141 " RING ABOVE AND ACUTE",
142 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
143 " CIRCUMFLEX AND TILDE",
144 " TILDE AND DIAERESIS",
145 " STROKE AND ACUTE",
146 " BREVE AND TILDE",
147 " CEDILLA AND BREVE",
148 " OGONEK AND MACRON",
149 // 0x40
150 "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE",
151 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
152 " DOUBLE GRAVE",
153 " INVERTED BREVE",
154 "ROMAN NUMERAL",
155 " PRECEDED BY APOSTROPHE",
156 "WITH HORN;",
157 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
158 " PALATAL HOOK",
159 " DOT BELOW;",
160 " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK",
161 " RING BELOW", "LOW VERTICAL LINE",
163 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
164 " BREVE BELOW;", " HORN AND GRAVE",
165 " LOW MACRON",
166 " TILDE BELOW",
167 " TOPBAR",
168 " DOT BELOW AND DOT ABOVE",
169 " RIGHT HALF RING", " HORN AND TILDE",
170 " CIRCUMFLEX AND DOT BELOW",
171 " BREVE AND DOT BELOW",
172 " DOT BELOW AND MACRON",
173 " TONE TWO",
174 " HORN AND HOOK ABOVE",
175 " HORN AND DOT",
176 // CIRCLED, PARENTHESIZED and so on
177 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
178 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
179 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weight bytes; see the note on diacritics above about the assumed pairing.
181 byte [] diacriticWeights = new byte [] {
182 // LATIN.
183 3, 3, 3, 5, 5, 5, 5,
184 0xE, 0xF,
185 0xE, 0xF,
187 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
188 0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
190 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
191 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
193 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
194 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
196 0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
197 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59,
198 0x5A, 0x5A,
200 0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68,
201 0x69, 0x69, 0x6A, 0x6D, 0x6E,
202 0x87, 0x95, 0xAA,
203 // CIRCLED, PARENTHESIZED and so on.
204 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
205 0xF3, 0xF3, 0xF3
// 28 code point values; they read like consecutive (start, end) pairs of
// digit ranges per script -- NOTE(review): confirm against the code that
// consumes this table.
208 int [] numberSecondaryWeightBounds = new int [] {
209 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
210 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
211 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
212 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
213 0xE50, 0xE60, 0xED0, 0xEE0
// Script orderings that are not initialised here; they are assigned
// somewhere outside this part of the file.
216 char [] orderedGurmukhi;
217 char [] orderedGujarati;
218 char [] orderedGeorgian;
219 char [] orderedThaana;
// Fixed ordering of 22 Tamil consonants (U+0B95..U+0BB9, not in code point order).
221 static readonly char [] orderedTamilConsonants = new char [] {
222 // based on traditional Tamil consonants, except for
223 // Grantha (where Microsoft breaks traditionalism).
224 // http://www.angelfire.com/empire/thamizh/padanGaL
225 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
226 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
227 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
228 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
229 '\u0BB7', '\u0BB9'};
// Lookup collections filled while parsing; the arrow comments give the
// key -> value shape of each one.
231 // cp -> character name (only for some characters)
232 ArrayList sortableCharNames = new ArrayList ();
234 // cp -> arrow value (int)
// Entries are DictionaryEntry (cp, value) pairs added by ProcessUnidataLine ().
235 ArrayList arrowValues = new ArrayList ();
237 // cp -> box value (int)
238 ArrayList boxValues = new ArrayList ();
240 // cp -> level1 value
241 Hashtable arabicLetterPrimaryValues = new Hashtable ();
243 // letterName -> cp
244 Hashtable arabicNameMap = new Hashtable ();
246 // cp -> Hashtable [decompType] -> cp
247 Hashtable nfkdMap = new Hashtable ();
249 // Latin letter -> ArrayList [int]
250 Hashtable latinMap = new Hashtable ();
252 ArrayList jisJapanese = new ArrayList ();
253 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables. All are allocated for the full BMP; the trailing
// comments show base offsets that are no longer subtracted from the size.
// Serialize () replaces each array with its compressed form.
255 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
256 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
257 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
258 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
259 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
// Also replaced by its compressed form in Serialize ().
261 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Per code unit value read from DerivedAge.txt by ParseDerivedAge ();
// slot 0 is forced to double.MaxValue there.
263 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring objects in source-file order, appended by ProcessTailoringLine ().
265 ArrayList tailorings = new ArrayList ();
// Drives the whole generation: parse the Unicode source files, compute the
// tables, then serialize them (C# source on CSResult, C header in
// "collation-tables.h", binary resources via the Serialize* methods).
//
// args [0], when present, is the directory holding the downloaded Unicode
// data files; it defaults to "downloaded".
void Run (string [] args)
{
	string dirname = args.Length == 0 ? "downloaded" : args [0];
	ParseSources (dirname);
	Console.Error.WriteLine ("parse done.");

	ModifyParsedValues ();
	GenerateCore ();
	Console.Error.WriteLine ("generation done.");

	CResult = new StreamWriter ("collation-tables.h", false);
	try {
		Serialize ();
	} finally {
		// Close (and flush) the header file even when serialization
		// fails, so the file handle is never leaked.
		CResult.Close ();
	}
	Console.Error.WriteLine ("serialization done.");
/*
	// Disabled debugging aid: dumps age/ignorable information per code unit.
	// NOTE(review): the loop stops before char.MaxValue itself.
	StreamWriter sw = new StreamWriter ("agelog.txt");
	for (int i = 0; i < char.MaxValue; i++) {
		bool shouldBe = false;
		switch (Char.GetUnicodeCategory ((char) i)) {
		case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
			shouldBe = true; break;
		}
		if (unicodeAge [i] >= 3.1)
			shouldBe = true;
		//if (IsIgnorable (i) != shouldBe)
		sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
	}
	sw.Close ();
*/
}
// Typed convenience wrapper: compresses a byte table with the given indexer
// and returns the result as byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (source, typeof (byte), i);
	return (byte []) compressed;
}
// Typed convenience wrapper: compresses a ushort table with the given
// indexer and returns the result as ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (source, typeof (ushort), i);
	return (ushort []) compressed;
}
// NOTE(review): only the signature is visible in this listing; the body
// appears to be empty (a placeholder) -- confirm against the original file.
309 void WriteByte (byte value)
// Writes every generated table in three forms: C# source (CSResult), C
// header (CResult) and, when Binary is defined, the binary resource
// "../resources/collation.core.bin". Tailorings are written first and the
// CJK tables last, each through its own Serialize* method.
//
// Side effect: ignorableFlags and the cjk* fields are replaced by their
// compressed forms.
void Serialize ()
{
	// Tailorings
	SerializeTailorings ();

	byte [] categories = new byte [map.Length];
	byte [] level1 = new byte [map.Length];
	byte [] level2 = new byte [map.Length];
	byte [] level3 = new byte [map.Length];
	// widthCompat is now removed from the mapping table.
	// If it turned out that it is still required, grep this source and uncomment
	// widthCompat related lines. FIXME: remove those lines in the future.
	// ushort [] widthCompat = new ushort [map.Length];
	for (int i = 0; i < map.Length; i++) {
		categories [i] = map [i].Category;
		level1 [i] = map [i].Level1;
		level2 [i] = map [i].Level2;
		level3 [i] = ComputeLevel3Weight ((char) i);
/*
		// For Japanese Half-width characters, don't
		// map widthCompat. It is IgnoreKanaType that
		// handles those width differences.
		if (0xFF6D <= i && i <= 0xFF9D)
			continue;
		switch (decompType [i]) {
		case DecompositionNarrow:
		case DecompositionWide:
		case DecompositionSuper:
		case DecompositionSub:
			// they are always 1 char
			widthCompat [i] = (ushort) decompValues [decompIndex [i]];
			break;
		}
*/
	}

	// compress
	ignorableFlags = CompressArray (ignorableFlags,
		UUtil.Ignorable);
	categories = CompressArray (categories, UUtil.Category);
	level1 = CompressArray (level1, UUtil.Level1);
	level2 = CompressArray (level2, UUtil.Level2);
	level3 = CompressArray (level3, UUtil.Level3);
	// widthCompat = (ushort []) CodePointIndexer.CompressArray (
	//	widthCompat, typeof (ushort), UUtil.WidthCompat);
	cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS);
	cjkCHT = CompressArray (cjkCHT,UUtil.Cjk);
	cjkJA = CompressArray (cjkJA, UUtil.Cjk);
	cjkKO = CompressArray (cjkKO, UUtil.Cjk);
	cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk);

	// The binary resource starts with the resource version, followed by
	// each table as (int length, bytes...).
#if Binary
	MemoryStream ms = new MemoryStream ();
	BinaryWriter binary = new BinaryWriter (ms);
	binary.Write (UUtil.ResourceVersion);
#else
	BinaryWriter binary = null;
#endif

	// Ignorables
	SerializeByteTable ("ignorableFlags", "ignorableFlagsArr",
		ignorableFlags, UUtil.Ignorable, binary, "0};");

	// Primary category
	// NOTE(review): unlike the other tables, the C array for the category
	// table has never carried the trailing 0 sentinel; kept as-is so the
	// generated header does not change.
	SerializeByteTable ("category", "categoriesArr",
		categories, UUtil.Category, binary, "};");

	// Primary weight value
	SerializeByteTable ("level1", "level1Arr",
		level1, UUtil.Level1, binary, "0};");

	// Secondary weight
	SerializeByteTable ("level2", "level2Arr",
		level2, UUtil.Level2, binary, "0};");

	// Tertiary weight
	SerializeByteTable ("level3", "level3Arr",
		level3, UUtil.Level3, binary, "0};");

/*
	// Width insensitivity mappings
	// (for now it is more lightweight than dumping the
	// entire NFKD table).
	CResult.WriteLine ("static const guint16* widthCompat [] = {");
	CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {");
#if Binary
	binary.Write (widthCompat.Length);
#endif
	for (int i = 0; i < widthCompat.Length; i++) {
		ushort value = widthCompat [i];
		if (value < 10)
			CSResult.Write ("{0},", value);
		else
			CSResult.Write ("0x{0:X02},", value);
		CResult.Write ("{0},", value);
#if Binary
		binary.Write (value);
#endif
		if ((i & 0xF) == 0xF) {
			CSResult.WriteLine ("// {0:X04}",
				UUtil.WidthCompat.ToCodePoint (i - 0xF));
			CResult.WriteLine ();
		}
	}
	CResult.WriteLine ("0};");
	CSResult.WriteLine ("};");
	CSResult.WriteLine ();
*/

#if Binary
	using (FileStream fs = File.Create ("../resources/collation.core.bin")) {
		byte [] array = ms.ToArray ();
		fs.Write (array, 0, array.Length);
	}
#endif

	// CJK
	SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
	SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
	SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
	SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
	SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
}

// Emits one compressed byte table.
//   cName       - suffix of the C array name ("collation_table_<cName>").
//   csName      - name of the C# array.
//   table       - the compressed values to dump.
//   indexer     - maps a table index back to a code point for the row comments.
//   binary      - binary resource writer, or null when no binary output is wanted;
//                 receives the table length followed by the raw bytes.
//   cTerminator - text that closes the C initializer ("0};" or "};").
void SerializeByteTable (string cName, string csName, byte [] table,
	CodePointIndexer indexer, BinaryWriter binary, string cTerminator)
{
	CResult.WriteLine ("static const guint8 collation_table_" + cName + " [] = {");
	CSResult.WriteLine ("static readonly byte [] " + csName + " = new byte [] {");
	if (binary != null)
		binary.Write (table.Length);

	for (int i = 0; i < table.Length; i++) {
		byte value = table [i];
		// C# output: small values in decimal, everything else as 0xNN.
		if (value < 10)
			CSResult.Write ("{0},", value);
		else
			CSResult.Write ("0x{0:X02},", value);
		CResult.Write ("{0},", value);
		if (binary != null)
			binary.Write (value);
		// 16 values per row; the C# row is annotated with the code
		// point of the row's first element.
		if ((i & 0xF) == 0xF) {
			CSResult.WriteLine ("// {0:X04}",
				indexer.ToCodePoint (i - 0xF));
			CResult.WriteLine ();
		}
	}
	CResult.WriteLine (cTerminator);
	CSResult.WriteLine ("};");
	CSResult.WriteLine ();
}
// Dumps a 16-bit CJK table as bytes: a 4-byte little-endian element count,
// then the high byte of every element, then the low byte of every element.
// Output goes to CSResult, CResult and (when Binary is defined)
// "../resources/collation.<name>.bin". max_unused is not used.
void SerializeCJK (string name, ushort [] cjk, int max_unused)
{
//	CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, cjk.Length);
	CSResult.WriteLine ("const int {0}ArrLength = {1};", name, cjk.Length);

	CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name);
	CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
	// the actual length is *2
	int remaining = cjk.Length;
	for (int k = 0; k < 4; k++) {
		int lengthByte = remaining & 0xFF;
		CResult.Write ("{0},", lengthByte);
		CSResult.Write ("0x{0:X04},", lengthByte);
		remaining /= 256;
	}
	CResult.WriteLine ();
	CSResult.WriteLine ();
#if Binary
	MemoryStream ms = new MemoryStream ();
	BinaryWriter binary = new BinaryWriter (ms);
	binary.Write (UUtil.ResourceVersion);
	binary.Write (cjk.Length); // the actual size is *2.
#endif
	// First pass (shift 8): category byte. Second pass (shift 0): level 1 byte.
	foreach (int shift in new int [] {8, 0}) {
		for (int pos = 0; pos < cjk.Length; pos++) {
			byte value = (byte) ((cjk [pos] >> shift) & 0xFF);
			string csFormat = value < 10 ? "{0}," : "0x{0:X02},";
			CSResult.Write (csFormat, value);
			CResult.Write ("{0},", value);
#if Binary
			binary.Write (value);
#endif
			// 16 values per row, annotated with the row's first index.
			if ((pos & 0xF) != 0xF)
				continue;
			CSResult.WriteLine ("// {0:X04}", pos - 0xF);
			CResult.WriteLine ();
		}
	}

	CResult.WriteLine ("0};");
	CSResult.WriteLine ("};");
	CSResult.WriteLine ();
#if Binary
	using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) {
		byte [] array = ms.ToArray ();
		fs.Write (array, 0, array.Length);
	}
#endif
}
// Dumps an 8-bit CJK table to CSResult, CResult and (when Binary is defined)
// "../resources/collation.<name>.bin". At most max elements are written.
// Unlike the ushort overload, no length prefix is emitted.
void SerializeCJK (string name, byte [] cjk, int max)
{
	CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name);
	CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
#if Binary
	MemoryStream ms = new MemoryStream ();
	BinaryWriter binary = new BinaryWriter (ms);
	binary.Write (UUtil.ResourceVersion);
#endif
	for (int pos = 0; pos < cjk.Length && pos != max; pos++) {
		byte value = cjk [pos];
		string csFormat = value < 10 ? "{0}," : "0x{0:X02},";
		CSResult.Write (csFormat, value);
		CResult.Write ("{0},", value);
#if Binary
		binary.Write (value);
#endif
		// 16 values per row, annotated with the row's first index.
		if ((pos & 0xF) != 0xF)
			continue;
		CSResult.WriteLine ("// {0:X04}", pos - 0xF);
		CResult.WriteLine ();
	}
	CResult.WriteLine ("0};");
	CSResult.WriteLine ("};");
	CSResult.WriteLine ();
#if Binary
	using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) {
		byte [] array = ms.ToArray ();
		fs.Write (array, 0, array.Length);
	}
#endif
}
// Writes the tailoring data in two parts:
//   1. one flat char array holding the items of every non-alias tailoring;
//   2. an info record (LCID, start index, item count, FrenchSort) for every
//      tailoring, where an alias points at the data of its target LCID.
// Output goes to CSResult, CResult and (when Binary is defined)
// "../resources/collation.tailoring.bin".
//
// Throws Exception when an alias refers to an LCID that has no definition.
void SerializeTailorings ()
{
	Hashtable indexes = new Hashtable ();	// LCID -> start index in the flat array
	Hashtable counts = new Hashtable ();	// LCID -> number of chars
	CResult.WriteLine ("static const guint16 collation_table_tailoring [] = {");
	CSResult.WriteLine ("static char [] tailoringArr = new char [] {");
	int count = 0;
#if Binary
	MemoryStream ms = new MemoryStream ();
	BinaryWriter binary = new BinaryWriter (ms);
	// Here we don't need to output resource version.
	// This is cached.
#endif
	foreach (Tailoring t in tailorings) {
		// Aliases carry no data of their own.
		if (t.Alias != 0)
			continue;
		CResult.Write ("/*{0}*/", t.LCID);
		CSResult.Write ("/*{0}*/", t.LCID);
		indexes.Add (t.LCID, count);
		char [] values = t.ItemToCharArray ();
		counts.Add (t.LCID, values.Length);
		foreach (char c in values) {
			CSResult.Write ("'\\x{0:X}', ", (int) c);
			CResult.Write ("{0},", (int) c);
			if (++count % 16 == 0) {
				CSResult.WriteLine (" // {0:X04}", count - 16);
				CResult.WriteLine ();
			}
#if Binary
			binary.Write ((ushort) c);
#endif
		}
	}
	CResult.WriteLine ("0};");
	CSResult.WriteLine ("};");

	CResult.WriteLine ("static const guint32 collation_table_tailoring_infos [] = {");
	CResult.WriteLine ("{0}, /*count*/", tailorings.Count);
	CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
#if Binary
	// The char data collected so far is appended after the info records,
	// so start a fresh stream for the final file layout.
	byte [] rawdata = ms.ToArray ();
	ms = new MemoryStream ();
	binary = new BinaryWriter (ms);
	binary.Write (UUtil.ResourceVersion);
	binary.Write (tailorings.Count);
#endif
	foreach (Tailoring t in tailorings) {
		int target = t.Alias != 0 ? t.Alias : t.LCID;
		if (!indexes.ContainsKey (target))
			throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
		int idx = (int) indexes [target];
		int cnt = (int) counts [target];
		bool french = t.FrenchSort;
		// NOTE(review): this compares against t.LCID, so it re-reads the
		// FrenchSort flag of entries with the alias's own LCID; if the
		// intent was to inherit the flag from the alias target, the
		// comparison should probably use t.Alias. Left unchanged because
		// it affects the generated data.
		if (t.Alias != 0)
			foreach (Tailoring t2 in tailorings)
				if (t2.LCID == t.LCID)
					french = t2.FrenchSort;
		CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
		CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 1 : 0);
#if Binary
		binary.Write (t.LCID);
		binary.Write (idx);
		binary.Write (cnt);
		binary.Write (french);
#endif
	}
	CResult.WriteLine ("0};");
	CSResult.WriteLine ("};");
#if Binary
	// Two 0xFF marker bytes, then the char count and the raw char data.
	binary.Write ((byte) 0xFF);
	binary.Write ((byte) 0xFF);
	binary.Write (rawdata.Length / 2);
	binary.Write (rawdata, 0, rawdata.Length);

	using (FileStream fs = File.Create ("../resources/collation.tailoring.bin")) {
		byte [] array = ms.ToArray ();
		fs.Write (array, 0, array.Length);
	}
#endif
}
734 #region Parse
// Runs every input parser in its required order. All Unicode/CLDR inputs
// are looked up below dirname; the tailoring source is read from the
// current directory.
void ParseSources (string dirname)
{
	string collationDir = dirname + "/common/collation/";

	ParseDerivedAge (dirname + "/DerivedAge.txt");

	FillIgnorables ();

	ParseJISOrder (dirname + "/CP932.TXT"); // in prior to ParseUnidata()
	ParseUnidata (dirname + "/UnicodeData.txt");
	ModifyUnidata ();
	ParseDerivedCoreProperties (dirname + "/DerivedCoreProperties.txt");
	ParseScripts (dirname + "/Scripts.txt");
	ParseCJK (collationDir + "zh.xml",
		collationDir + "ja.xml",
		collationDir + "ko.xml");

	ParseTailorings ("mono-tailoring-source.txt");
}
// Reads the tailoring source file line by line and feeds each trimmed line
// to ProcessTailoringLine (). On failure the 1-based line number is
// reported on stderr and the exception is rethrown.
void ParseTailorings (string filename)
{
	Tailoring current = null;
	int line = 1;
	using (StreamReader reader = new StreamReader (filename)) {
		try {
			for (; reader.Peek () >= 0; line++) {
				string text = reader.ReadLine ().Trim ();
				ProcessTailoringLine (ref current, text);
			}
		} catch (Exception) {
			Console.Error.WriteLine ("ERROR at line {0}", line);
			throw;
		}
	}
}
784 // For now this is enough.
// Expands "\uXXXX" escapes (backslash, 'u', four hex digits) in s into the
// corresponding characters; every other character is copied unchanged.
// A trailing sequence too short to be a full escape is copied literally.
// For now this is enough.
string ParseTailoringSourceValue (string s)
{
	StringBuilder result = new StringBuilder ();
	int pos = 0;
	while (pos < s.Length) {
		bool isEscape = pos + 5 < s.Length &&
			s [pos] == '\\' && s [pos + 1] == 'u';
		if (!isEscape) {
			result.Append (s [pos]);
			pos++;
			continue;
		}
		string hex = s.Substring (pos + 2, 4);
		result.Append ((char) int.Parse (hex, NumberStyles.HexNumber));
		pos += 6; // skip the whole escape
	}
	return result.ToString ();
}
// Interprets one line of the tailoring source file.
//   "# ..."                comment (also allowed after other content)
//   "@<lcid>[=<alias>]"    starts a new Tailoring, stored in t and appended
//                          to tailorings
//   "*FrenchSort"          sets FrenchSort on the current tailoring
//   "*Diacritical X -> Y"  adds a diacritical map (hex bytes)
//   "<src> : a b c d"      adds a sort key map (four hex bytes, "*" meaning 0)
//   "<src> = <dst>"        adds a replacement map
// A line containing ':' is still checked for '=' afterwards, as before.
void ProcessTailoringLine (ref Tailoring t, string s)
{
	int hash = s.IndexOf ('#');
	if (hash > 0)
		s = s.Substring (0, hash).Trim ();
	if (s.Length == 0 || s [0] == '#')
		return;

	if (s [0] == '@') {
		int eq = s.IndexOf ('=');
		if (eq > 0) {
			int lcid = int.Parse (s.Substring (1, eq - 1));
			int alias = int.Parse (s.Substring (eq + 1));
			t = new Tailoring (lcid, alias);
		} else {
			t = new Tailoring (int.Parse (s.Substring (1)));
		}
		tailorings.Add (t);
		return;
	}

	if (s.StartsWith ("*FrenchSort")) {
		t.FrenchSort = true;
		return;
	}

	string d = "*Diacritical";
	if (s.StartsWith (d)) {
		int arrow = s.IndexOf ("->");
		byte mapFrom = byte.Parse (
			s.Substring (d.Length, arrow - d.Length).Trim (),
			NumberStyles.HexNumber);
		byte mapTo = byte.Parse (
			s.Substring (arrow + 2).Trim (),
			NumberStyles.HexNumber);
		t.AddDiacriticalMap (mapFrom, mapTo);
		return;
	}

	int colon = s.IndexOf (':');
	if (colon > 0) {
		string source = s.Substring (0, colon).Trim ();
		string [] fields = s.Substring (colon + 1).Trim ().Split (' ');
		byte [] weights = new byte [4];
		for (int k = 0; k < weights.Length; k++)
			weights [k] = fields [k] == "*"
				? (byte) 0
				: byte.Parse (fields [k], NumberStyles.HexNumber);
		t.AddSortKeyMap (ParseTailoringSourceValue (source),
			weights);
	}

	int assign = s.IndexOf ('=');
	if (assign > 0) {
		string from = ParseTailoringSourceValue (
			s.Substring (0, assign).Trim ());
		string to = ParseTailoringSourceValue (
			s.Substring (assign + 1).Trim ());
		t.AddReplacementMap (from, to);
	}
}
// Reads DerivedAge.txt ("<cp>[..<cpEnd>] ; <version> # comment") and stores
// the version number of every BMP code point in unicodeAge.
// Entries that start above U+FFFF are skipped; a range that starts inside
// the BMP but ends above it is clamped to U+FFFF.
// unicodeAge [0] is finally forced to double.MaxValue.
void ParseDerivedAge (string filename)
{
	using (StreamReader file =
		new StreamReader (filename)) {
		while (file.Peek () >= 0) {
			string s = file.ReadLine ();
			int idx = s.IndexOf ('#');
			if (idx >= 0)
				s = s.Substring (0, idx);
			idx = s.IndexOf (';');
			if (idx < 0)
				continue;

			string cpspec = s.Substring (0, idx);
			idx = cpspec.IndexOf ("..");
			NumberStyles nf = NumberStyles.HexNumber |
				NumberStyles.AllowTrailingWhite;
			int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf, CultureInfo.InvariantCulture);
			int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf, CultureInfo.InvariantCulture);
			string value = s.Substring (cpspec.Length + 1).Trim ();

			// FIXME: use index
			if (cp > char.MaxValue)
				continue;
			// unicodeAge only covers the BMP; never index past it.
			if (cpEnd > char.MaxValue)
				cpEnd = char.MaxValue;

			// The file always uses '.' as the decimal separator, so
			// parse independently of the machine's culture (otherwise
			// "3.1" can be read as 31).
			double v = double.Parse (value, CultureInfo.InvariantCulture);
			for (int i = cp; i <= cpEnd; i++)
				unicodeAge [i] = v;
		}
	}
	unicodeAge [0] = double.MaxValue; // never be supported
}
// Feeds every line of UnicodeData.txt to ProcessUnidataLine () and then
// stores the collected decomposition values in the decompValues field.
// On failure the 1-based line number is reported on stderr and the
// exception is rethrown.
void ParseUnidata (string filename)
{
	ArrayList collected = new ArrayList ();
	using (StreamReader reader = new StreamReader (filename)) {
		int line = 0;
		while (reader.Peek () >= 0) {
			line++;
			try {
				ProcessUnidataLine (reader.ReadLine (), collected);
			} catch (Exception) {
				Console.Error.WriteLine ("**** At line " + line);
				throw;
			}
		}
	}
	this.decompValues = (int []) collected.ToArray (typeof (int));
}
// State carried between successive ProcessUnidataLine () calls.
// previousLatinTarget: the last plain Latin letter seen, used as the
// fallback target when a later Latin character name yields none.
911 char previousLatinTarget = char.MinValue;
// One counter per letter 'A'..'Z' (26 slots), indexed by letter - 'A'.
912 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
914 void ProcessUnidataLine (string s, ArrayList decompValues)
916 int idx = s.IndexOf ('#');
917 if (idx >= 0)
918 s = s.Substring (0, idx);
919 idx = s.IndexOf (';');
920 if (idx < 0)
921 return;
922 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
923 string [] values = s.Substring (idx + 1).Split (';');
925 // FIXME: use index
926 if (cp > char.MaxValue)
927 return;
928 if (IsIgnorable (cp))
929 return;
931 string name = values [0];
933 // SPECIAL CASE: rename some characters for diacritical
934 // remapping. FIXME: why are they different?
935 // FIXME: it's still not working.
936 if (cp == 0x018B || cp == 0x018C)
937 name = name.Replace ("TOPBAR", "STROKE");
939 // isSmallCapital
940 if (s.IndexOf ("SMALL CAPITAL") > 0)
941 isSmallCapital [cp] = true;
943 // latin mapping by character name
944 if (s.IndexOf ("LATIN") >= 0) {
945 int lidx = s.IndexOf ("LETTER DOTLESS ");
946 int offset = lidx + 15;
947 if (lidx < 0) {
948 lidx = s.IndexOf ("LETTER TURNED ");
949 offset = lidx + 14;
951 if (lidx < 0) {
952 lidx = s.IndexOf ("LETTER CAPITAL ");
953 offset = lidx + 15;
955 if (lidx < 0) {
956 lidx = s.IndexOf ("LETTER SCRIPT ");
957 offset = lidx + 14;
959 if (lidx < 0) {
960 lidx = s.IndexOf ("LETTER ");
961 offset = lidx + 7;
963 char c = lidx > 0 ? s [offset] : char.MinValue;
964 char n = s [offset + 1];
965 char target = char.MinValue;
966 if ('A' <= c && c <= 'Z' &&
967 (n == ' ') || n == ';') {
968 target = c;
969 // FIXME: After 'Z', I cannot reset this state.
970 previousLatinTarget = c == 'Z' ? char.MinValue : c;
973 if (s.Substring (offset).StartsWith ("ALPHA"))
974 target = 'A';
975 else if (s.Substring (offset).StartsWith ("TONE SIX"))
976 target = 'B';
977 else if (s.Substring (offset).StartsWith ("OPEN O"))
978 target = 'C';
979 else if (s.Substring (offset).StartsWith ("ETH"))
980 target = 'D';
981 else if (s.Substring (offset).StartsWith ("SCHWA"))
982 target = 'E';
983 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
984 target = 'O';
985 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
986 target = 'R';
987 else if (s.Substring (offset).StartsWith ("TONE TWO"))
988 target = 'S';
989 else if (s.Substring (offset).StartsWith ("ESH"))
990 target = 'S';
991 else if (s.Substring (offset).StartsWith ("OUNCE"))
992 target = 'Z';
994 // For remaining IPA chars, direct mapping is
995 // much faster.
996 switch (cp) {
997 case 0x0166: case 0x0167:
998 // Though they are 'T', they have different weight
999 target = char.MinValue; break;
1000 case 0x0299: target = 'B'; break;
1001 case 0x029A: target = 'E'; break;
1002 case 0x029B: target = 'G'; break;
1003 case 0x029C: target = 'H'; break;
1004 case 0x029D: target = 'J'; break;
1005 case 0x029E: target = 'K'; break;
1006 case 0x029F: target = 'L'; break;
1007 case 0x02A0: target = 'Q'; break;
1008 case 0x02A7: target = 'T'; break;
1009 case 0x02A8: target = 'T'; break;
1012 if (target == char.MinValue)
1013 target = previousLatinTarget;
1015 if (target != char.MinValue) {
1016 ArrayList entry = (ArrayList) latinMap [target];
1017 if (entry == null) {
1018 entry = new ArrayList ();
1019 latinMap [target] = entry;
1021 entry.Add (cp);
1022 // FIXME: This secondary weight is hack.
1023 // They are here because they must not
1024 // be identical to the corresponding
1025 // ASCII latins.
1026 if (c != target && diacritical [cp] == 0) {
1027 diacriticalOffset [c - 'A']++;
1028 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
1033 // Arrow names
1034 if (0x2000 <= cp && cp < 0x3000) {
1035 int value = 0;
1036 // SPECIAL CASES. FIXME: why?
1037 switch (cp) {
1038 case 0x21C5: value = -1; break; // E2
1039 case 0x261D: value = 1; break;
1040 case 0x27A6: value = 3; break;
1041 case 0x21B0: value = 7; break;
1042 case 0x21B1: value = 3; break;
1043 case 0x21B2: value = 7; break;
1044 case 0x21B4: value = 5; break;
1045 case 0x21B5: value = 7; break;
1046 case 0x21B9: value = -1; break; // E1
1047 case 0x21CF: value = 7; break;
1048 case 0x21D0: value = 3; break;
1050 string [] arrowTargets = new string [] {
1052 "UPWARDS",
1053 "NORTH EAST",
1054 "RIGHTWARDS",
1055 "SOUTH EAST",
1056 "DOWNWARDS",
1057 "SOUTH WEST",
1058 "LEFTWARDS",
1059 "NORTH WEST",
1060 "LEFT RIGHT",
1061 "UP DOWN",
1063 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
1064 s.IndexOf ("LEFTWARDS") >= 0)
1065 value = 0xE1 - 0xD8;
1066 else if (s.IndexOf ("UPWARDS") >= 0 &&
1067 s.IndexOf ("DOWNWARDS") >= 0)
1068 value = 0xE2 - 0xD8;
1069 else if (s.IndexOf ("ARROW") >= 0 &&
1070 s.IndexOf ("COMBINING") < 0 &&
1071 s.IndexOf ("CLOCKWISE") >= 0)
1072 value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
1073 if (value == 0)
1074 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
1075 if (s.IndexOf (arrowTargets [i]) > 0 &&
1076 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
1077 s.IndexOf (" OVER") < 0
1079 value = i;
1080 if (value > 0)
1081 arrowValues.Add (new DictionaryEntry (
1082 cp, value));
1085 // Box names
1086 if (0x2500 <= cp && cp < 0x2600) {
1087 int value = int.MinValue;
1088 // flags:
1089 // up:1 down:2 right:4 left:8 vert:16 horiz:32
1090 // [h,rl] [r] [l]
1091 // [v,ud] [u] [d]
1092 // [dr] [dl] [ur] [ul]
1093 // [vr,udr] [vl,vdl]
1094 // [hd,rld] [hu,rlu]
1095 // [hv,udrl,rlv,udh]
1096 ArrayList flags = new ArrayList (new int [] {
1097 32, 8 + 4, 8, 4,
1098 16, 1 + 2, 1, 2,
1099 4 + 2, 8 + 2, 4 + 1, 8 + 1,
1100 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
1101 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
1102 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
1104 byte [] offsets = new byte [] {
1105 0, 0, 1, 2,
1106 3, 3, 4, 5,
1107 6, 7, 8, 9,
1108 10, 10, 11, 11,
1109 12, 12, 13, 13,
1110 14, 14, 14, 14};
1111 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
1112 int flag = 0;
1113 if (s.IndexOf (" UP") >= 0)
1114 flag |= 1;
1115 if (s.IndexOf (" DOWN") >= 0)
1116 flag |= 2;
1117 if (s.IndexOf (" RIGHT") >= 0)
1118 flag |= 4;
1119 if (s.IndexOf (" LEFT") >= 0)
1120 flag |= 8;
1121 if (s.IndexOf (" VERTICAL") >= 0)
1122 flag |= 16;
1123 if (s.IndexOf (" HORIZONTAL") >= 0)
1124 flag |= 32;
1126 int fidx = flags.IndexOf (flag);
1127 if (fidx >= 0)
1128 value = offsets [fidx];
1129 } else if (s.IndexOf ("BLOCK") >= 0) {
1130 if (s.IndexOf ("ONE EIGHTH") >= 0)
1131 value = 0x12;
1132 else if (s.IndexOf ("ONE QUARTER") >= 0)
1133 value = 0x13;
1134 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1135 value = 0x14;
1136 else if (s.IndexOf ("HALF") >= 0)
1137 value = 0x15;
1138 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1139 value = 0x16;
1140 else if (s.IndexOf ("THREE QUARTERS") >= 0)
1141 value = 0x17;
1142 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1143 value = 0x18;
1144 else
1145 value = 0x19;
1147 else if (s.IndexOf ("SHADE") >= 0)
1148 value = 0x19;
1149 else if (s.IndexOf ("SQUARE") >= 0)
1150 value = 0xBC - 0xE5;
1151 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1152 value = 0xBE - 0xE5;
1153 else if (s.IndexOf ("RECTANGLE") >= 0)
1154 value = 0xBD - 0xE5;
1155 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1156 value = 0xBF - 0xE5;
1157 else if (s.IndexOf ("TRIANGLE") >= 0) {
1158 if (s.IndexOf ("UP-POINTING") >= 0)
1159 value = 0xC0 - 0xE5;
1160 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1161 value = 0xC1 - 0xE5;
1162 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1163 value = 0xC2 - 0xE5;
1164 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1165 value = 0xC3 - 0xE5;
1167 else if (s.IndexOf ("POINTER") >= 0) {
1168 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1169 value = 0xC4 - 0xE5;
1170 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1171 value = 0xC5 - 0xE5;
1173 else if (s.IndexOf ("DIAMOND") >= 0)
1174 value = 0xC6 - 0xE5;
1175 else if (s.IndexOf ("FISHEYE") >= 0)
1176 value = 0xC7 - 0xE5;
1177 else if (s.IndexOf ("LOZENGE") >= 0)
1178 value = 0xC8 - 0xE5;
1179 else if (s.IndexOf ("BULLSEYE") >= 0)
1180 value = 0xC9 - 0xE5;
1181 else if (s.IndexOf ("CIRCLE") >= 0) {
1182 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1183 value = 0xCA - 0xE5;
1184 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1185 value = 0xCB - 0xE5;
1186 else
1187 value = 0xC9 - 0xE5;
1189 else if (s.IndexOf ("BULLET") >= 0)
1190 value = 0xCC - 0xE5;
1191 if (0x25DA <= cp && cp <= 0x25E5)
1192 value = 0xCD + cp - 0x25DA - 0xE5;
1194 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1195 switch (cp) {
1196 case 0x2571: value = 0xF; break;
1197 case 0x2572: value = 0x10; break;
1198 case 0x2573: value = 0x11; break;
1200 if (value != int.MinValue)
1201 boxValues.Add (new DictionaryEntry (
1202 cp, value));
1205 // For some characters store the name and sort later
1206 // to determine sorting.
1207 if (0x2100 <= cp && cp <= 0x213F &&
1208 Char.IsSymbol ((char) cp))
1209 sortableCharNames.Add (
1210 new DictionaryEntry (cp, name));
1211 else if (0x3380 <= cp && cp <= 0x33DD)
1212 sortableCharNames.Add (new DictionaryEntry (
1213 cp, name.Substring (7)));
1215 if (Char.GetUnicodeCategory ((char) cp) ==
1216 UnicodeCategory.MathSymbol) {
1217 if (name.StartsWith ("CIRCLED "))
1218 diacritical [cp] = 0xEE;
1219 if (name.StartsWith ("SQUARED "))
1220 diacritical [cp] = 0xEF;
1223 // diacritical weights by character name
1224 if (diacritics.Length != diacriticWeights.Length)
1225 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1226 for (int d = diacritics.Length - 1; d >= 0; d--) {
1227 if (s.IndexOf (diacritics [d]) > 0) {
1228 diacritical [cp] += diacriticWeights [d];
1229 if (s.IndexOf ("COMBINING") >= 0)
1230 diacritical [cp] -= (byte) 2;
1231 break;
1233 // also process "COMBINING blah" here
1234 // For now it is limited to cp < 0x0370
1235 // if (cp < 0x0300 || cp >= 0x0370)
1236 // continue;
1237 string tmp = diacritics [d].TrimEnd (';');
1238 if (tmp.IndexOf ("WITH ") == 0)
1239 tmp = tmp.Substring (4);
1240 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1241 if (name == tmp) {
1242 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1243 break;
1245 //if (name == tmp)
1246 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1248 // Two-step grep required for it.
1249 if (s.IndexOf ("FULL STOP") > 0 &&
1250 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1251 diacritical [cp] |= 0xF4;
1252 if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1253 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1254 s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1256 // Arabic letter name
1257 if (0x0621 <= cp && cp <= 0x064A &&
1258 Char.GetUnicodeCategory ((char) cp)
1259 == UnicodeCategory.OtherLetter) {
1260 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1261 switch (cp) {
1262 case 0x0621:
1263 case 0x0624:
1264 case 0x0626:
1265 // hamza, waw, yeh ... special cases.
1266 value = 0x07;
1267 break;
1268 case 0x0649:
1269 case 0x064A:
1270 value = 0x77; // special cases.
1271 break;
1272 default:
1273 // Get primary letter name i.e.
1274 // XXX part of ARABIC LETTER XXX yyy
1275 // e.g. that of "TEH MARBUTA" is "TEH".
1276 string letterName =
1277 (cp == 0x0640) ?
1278 // 0x0640 is special: it does
1279 // not start with ARABIC LETTER
1280 name :
1281 name.Substring (14);
1282 int tmpIdx = letterName.IndexOf (' ');
1283 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1284 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1285 if (arabicNameMap.ContainsKey (letterName))
1286 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1287 else
1288 arabicNameMap [letterName] = cp;
1289 break;
1291 arabicLetterPrimaryValues [cp] = value;
1294 // Japanese square letter
1295 if (0x3300 <= cp && cp <= 0x3357)
1296 if (!ExistsJIS (cp))
1297 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1299 // normalizationType
1300 string decomp = values [4];
1301 idx = decomp.IndexOf ('<');
1302 if (idx >= 0) {
1303 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1304 case "full":
1305 decompType [cp] = DecompositionFull;
1306 break;
1307 case "sub":
1308 decompType [cp] = DecompositionSub;
1309 break;
1310 case "super":
1311 decompType [cp] = DecompositionSuper;
1312 break;
1313 case "small":
1314 decompType [cp] = DecompositionSmall;
1315 break;
1316 case "isolated":
1317 decompType [cp] = DecompositionIsolated;
1318 break;
1319 case "initial":
1320 decompType [cp] = DecompositionInitial;
1321 break;
1322 case "final":
1323 decompType [cp] = DecompositionFinal;
1324 break;
1325 case "medial":
1326 decompType [cp] = DecompositionMedial;
1327 break;
1328 case "noBreak":
1329 decompType [cp] = DecompositionNoBreak;
1330 break;
1331 case "compat":
1332 decompType [cp] = DecompositionCompat;
1333 break;
1334 case "fraction":
1335 decompType [cp] = DecompositionFraction;
1336 break;
1337 case "font":
1338 decompType [cp] = DecompositionFont;
1339 break;
1340 case "circle":
1341 decompType [cp] = DecompositionCircle;
1342 break;
1343 case "square":
1344 decompType [cp] = DecompositionSquare;
1345 break;
1346 case "wide":
1347 decompType [cp] = DecompositionWide;
1348 break;
1349 case "narrow":
1350 decompType [cp] = DecompositionNarrow;
1351 break;
1352 case "vertical":
1353 decompType [cp] = DecompositionVertical;
1354 break;
1355 default:
1356 throw new Exception ("Support NFKD type : " + decomp);
1359 else
1360 decompType [cp] = DecompositionCanonical;
1361 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1362 if (decomp.Length > 0) {
1364 string [] velems = decomp.Split (' ');
1365 int didx = decompValues.Count;
1366 decompIndex [cp] = didx;
1367 foreach (string v in velems)
1368 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1369 decompLength [cp] = velems.Length;
1371 // [decmpType] -> this_cp
1372 int targetCP = (int) decompValues [didx];
1373 // for "(x)" it specially maps to 'x' .
1374 // FIXME: check if it is sane
1375 if (velems.Length == 3 &&
1376 (int) decompValues [didx] == '(' &&
1377 (int) decompValues [didx + 2] == ')')
1378 targetCP = (int) decompValues [didx + 1];
1379 // special: 0x215F "1/"
1380 else if (cp == 0x215F)
1381 targetCP = '1';
1382 else if (velems.Length > 1 &&
1383 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1384 // skip them, except for CJK ideograph compat
1385 targetCP = 0;
1387 if (targetCP != 0) {
1388 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1389 if (entry == null) {
1390 entry = new Hashtable ();
1391 nfkdMap [targetCP] = entry;
1393 entry [(byte) decompType [cp]] = cp;
1396 // numeric values
1397 if (values [5].Length > 0)
1398 decimalValue [cp] = decimal.Parse (values [5]);
1399 else if (values [6].Length > 0)
1400 decimalValue [cp] = decimal.Parse (values [6]);
1401 else if (values [7].Length > 0) {
1402 string decstr = values [7];
1403 idx = decstr.IndexOf ('/');
1404 if (cp == 0x215F) // special. "1/"
1405 decimalValue [cp] = 0x1;
1406 else if (idx > 0)
1407 // m/n
1408 decimalValue [cp] =
1409 decimal.Parse (decstr.Substring (0, idx))
1410 / decimal.Parse (decstr.Substring (idx + 1));
1411 else if (decstr [0] == '(' &&
1412 decstr [decstr.Length - 1] == ')')
1413 // (n)
1414 decimalValue [cp] =
1415 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1416 else if (decstr [decstr.Length - 1] == '.')
1417 // n.
1418 decimalValue [cp] =
1419 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1420 else
1421 decimalValue [cp] = decimal.Parse (decstr);
// Reads the derived-core-properties data file one line at a time and hands
// each line to ProcessDerivedCorePropLine. If a line fails to process, its
// 1-based line number is written to stderr and the exception is rethrown.
void ParseDerivedCoreProperties (string filename)
{
	using (StreamReader reader = new StreamReader (filename)) {
		int lineNumber = 1;
		while (reader.Peek () >= 0) {
			try {
				ProcessDerivedCorePropLine (reader.ReadLine ());
			} catch (Exception) {
				Console.Error.WriteLine ("**** At line " + lineNumber);
				throw;
			}
			lineNumber++;
		}
	}
}
// Handles one line of the form "XXXX[..YYYY] ; Property # comment".
// Lines without a ';' (after the comment is stripped) are ignored.
// Only the "Uppercase" property is consumed: every code point in the
// range gets its isUppercase flag set.
void ProcessDerivedCorePropLine (string s)
{
	// Drop the trailing comment, if there is one.
	int hash = s.IndexOf ('#');
	string data = hash < 0 ? s : s.Substring (0, hash);

	int semicolon = data.IndexOf (';');
	if (semicolon < 0)
		return;

	// The code point spec is either a single hex value or "first..last".
	string range = data.Substring (0, semicolon);
	int dots = range.IndexOf ("..");
	NumberStyles hex = NumberStyles.HexNumber |
		NumberStyles.AllowTrailingWhite;
	int first, last;
	if (dots < 0) {
		first = int.Parse (range, hex);
		last = first;
	} else {
		first = int.Parse (range.Substring (0, dots), hex);
		last = int.Parse (range.Substring (dots + 2), hex);
	}
	string property = data.Substring (range.Length + 1).Trim ();

	// FIXME: use index
	// Ranges starting outside of the BMP are not handled.
	if (first > char.MaxValue)
		return;

	if (property == "Uppercase") {
		for (int cp = first; cp <= last; cp++)
			isUppercase [cp] = true;
	}
}
// Reads a script assignment file whose lines look like
// "XXXX[..YYYY] ; ScriptName # comment" and collects the non-ignorable
// characters of the Gurmukhi, Gujarati, Georgian and Thaana scripts.
// Each collection is then sorted with UCAComparer and stored into the
// corresponding ordered* field.
void ParseScripts (string filename)
{
	ArrayList gurmukhiChars = new ArrayList ();
	ArrayList gujaratiChars = new ArrayList ();
	ArrayList georgianChars = new ArrayList ();
	ArrayList thaanaChars = new ArrayList ();

	using (StreamReader reader = new StreamReader (filename)) {
		while (reader.Peek () >= 0) {
			string text = reader.ReadLine ();

			// Strip the comment, then require a ';' separator.
			int pos = text.IndexOf ('#');
			if (pos >= 0)
				text = text.Substring (0, pos);
			pos = text.IndexOf (';');
			if (pos < 0)
				continue;

			// Single hex code point, or a "first..last" range.
			string range = text.Substring (0, pos);
			pos = range.IndexOf ("..");
			NumberStyles hex = NumberStyles.HexNumber |
				NumberStyles.AllowTrailingWhite;
			int first = int.Parse (pos < 0 ? range : range.Substring (0, pos), hex);
			int last = pos < 0 ? first : int.Parse (range.Substring (pos + 2), hex);
			string script = text.Substring (range.Length + 1).Trim ();

			// FIXME: use index
			if (first > char.MaxValue)
				continue;

			// Pick the list for the scripts we care about.
			ArrayList target;
			switch (script) {
			case "Gurmukhi":
				target = gurmukhiChars;
				break;
			case "Gujarati":
				target = gujaratiChars;
				break;
			case "Georgian":
				target = georgianChars;
				break;
			case "Thaana":
				target = thaanaChars;
				break;
			default:
				target = null;
				break;
			}
			if (target == null)
				continue;

			for (int cp = first; cp <= last; cp++)
				if (!IsIgnorable (cp))
					target.Add ((char) cp);
		}
	}

	IComparer order = UCAComparer.Instance;
	gurmukhiChars.Sort (order);
	gujaratiChars.Sort (order);
	georgianChars.Sort (order);
	thaanaChars.Sort (order);

	Type charType = typeof (char);
	orderedGurmukhi = (char []) gurmukhiChars.ToArray (charType);
	orderedGujarati = (char []) gujaratiChars.ToArray (charType);
	orderedGeorgian = (char []) georgianChars.ToArray (charType);
	orderedThaana = (char []) thaanaChars.ToArray (charType);
}
// Reads the JIS ordering file and passes every line to
// ProcessJISOrderLine. On any failure (including opening the file) the
// current 1-based line counter is reported on stderr before the
// exception is rethrown.
void ParseJISOrder (string filename)
{
	int lineNumber = 1;
	try {
		using (StreamReader reader = new StreamReader (filename)) {
			while (reader.Peek () >= 0) {
				ProcessJISOrderLine (reader.ReadLine ());
				lineNumber++;
			}
		}
	} catch (Exception) {
		Console.Error.WriteLine ("---- line {0}", lineNumber);
		throw;
	}
}
// Characters that separate the two columns of a JIS order line.
char [] ws = new char [] {'\t', ' '};

// Handles one line of the form "0xJJJJ<ws>0xUUUU # comment": the first
// column is the JIS code and the second the Unicode code point, both
// hexadecimal with a "0x" prefix. The pair is appended to jisJapanese.
// Empty lines, comment-only lines and lines with a single column are
// ignored.
void ProcessJISOrderLine (string s)
{
	int idx = s.IndexOf ('#');
	if (idx >= 0)
		s = s.Substring (0, idx);
	// Trim unconditionally. Previously the line was trimmed only when
	// it contained a comment, so a whitespace-only or indented line
	// without '#' reached Substring (2, idx - 2) with idx == 0 and
	// threw ArgumentOutOfRangeException.
	s = s.Trim ();
	if (s.Length == 0)
		return;
	idx = s.IndexOfAny (ws);
	if (idx < 0)
		return;
	// They start with "0x" so cut them out.
	int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
	int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
	jisJapanese.Add (new JISCharacter (cp, jis));
}
// Fills the CJK weight tables (cjkCHS, cjkCHT, cjkJA, cjkKO and cjkKOlv2).
//
// zhXML: LDML file providing the 'pinyin' and 'stroke' collation rules.
// jaXML: not used by this method; Japanese order comes from jisJapanese.
// koXML: LDML file providing the Korean reset rules.
//
// For chs, cht and ja, weights are handed out sequentially; whenever the
// running value reaches a multiple of 256 it is bumped by 2, so the low
// byte never takes the values 0 or 1.
void ParseCJK (string zhXML, string jaXML, string koXML)
{
	XmlDocument doc = new XmlDocument ();
	doc.XmlResolver = null;
	int weight;

	// Chinese Simplified
	doc.Load (zhXML);
	string pinyin = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
	weight = 0x8008;
	foreach (char ch in pinyin) {
		if (ch < '\u3100') {
			Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", "chs", (int) ch, weight);
			continue;
		}
		cjkCHS [(int) ch] = (ushort) weight++;
		if (weight % 256 == 0)
			weight += 2;
	}

	// Chinese Traditional (same document as above)
	string stroke = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
	weight = 0x8002;
	foreach (char ch in stroke) {
		if (ch < '\u4E00') {
			Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", "cht", (int) ch, weight);
			continue;
		}
		cjkCHT [(int) ch] = (ushort) weight++;
		if (weight % 256 == 0)
			weight += 2;
	}

	// Japanese

	// SPECIAL CASES
	cjkJA [0x4EDD] = 0x8002; // Chinese repetition mark?
	cjkJA [0x337B] = 0x8004; // Those 4 characters are Gengou
	cjkJA [0x337E] = 0x8005;
	cjkJA [0x337D] = 0x8006;
	cjkJA [0x337C] = 0x8007;

	weight = 0x8008;
	foreach (JISCharacter jisChar in jisJapanese) {
		if (jisChar.JIS < 0x8800)
			continue;
		char kanji = (char) jisChar.CP;

		// Characters below U+4E00 are silently left out.
		if (kanji < '\u4E00')
			continue;

		cjkJA [(int) kanji] = (ushort) weight++;
		if (weight % 256 == 0)
			weight += 2;

		// SPECIAL CASES: no compatibility characters are
		// placed after these.
		bool skipCompat =
			kanji == '\u662D' || // U+337C
			kanji == '\u5927' || // U+337D
			kanji == '\u5E73' || // U+337B
			kanji == '\u660E' || // U+337E
			kanji == '\u9686';   // U+F9DC
		if (skipCompat)
			continue;

		// Place characters that decompose to this one right
		// after it.
		// FIXME: there are still remaining
		// characters after U+FA0C.
		for (int cp = 0; cp < '\uFA0D'; cp++) {
			if (decompIndex [cp] == 0 || IsIgnorable (cp))
				continue;
			// Either the decomposition starts with this
			// character, or it has three elements and the
			// middle one is this character.
			bool related =
				decompValues [decompIndex [cp]] == kanji ||
				decompLength [cp] == 3 &&
				decompValues [decompIndex [cp] + 1] == kanji;
			if (!related)
				continue;
			cjkJA [cp] = (ushort) weight++;
			if (weight % 256 == 0)
				weight += 2;
		}
	}

	// Korean
	// Korean weight is somewhat complex. It first shifts
	// Hangul category from 52-x to 80-x (they are anyways
	// computed). CJK ideographs are placed at secondary
	// weight, like XX YY 01 zz 01, where XX and YY are
	// corresponding "reset" value and zz is 41,43,45...
	//
	// Unlike chs, cht and ja, the Korean value is a combined
	// ushort which is computed as category
	doc.Load (koXML);
	foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
		XmlElement following = (XmlElement) reset.NextSibling;
		// Compute "category" and "level 1" for the target
		// "reset" Hangul syllable.
		char syllable = reset.InnerText [0];
		int ordinal = ((int) syllable - 0xAC00) + 1;
		ushort primary = (ushort)
			((ordinal / 254) * 256 + (ordinal % 254) + 2);
		// Place the characters after the target.
		string members = following.InnerText;
		int secondary = 0x41;
		foreach (char ch in members) {
			cjkKO [(int) ch] = primary;
			cjkKOlv2 [(int) ch] = (byte) secondary;
			secondary += 2;
		}
	}
}
1698 #endregion
1700 #region Generation
// Populates ignorableFlags for every assigned UTF-16 code unit:
// bit 1 = IsIgnorable, bit 2 = IsIgnorableSymbol,
// bit 4 = IsIgnorableNonSpacing. Unassigned code points are left alone.
void FillIgnorables ()
{
	int cp = 0;
	while (cp <= char.MaxValue) {
		UnicodeCategory category = Char.GetUnicodeCategory ((char) cp);
		if (category != UnicodeCategory.OtherNotAssigned) {
			if (IsIgnorable (cp))
				ignorableFlags [cp] |= 1;
			if (IsIgnorableSymbol (cp))
				ignorableFlags [cp] |= 2;
			if (IsIgnorableNonSpacing (cp))
				ignorableFlags [cp] |= 4;
		}
		cp++;
	}
}
// Applies hand-written corrections on top of the parsed Unicode data:
// extra uppercase flags, removed or altered decomposition mappings and a
// few fixed diacritical weights. The decomposition value table is edited
// through a growable copy and written back to the field at the end.
void ModifyUnidata ()
{
	ArrayList values = new ArrayList (this.decompValues);

	// Hebrew uppercase letters.
	int [] hebrewUppercase = new int []
		{0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6};
	for (int n = 0; n < hebrewUppercase.Length; n++)
		isUppercase [hebrewUppercase [n]] = true;

	// Modify some decomposition equivalence: U+FE31..U+FE34 and
	// U+037E are treated as having no decomposition at all.
	int [] withoutDecomposition = new int []
		{0xFE31, 0xFE32, 0xFE33, 0xFE34, 0x037E};
	foreach (int cp in withoutDecomposition) {
		decompType [cp] = 0;
		decompIndex [cp] = 0;
		decompLength [cp] = 0;
	}

	// Hangzhou numbers
	for (int cp = 0x3021; cp <= 0x3029; cp++)
		diacritical [cp] = 0x4E;
	// Korean parens numbers
	for (int cp = 0x3200; cp <= 0x321C; cp++)
		diacritical [cp] = 0xA;
	for (int cp = 0x3260; cp <= 0x327B; cp++)
		diacritical [cp] = 0xC;

	// LAMESPEC: these remapping should not be done.
	// Windows have incorrect CJK compat mappings.
	values [decompIndex [0x32A9]] = 0x91AB;

	decompLength [0x323B] = 1;
	values [decompIndex [0x323B]] = 0x5B78;
	values [decompIndex [0x32AB]] = 0x5B78;

	values [decompIndex [0x32A2]] = 0x5BEB;

	decompLength [0x3238] = 1;
	values [decompIndex [0x3238]] = 0x52DE;
	values [decompIndex [0x3298]] = 0x52DE;

	// LAMESPEC: custom remapping (which is not bugs but not fine,
	// non-standard compliant things)
	// U+FA0C gets a fresh single-element entry appended to the table.
	decompIndex [0xFA0C] = values.Count;
	values.Add ((int) 0x5140);
	decompLength [0xFA0C] = 1;

	// U+F929 loses its decomposition.
	decompLength [0xF929] = 0;
	decompIndex [0xF929] = 0;

	values [decompIndex [0xF92C]] = 0x90DE;

	// U+2125 gets a fresh single-element <font> entry (U+005A).
	decompIndex [0x2125] = values.Count;
	values.Add ((int) 0x005A);
	decompLength [0x2125] = 1;
	decompType [0x2125] = DecompositionFont;

	this.decompValues = (int []) values.ToArray (typeof (int));
}
// Overrides values computed while parsing with hand-picked ones:
//  1. fixed diacritical weights for individual characters,
//  2. secondary weights for number ranges listed in
//     numberSecondaryWeightBounds (pairs of [start, end) bounds, one
//     weight per pair starting at 0x38),
//  3. fixed weights for some Gurmukhi and Oriya letters,
//  4. renaming or removal of entries in sortableCharNames.
void ModifyParsedValues ()
{
	// Pairs of (code point, diacritical weight), applied in order.
	int [] fixedWeights = new int [] {
		// Sometimes STROKE don't work fine
		0xD8, 0x21, 0xF8, 0x21,
		0x141, 0x1F, 0x142, 0x1F,
		// FIXME: why?
		0xAA, 3, 0xBA, 3,
		0xD0, 0x68, 0xF0, 0x68,
		0x131, 3,
		0x138, 3,
		// TOPBAR does not work as an identifier for the weight
		0x182, 0x68, 0x183, 0x68, // B
		0x18B, 0x1E, 0x18C, 0x1E, // D
		// TONE TWO
		0x1A7, 0x87, 0x1A8, 0x87,
		// TONE SIX
		0x184, 0x87, 0x185, 0x87,
		// OPEN E
		0x190, 0x7B, 0x25B, 0x7B,
		// There are many letters w/ diacritical weight 0x7B
		0x0192, 0x7B, 0x0194, 0x7B,
		0x0195, 0x7B, 0x0196, 0x7B,
		0x019C, 0x7B, 0x019E, 0x7B,
		0x01A6, 0x7B, 0x01B1, 0x7B,
		0x01B2, 0x7B, 0x01BF, 0x7B,
		// ... as well as 0x7C
		0x01A2, 0x7C, 0x01A3, 0x7C,

		// <font> NFKD characters seem to have diacritical
		// weight as 3,4,5... but the order does not look
		// by codepoint and I have no idea how they are sorted.
		0x210E, 3,
		0x210F, 0x68,
		0x2110, 4,
		0x2111, 5,
		0x2112, 4,
		0x2113, 4,
		0x211B, 4,
		0x211C, 5,

		// some cyrillic diacritical weight. They seem to be
		// based on old character names, so it's quicker to
		// set them directly here.
		// FIXME: they are by mostly unknown reason
		0x0496, 7, 0x0497, 7,
		0x0498, 0x1A, 0x0499, 0x1A,
		0x049A, 0x17, 0x049B, 0x17,
		0x049C, 9, 0x049D, 9,
		0x049E, 4, 0x049F, 4,
		0x04A0, 0xA, 0x04A1, 0xA,
		0x04A2, 7, 0x04A3, 7,
		0x04A4, 8, 0x04A5, 8,
		0x04AA, 0x1A, 0x04AB, 0x1A, // ES CEDILLA?
		0x04AC, 7, 0x04AD, 7, // RIGHT DESCENDER? but U+4B2
		0x04AE, 0xB, 0x04AF, 0xB, // STRAIGHT U?
		0x04B2, 0x17, 0x04B3, 0x17, // RIGHT DESCENDER? but U+4AC
		0x04B4, 3, 0x04B5, 3,
		0x04B6, 8,
		0x04B7, 7,
		0x04B8, 9, 0x04B9, 9,
		0x04BA, 9, 0x04BB, 9
	};
	for (int n = 0; n < fixedWeights.Length; n += 2)
		diacritical [fixedWeights [n]] = (byte) fixedWeights [n + 1];

	// number, secondary weights
	int [] bounds = numberSecondaryWeightBounds;
	byte numberWeight = 0x38;
	for (int b = 0; b < bounds.Length; b += 2) {
		for (int cp = bounds [b]; cp < bounds [b + 1]; cp++) {
			if (Char.IsNumber ((char) cp))
				diacritical [cp] = numberWeight;
		}
		numberWeight++;
	}

	// Gurmukhi special letters' diacritical weight
	for (int cp = 0x0A50; cp < 0x0A60; cp++)
		diacritical [cp] = 4;
	// Oriya special letters' diacritical weight
	for (int cp = 0x0B5C; cp < 0x0B60; cp++)
		diacritical [cp] = 6;

	// Update name part of named characters
	int index = 0;
	while (index < sortableCharNames.Count) {
		DictionaryEntry current =
			(DictionaryEntry) sortableCharNames [index];
		int cp = (int) current.Key;
		string newName = null;
		bool remove = false;
		switch (cp) {
		case 0x2101: newName = "A_1"; break;
		case 0x33C3: newName = "A_2"; break;
		case 0x2105: newName = "C_1"; break;
		case 0x2106: newName = "C_2"; break;
		case 0x211E: newName = "R1"; break;
		case 0x211F: newName = "R2"; break;
		// Remove some of them!
		case 0x2103:
		case 0x2109:
		case 0x2116:
		case 0x2117:
		case 0x2118:
		case 0x2125:
		case 0x2127:
		case 0x2129:
		case 0x212E:
		case 0x2132:
			remove = true;
			break;
		}
		if (remove) {
			// The next entry slides into this slot, so the
			// index must not advance.
			sortableCharNames.RemoveAt (index);
			continue;
		}
		if (newName != null)
			sortableCharNames [index] =
				new DictionaryEntry (cp, newName);
		index++;
	}
}
1884 void GenerateCore ()
1886 UnicodeCategory uc;
1888 #region Specially ignored // 01
1889 // This will raise "Defined" flag up.
1890 // FIXME: Check If it is really fine. Actually for
1891 // Japanese voice marks this code does remapping.
1892 foreach (char c in specialIgnore)
1893 map [(int) c] = new CharMapEntry (0, 0, 0);
1894 #endregion
1896 #region Extenders (FF FF)
1897 fillIndex [0xFF] = 0xFF;
1898 char [] specialBiggest = new char [] {
1899 '\u3005', '\u3031', '\u3032', '\u309D',
1900 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1901 '\uFE7C', '\uFE7D', '\uFF70'};
1902 foreach (char c in specialBiggest)
1903 AddCharMap (c, 0xFF, 0);
1904 #endregion
1906 #region Variable weights
1907 // Controls : 06 03 - 06 3D
1908 fillIndex [0x6] = 3;
1909 for (int i = 0; i < 65536; i++) {
1910 if (IsIgnorable (i))
1911 continue;
1912 char c = (char) i;
1913 uc = Char.GetUnicodeCategory (c);
1914 // NEL is whitespace but not ignored here.
1915 if (uc == UnicodeCategory.Control &&
1916 !Char.IsWhiteSpace (c) || c == '\u0085')
1917 AddCharMap (c, 6, 1);
1920 // Apostrophe 06 80
1921 fillIndex [0x6] = 0x80;
1922 AddCharMap ('\'', 6, 0);
1923 AddCharMap ('\uFF07', 6, 1);
1924 AddCharMap ('\uFE63', 6, 1);
1926 // SPECIAL CASE: fill FE32 here in prior to be added
1927 // at 2013. Windows does not always respect NFKD.
1928 map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1930 // Hyphen/Dash : 06 81 - 06 90
1931 for (int i = 0; i < char.MaxValue; i++) {
1932 if (!IsIgnorable (i) &&
1933 Char.GetUnicodeCategory ((char) i) ==
1934 UnicodeCategory.DashPunctuation) {
1935 AddCharMapGroup2 ((char) i, 6, 1, 0);
1936 if (i == 0x2011) {
1937 // SPECIAL: add 2027 and 2043
1938 // Maybe they are regarded the
1939 // same hyphens in "central"
1940 // position.
1941 AddCharMap ('\u2027', 6, 1);
1942 AddCharMap ('\u2043', 6, 1);
1946 // They are regarded as primarily equivalent to '-'
1947 map [0x208B] = new CharMapEntry (6, 0x82, 0);
1948 map [0x207B] = new CharMapEntry (6, 0x82, 0);
1949 map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1951 // Arabic variable weight chars 06 A0 -
1952 fillIndex [6] = 0xA0;
1953 // vowels
1954 for (int i = 0x64B; i <= 0x650; i++)
1955 AddArabicCharMap ((char) i, 6, 1, 0);
1956 // sukun
1957 AddCharMapGroup ('\u0652', 6, 1, 0);
1958 // shadda
1959 AddCharMapGroup ('\u0651', 6, 1, 0);
1960 #endregion
1963 #region Nonspacing marks // 01
1964 // FIXME: 01 03 - 01 B6 ... annoyance :(
1966 // Combining diacritical marks: 01 DC -
1968 fillIndex [0x1] = 0x41;
1969 for (int i = 0x030E; i <= 0x0326; i++)
1970 if (!IsIgnorable (i))
1971 AddCharMap ((char) i, 0x1, 1);
1972 for (int i = 0x0329; i <= 0x0334; i++)
1973 if (!IsIgnorable (i))
1974 AddCharMap ((char) i, 0x1, 1);
1975 fillIndex [0x1]++;
1976 for (int i = 0x0339; i <= 0x0341; i++)
1977 if (!IsIgnorable (i))
1978 AddCharMap ((char) i, 0x1, 1);
1979 fillIndex [0x1] = 0x74;
1980 for (int i = 0x0346; i <= 0x0348; i++)
1981 if (!IsIgnorable (i))
1982 AddCharMap ((char) i, 0x1, 1);
1983 for (int i = 0x02BE; i <= 0x02BF; i++)
1984 if (!IsIgnorable (i))
1985 AddCharMap ((char) i, 0x1, 1);
1986 for (int i = 0x02C1; i <= 0x02C5; i++)
1987 if (!IsIgnorable (i))
1988 AddCharMap ((char) i, 0x1, 1);
1989 for (int i = 0x02CE; i <= 0x02CF; i++)
1990 if (!IsIgnorable (i))
1991 AddCharMap ((char) i, 0x1, 1);
1992 fillIndex [0x1]++;
1993 for (int i = 0x02D1; i <= 0x02D3; i++)
1994 if (!IsIgnorable (i))
1995 AddCharMap ((char) i, 0x1, 1);
1996 AddCharMap ('\u02DE', 0x1, 1);
1997 for (int i = 0x02E4; i <= 0x02E9; i++)
1998 if (!IsIgnorable (i))
1999 AddCharMap ((char) i, 0x1, 1);
2002 // FIXME: needs more love here (it should eliminate
2003 // all the hacky code above).
2004 for (int i = 0x0300; i < 0x0370; i++)
2005 if (!IsIgnorable (i) && diacritical [i] != 0
2006 && !map [i].Defined)
2007 map [i] = new CharMapEntry (
2008 0x1, 0x1, diacritical [i]);
2010 // Cyrillic and Armenian nonspacing mark
2011 fillIndex [0x1] = 0x94;
2012 for (int i = 0x400; i < 0x580; i++)
2013 if (!IsIgnorable (i) &&
2014 Char.GetUnicodeCategory ((char) i) ==
2015 UnicodeCategory.NonSpacingMark)
2016 AddCharMap ((char) i, 1, 1);
2018 fillIndex [0x1] = 0x8D;
2019 // syriac dotted nonspacing marks (1)
2020 AddCharMap ('\u0740', 0x1, 1);
2021 AddCharMap ('\u0741', 0x1, 1);
2022 AddCharMap ('\u0742', 0x1, 1);
2023 // syriac oblique nonspacing marks
2024 AddCharMap ('\u0747', 0x1, 1);
2025 AddCharMap ('\u0748', 0x1, 1);
2026 // syriac dotted nonspacing marks (2)
2027 fillIndex [0x1] = 0x94; // this reset is mandatory
2028 AddCharMap ('\u0732', 0x1, 1);
2029 AddCharMap ('\u0735', 0x1, 1);
2030 AddCharMap ('\u0738', 0x1, 1);
2031 AddCharMap ('\u0739', 0x1, 1);
2032 AddCharMap ('\u073C', 0x1, 1);
2033 // SPECIAL CASES: superscripts
2034 AddCharMap ('\u073F', 0x1, 1);
2035 AddCharMap ('\u0711', 0x1, 1);
2036 // syriac "DOTS"
2037 for (int i = 0x0743; i <= 0x0746; i++)
2038 AddCharMap ((char) i, 0x1, 1);
2039 for (int i = 0x0730; i <= 0x0780; i++)
2040 if (!map [i].Defined &&
2041 Char.GetUnicodeCategory ((char) i) ==
2042 UnicodeCategory.NonSpacingMark)
2043 AddCharMap ((char) i, 0x1, 1);
2045 // LAMESPEC: It should not stop at '\u20E1'. There are
2046 // a few more characters (that however results in
2047 // overflow of level 2 unless we start before 0xDD).
2048 fillIndex [0x1] = 0xDD;
2049 for (int i = 0x20D0; i <= 0x20DC; i++)
2050 AddCharMap ((char) i, 0x1, 1);
2051 fillIndex [0x1] = 0xEC;
2052 for (int i = 0x20DD; i <= 0x20E1; i++)
2053 AddCharMap ((char) i, 0x1, 1);
2054 fillIndex [0x1] = 0x4;
2055 AddCharMap ('\u0CD5', 0x1, 1);
2056 AddCharMap ('\u0CD6', 0x1, 1);
2057 AddCharMap ('\u093C', 0x1, 1);
2058 for (int i = 0x302A; i <= 0x302D; i++)
2059 AddCharMap ((char) i, 0x1, 1);
2060 AddCharMap ('\u0C55', 0x1, 1);
2061 AddCharMap ('\u0C56', 0x1, 1);
2063 fillIndex [0x1] = 0x50; // I wonder how they are sorted
2064 for (int i = 0x02D4; i <= 0x02D7; i++)
2065 AddCharMap ((char) i, 0x1, 1);
2067 // They are not part of Nonspacing marks, but have
2068 // only diacritical weight.
2069 for (int i = 0x3099; i <= 0x309C; i++)
2070 map [i] = new CharMapEntry (1, 1, 1);
2071 map [0xFF9E] = new CharMapEntry (1, 1, 1);
2072 map [0xFF9F] = new CharMapEntry (1, 1, 2);
2073 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
2074 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
2075 for (int i = 0x30FC; i <= 0x30FE; i++)
2076 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
2078 fillIndex [0x1] = 0xA;
2079 for (int i = 0x0951; i <= 0x0954; i++)
2080 AddCharMap ((char) i, 0x1, 2);
2082 #endregion
2085 #region Whitespaces // 07 03 -
2086 fillIndex [0x7] = 0x2;
2087 AddCharMap (' ', 0x7, 2);
2088 AddCharMap ('\u00A0', 0x7, 1);
2089 for (int i = 9; i <= 0xD; i++)
2090 AddCharMap ((char) i, 0x7, 1);
2091 for (int i = 0x2000; i <= 0x200B; i++)
2092 AddCharMap ((char) i, 0x7, 1);
2094 fillIndex [0x7] = 0x17;
2095 AddCharMapGroup ('\u2028', 0x7, 1, 0);
2096 AddCharMapGroup ('\u2029', 0x7, 1, 0);
2098 // Characters which used to represent layout control.
2099 // LAMESPEC: Windows developers seem to have thought
2100 // that those characters are kind of whitespaces,
2101 // while they aren't.
2102 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
2103 AddCharMap ('\u2423', 0x7, 1, 0); // open box
2105 #endregion
2107 // category 09 - continued symbols from 08
2108 fillIndex [0x9] = 2;
2109 // misc tech mark
2110 for (int cp = 0x2300; cp <= 0x237A; cp++)
2111 AddCharMap ((char) cp, 0x9, 1, 0);
2113 // arrows
2114 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
2115 foreach (DictionaryEntry de in arrowValues) {
2116 int idx = (int) de.Value;
2117 int cp = (int) de.Key;
2118 if (map [cp].Defined)
2119 continue;
2120 fillIndex [0x9] = (byte) (0xD8 + idx);
2121 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
2122 arrowLv2 [idx]++;
2124 // boxes
2125 byte [] boxLv2 = new byte [128];
2126 // 0-63 will be used for those offsets are positive,
2127 // and 64-127 are for negative ones.
2128 for (int i = 0; i < boxLv2.Length; i++)
2129 boxLv2 [i] = 3;
2130 foreach (DictionaryEntry de in boxValues) {
2131 int cp = (int) de.Key;
2132 int off = (int) de.Value;
2133 if (map [cp].Defined)
2134 continue;
2135 if (off < 0) {
2136 fillIndex [0x9] = (byte) (0xE5 + off);
2137 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
2139 else {
2140 fillIndex [0x9] = (byte) (0xE5 + off);
2141 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
2144 // Some special characters (slanted)
2145 fillIndex [0x9] = 0xF4;
2146 AddCharMap ('\u2571', 0x9, 3);
2147 AddCharMap ('\u2572', 0x9, 3);
2148 AddCharMap ('\u2573', 0x9, 3);
2150 // FIXME: implement 0A
2151 #region Symbols
2152 fillIndex [0xA] = 2;
2153 // byte currency symbols
2154 for (int cp = 0; cp < 0x100; cp++) {
2155 uc = Char.GetUnicodeCategory ((char) cp);
2156 if (!IsIgnorable (cp) &&
2157 uc == UnicodeCategory.CurrencySymbol &&
2158 cp != '$')
2159 AddCharMapGroup ((char) cp, 0xA, 1, 0);
2161 // byte other symbols
2162 for (int cp = 0; cp < 0x100; cp++) {
2163 if (cp == 0xA6)
2164 continue; // SPECIAL: skip FIXME: why?
2165 uc = Char.GetUnicodeCategory ((char) cp);
2166 if (!IsIgnorable (cp) &&
2167 uc == UnicodeCategory.OtherSymbol ||
2168 cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
2169 AddCharMapGroup ((char) cp, 0xA, 1, 0);
2171 // U+30FB here
2172 AddCharMapGroup ('\u30FB', 0xA, 1, 0);
2174 for (int cp = 0x2020; cp <= 0x2031; cp++)
2175 if (Char.IsPunctuation ((char) cp))
2176 AddCharMap ((char) cp, 0xA, 1, 0);
2177 // SPECIAL CASES: why?
2178 AddCharMap ('\u203B', 0xA, 1, 0);
2179 AddCharMap ('\u2040', 0xA, 1, 0);
2180 AddCharMap ('\u2041', 0xA, 1, 0);
2181 AddCharMap ('\u2042', 0xA, 1, 0);
2183 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
2184 AddCharMap ((char) cp, 0xA, 1, 0);
2186 // 3004 is skipped at first...
2187 for (int cp = 0x3010; cp <= 0x3040; cp++)
2188 if (Char.IsSymbol ((char) cp))
2189 AddCharMap ((char) cp, 0xA, 1, 0);
2190 // SPECIAL CASES: added here
2191 AddCharMap ('\u3004', 0xA, 1, 0);
2192 AddCharMap ('\u327F', 0xA, 1, 0);
2194 for (int cp = 0x2600; cp <= 0x2613; cp++)
2195 AddCharMap ((char) cp, 0xA, 1, 0);
2196 // Dingbats
2197 for (int cp = 0x2620; cp <= 0x2770; cp++)
2198 if (Char.IsSymbol ((char) cp))
2199 AddCharMap ((char) cp, 0xA, 1, 0);
2200 // OCR
2201 for (int i = 0x2440; i < 0x2460; i++)
2202 AddCharMap ((char) i, 0xA, 1, 0);
2204 // SPECIAL CASES: why?
2205 AddCharMap ('\u0E3F', 0xA, 1, 0);
2206 AddCharMap ('\u2117', 0xA, 1, 0);
2207 AddCharMap ('\u20AC', 0xA, 1, 0);
2208 #endregion
2210 #region Numbers // 0C 02 - 0C E1
2211 fillIndex [0xC] = 2;
2213 // 9F8 : Bengali "one less than the denominator"
2214 AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2216 ArrayList numbers = new ArrayList ();
2217 for (int i = 0; i < 65536; i++)
2218 if (!IsIgnorable (i) &&
2219 Char.IsNumber ((char) i) &&
2220 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2221 numbers.Add (i);
2223 ArrayList numberValues = new ArrayList ();
2224 foreach (int i in numbers)
2225 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2226 // SPECIAL CASE: Cyrillic Thousand sign
2227 numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2228 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2230 //foreach (DictionaryEntry de in numberValues)
2231 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2233 // FIXME: fillIndex adjustment lines are too
2234 // complicated. It must be simpler.
2235 decimal prevValue = -1;
2236 foreach (DictionaryEntry de in numberValues) {
2237 int cp = (int) de.Key;
2238 decimal currValue = (decimal) de.Value;
2239 bool addnew = false;
2240 if (prevValue < currValue &&
2241 prevValue - (int) prevValue == 0 &&
2242 prevValue >= 1) {
2244 addnew = true;
2245 // Process Hangzhou and Roman numbers
2247 // There are some SPECIAL cases.
2248 if (currValue != 4) // no increment for 4
2249 fillIndex [0xC]++;
2251 int xcp;
2252 if (currValue <= 13) {
2253 if (currValue == 4)
2254 fillIndex [0xC]++;
2255 // SPECIAL CASE
2256 if (currValue == 11)
2257 AddCharMap ('\u0BF0', 0xC, 1);
2258 xcp = (int) prevValue + 0x2160 - 1;
2259 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2260 xcp = (int) prevValue + 0x2170 - 1;
2261 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2262 fillIndex [0xC]++;
2264 if (currValue < 12)
2265 fillIndex [0xC]++;
2266 if (currValue <= 10) {
2267 xcp = (int) prevValue + 0x3021 - 1;
2268 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2269 fillIndex [0xC]++;
2272 if (prevValue < currValue)
2273 prevValue = currValue;
2274 if (map [cp].Defined)
2275 continue;
2276 // Hangzhou and Roman numerals are added later
2277 // (the code is above)
2278 if (0x3021 <= cp && cp < 0x302A
2279 || 0x2160 <= cp && cp < 0x216C
2280 || 0x2170 <= cp && cp < 0x217C)
2281 continue;
2283 if (cp == 0x215B) // FIXME: why?
2284 fillIndex [0xC] += 2;
2285 else if (cp == 0x3021) // FIXME: why?
2286 fillIndex [0xC]++;
2287 if (addnew || cp <= '9') {
2288 int mod = (int) currValue - 1;
2289 int xcp;
2290 if (1 <= currValue && currValue <= 11) {
2291 xcp = mod + 0x2776;
2292 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2293 xcp = mod + 0x2780;
2294 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2295 xcp = mod + 0x278A;
2296 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2298 if (1 <= currValue && currValue <= 20) {
2299 xcp = mod + 0x2460;
2300 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2301 xcp = mod + 0x2474;
2302 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2303 xcp = mod + 0x2488;
2304 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2307 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2308 fillIndex [0xC]++;
2309 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2311 switch (cp) {
2312 // Maybe Bengali digit numbers do not increase
2313 // indexes, but 0x09E6 does.
2314 case 0x09E7: case 0x09E8: case 0x09E9:
2315 case 0x09EA:
2316 // SPECIAL CASES
2317 case 0x0BF0: case 0x2180: case 0x2181:
2318 break;
2319 // SPECIAL CASE
2320 case 0x0BF1:
2321 fillIndex [0xC]++;
2322 break;
2323 default:
2324 if (currValue < 11 || currValue == 1000)
2325 fillIndex [0xC]++;
2326 break;
2329 // Add special cases that are not regarded as
2330 // numbers in UnicodeCategory speak.
2331 if (cp == '5') {
2332 // TONE FIVE
2333 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2334 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2336 else if (cp == '2' || cp == '6') // FIXME: why?
2337 fillIndex [0xC]++;
2340 // 221E: infinity
2341 fillIndex [0xC] = 0xFF;
2342 AddCharMap ('\u221E', 0xC, 1);
2343 #endregion
2345 #region Letters and NonSpacing Marks (general)
2347 // ASCII Latin alphabets
2348 for (int i = 0; i < alphabets.Length; i++)
2349 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2351 // non-ASCII Latin alphabets
2352 // FIXME: there are no characters that are placed
2353 // *after* the "alphabets" array items. This is nothing
2354 // more than a hack that creates dummy weights for
2355 // primary characters.
2356 for (int i = 0x0080; i < 0x0300; i++) {
2357 if (!Char.IsLetter ((char) i))
2358 continue;
2359 // Latin letters that have an NFKD decomposition are
2360 // not added as independent primary characters.
2361 if (decompIndex [i] != 0)
2362 continue;
2363 // SPECIAL CASES:
2364 // 1.some alphabets have primarily
2365 // equivalent ASCII alphabets.
2366 // 2.some have independent primary weights,
2367 // but inside a-to-z range.
2368 // 3.there are some expanded characters that
2369 // are not part of Unicode Standard NFKD.
2370 // 4. some characters are letters according to IsLetter
2371 // but not in sortkeys (maybe a Unicode version
2372 // difference caused it).
2373 switch (i) {
2374 // 1. skipping them does not make sense
2375 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2376 // case 0x184: case 0x185: case 0x186: case 0x189:
2377 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2378 // case 0x194: case 0x195: case 0x196: case 0x19A:
2379 // case 0x19B: case 0x19C:
2380 // 2. skipping them does not make sense
2381 // case 0x14A: // Ng
2382 // case 0x14B: // ng
2383 // 3.
2384 case 0xC6: // AE
2385 case 0xE6: // ae
2386 case 0xDE: // Icelandic Thorn
2387 case 0xFE: // Icelandic Thorn
2388 case 0xDF: // German ss
2389 case 0xFF: // German ss
2390 // 4.
2391 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2392 // not classified yet
2393 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2394 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2395 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2396 // case 0x1DD:
2397 continue;
2399 AddCharMapGroup ((char) i, 0xE, 1, 0);
2402 // IPA extensions
2403 // FIXME: this results in not equivalent values to
2404 // Windows, but is safer for comparison.
2405 char [] ipaArray = new char [0x300 - 0x250 + 0x20];
2406 for (int i = 0x40; i < 0x60; i++)
2407 if (Char.IsLetter ((char) i))
2408 ipaArray [i - 0x40] = (char) (i);
2409 for (int i = 0x250; i < 0x300; i++)
2410 if (Char.IsLetter ((char) i))
2411 ipaArray [i - 0x250 + 0x20] = (char) i;
2412 Array.Sort (ipaArray, UCAComparer.Instance);
2413 int targetASCII = 0;
2414 byte latinDiacritical = 0x7B;
2415 foreach (char c in ipaArray) {
2416 if (c <= 'Z') {
2417 targetASCII = c;
2418 latinDiacritical = 0x7B;
2420 else
2421 map [(int) c] = new CharMapEntry (
2422 0xE,
2423 map [targetASCII].Level1,
2424 latinDiacritical++);
2427 // Greek and Coptic
2429 // FIXME: this is (mysterious and) incomplete.
2430 for (int i = 0x0380; i < 0x0400; i++)
2431 if (diacritical [i] == 0 &&
2432 decompLength [i] == 1 &&
2433 decompType [i] == DecompositionCompat)
2434 diacritical [i] = 3;
2436 fillIndex [0xF] = 2;
2437 for (int i = 0x0391; i < 0x03AA; i++)
2438 if (i != 0x03A2)
2439 AddCharMap ((char) i, 0xF, 1,
2440 diacritical [i]);
2441 fillIndex [0xF] = 2;
2442 for (int i = 0x03B1; i < 0x03CA; i++)
2443 if (i != 0x03C2)
2444 AddCharMap ((char) i, 0xF, 1,
2445 diacritical [i]);
2446 // Final Sigma
2447 map [0x03C2] = new CharMapEntry (0xF,
2448 map [0x03C3].Level1, map [0x03C3].Level2);
2450 fillIndex [0xF] = 0x40;
2451 for (int i = 0x03DA; i < 0x03F0; i++)
2452 AddCharMap ((char) i, 0xF,
2453 (byte) (i % 2 == 0 ? 0 : 2),
2454 diacritical [i]);
2456 // NFKD
2457 for (int i = 0x0386; i <= 0x0400; i++)
2458 FillLetterNFKD (i, true, true);
2460 // Cyrillic.
2461 // Cyrillic letters are sorted like Latin letters i.e.
2462 // containing culture-specific letters between the
2463 // standard Cyrillic sequence.
2465 // We can't use UCA here; it has different sorting.
2466 char [] orderedCyrillic = new char [] {
2467 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2468 '\u0452', // DJE for Serbocroatian
2469 '\u0435',
2470 '\u0454', // IE for Ukrainian
2471 '\u0436', '\u0437',
2472 '\u0455', // DZE
2473 '\u0438',
2474 '\u0456', // Byelorussian-Ukrainian I
2475 '\u0457', // YI
2476 '\u0439',
2477 '\u0458', // JE
2478 '\u043A', '\u043B',
2479 '\u0459', // LJE
2480 '\u043C', '\u043D',
2481 '\u045A', // NJE
2482 '\u043E',
2483 // 4E9 goes here.
2484 '\u043F', '\u0440', '\u0441', '\u0442',
2485 '\u045B', // TSHE for Serbocroatian
2486 '\u0443',
2487 '\u045E', // Short U for Byelorussian
2488 '\u04B1', // Straight U w/ stroke (diacritical!)
2489 '\u0444', '\u0445', '\u0446', '\u0447',
2490 '\u045F', // DZHE
2491 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2492 '\u044D', '\u044E', '\u044F'};
2494 // For some characters here is a map to basic cyrillic
2495 // letters. See UnicodeData.txt character names for
2496 // the sources. Here I simply declare an equiv. array.
2497 // The content characters are map from U+490(,491),
2498 // skipping small letters.
2499 char [] cymap_src = new char [] {
2500 '\u0433', '\u0433', '\u0433', '\u0436',
2501 '\u0437', '\u043A', '\u043A', '\u043A',
2502 '\u043A', '\u043D', '\u043D', '\u043F',
2503 '\u0445', '\u0441', '\u0442', '\u0443',
2504 '\u0443', '\u0445', '\u0446', '\u0447',
2505 '\u0447', '\u0432', '\u0435', '\u0435',
2506 '\u0406', '\u0436', '\u043A', '\u043D',
2507 '\u0447', '\u0435'};
2509 fillIndex [0x10] = 0x8D;
2510 for (int i = 0x0460; i < 0x0481; i++) {
2511 if (Char.IsLetter ((char) i)) {
2512 if (i == 0x0476)
2513 // U+476/477 have the same
2514 // primary weight as U+474/475.
2515 fillIndex [0x10] -= 3;
2516 AddLetterMap ((char) i, 0x10, 3);
2520 fillIndex [0x10] = 0x6;
2521 for (int i = 0; i < orderedCyrillic.Length; i++) {
2522 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2523 if (!IsIgnorable ((int) c) &&
2524 Char.IsLetter (c) &&
2525 !map [c].Defined) {
2526 AddLetterMap (c, 0x10, 0);
2527 fillIndex [0x10] += 3;
2531 // NFKD
2532 for (int i = 0x0401; i <= 0x045F; i++)
2533 FillLetterNFKD (i, false, false);
2535 for (int i = 0; i < cymap_src.Length; i++) {
2536 char c = cymap_src [i];
2537 fillIndex [0x10] = map [c].Level1;
2538 int c2 = 0x0490 + i * 2;
2539 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2542 // Armenian
2543 fillIndex [0x11] = 0x3;
2544 fillIndex [0x1] = 0x98;
2545 for (int i = 0x0531; i < 0x0586; i++) {
2546 if (i == 0x0559 || i == 0x55A)
2547 AddCharMap ((char) i, 1, 1);
2548 if (Char.IsLetter ((char) i))
2549 AddLetterMap ((char) i, 0x11, 1);
2552 // Hebrew
2553 // -Letters
2554 fillIndex [0x12] = 0x2;
2555 for (int i = 0x05D0; i < 0x05FF; i++)
2556 if (Char.IsLetter ((char) i)) {
2557 if (isUppercase [i]) {
2558 fillIndex [0x12]--;
2559 AddLetterMap ((char) i, 0x12, 2);
2561 else
2562 AddLetterMap ((char) i, 0x12, 1);
2564 // -Accents
2565 fillIndex [0x1] = 0x3;
2566 for (int i = 0x0591; i <= 0x05C2; i++) {
2567 if (i == 0x05A3 || i == 0x05BB)
2568 fillIndex [0x1]++;
2569 if (i != 0x05BE)
2570 AddCharMap ((char) i, 0x1, 1);
2573 // Arabic
2574 fillIndex [0x1] = 0x8E;
2575 fillIndex [0x13] = 0x3;
2576 for (int i = 0x0621; i <= 0x064A; i++) {
2577 // Abjad
2578 if (Char.GetUnicodeCategory ((char) i)
2579 != UnicodeCategory.OtherLetter) {
2580 // FIXME: arabic nonspacing marks are
2581 // in different order.
2582 AddCharMap ((char) i, 0x1, 1);
2583 continue;
2585 // map [i] = new CharMapEntry (0x13,
2586 // (byte) arabicLetterPrimaryValues [i], 1);
2587 fillIndex [0x13] =
2588 (byte) arabicLetterPrimaryValues [i];
2589 byte formDiacritical = 8; // default
2590 // SPECIAL CASES:
2591 switch (i) {
2592 case 0x0622: formDiacritical = 9; break;
2593 case 0x0623: formDiacritical = 0xA; break;
2594 case 0x0624: formDiacritical = 5; break;
2595 case 0x0625: formDiacritical = 0xB; break;
2596 case 0x0626: formDiacritical = 7; break;
2597 case 0x0649: formDiacritical = 5; break;
2598 case 0x064A: formDiacritical = 7; break;
2600 // AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2601 AddArabicCharMap ((char) i, 0x13, 1, formDiacritical);
2603 for (int i = 0x0670; i < 0x0673; i++)
2604 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2605 fillIndex [0x13] = 0x84;
2606 for (int i = 0x0674; i < 0x06D6; i++)
2607 if (Char.IsLetter ((char) i))
2608 AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2610 // Devanagari
2612 // FIXME: this could be fixed in more decent way
2613 for (int i = 0x0958; i <= 0x095F; i++)
2614 diacritical [i] = 8;
2616 // FIXME: it does seem straight codepoint mapping.
2617 fillIndex [0x14] = 04;
2618 for (int i = 0x0901; i < 0x0905; i++)
2619 if (!IsIgnorable (i))
2620 AddLetterMap ((char) i, 0x14, 2);
2621 fillIndex [0x14] = 0xB;
2622 for (int i = 0x0905; i < 0x093A; i++) {
2623 if (i == 0x0928)
2624 AddCharMap ('\u0929', 0x14, 0, 8);
2625 if (i == 0x0930)
2626 AddCharMap ('\u0931', 0x14, 0, 8);
2627 if (i == 0x0933)
2628 AddCharMap ('\u0934', 0x14, 0, 8);
2629 if (Char.IsLetter ((char) i))
2630 AddLetterMap ((char) i, 0x14, 4);
2631 if (i == 0x090B)
2632 AddCharMap ('\u0960', 0x14, 4);
2633 if (i == 0x090C)
2634 AddCharMap ('\u0961', 0x14, 4);
2636 fillIndex [0x14] = 0xDA;
2637 for (int i = 0x093E; i < 0x0945; i++)
2638 if (!IsIgnorable (i))
2639 AddLetterMap ((char) i, 0x14, 2);
2640 fillIndex [0x14] = 0xEC;
2641 for (int i = 0x0945; i < 0x094F; i++)
2642 if (!IsIgnorable (i))
2643 AddLetterMap ((char) i, 0x14, 2);
2645 // Bengali
2646 // -Letters
2647 fillIndex [0x15] = 02;
2648 for (int i = 0x0980; i < 0x9FF; i++) {
2649 if (IsIgnorable (i))
2650 continue;
2651 if (i == 0x09E0)
2652 fillIndex [0x15] = 0x3B;
2653 switch (Char.GetUnicodeCategory ((char) i)) {
2654 case UnicodeCategory.NonSpacingMark:
2655 case UnicodeCategory.DecimalDigitNumber:
2656 case UnicodeCategory.OtherNumber:
2657 continue;
2659 AddLetterMap ((char) i, 0x15, 1);
2661 // -Signs
2662 fillIndex [0x1] = 0x3;
2663 for (int i = 0x0981; i < 0x0A00; i++)
2664 if (Char.GetUnicodeCategory ((char) i) ==
2665 UnicodeCategory.NonSpacingMark)
2666 AddCharMap ((char) i, 0x1, 1);
2668 // Gurmukhi. orderedGurmukhi is from UCA
2669 // FIXME: it does not look equivalent to UCA.
2670 fillIndex [0x16] = 04;
2671 fillIndex [0x1] = 3;
2672 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2673 char c = orderedGurmukhi [i];
2674 if (IsIgnorable ((int) c))
2675 continue;
2676 if (IsIgnorableNonSpacing (c)) {
2677 AddLetterMap (c, 0x1, 1);
2678 continue;
2680 if (c == '\u0A3C' || c == '\u0A4D' ||
2681 '\u0A66' <= c && c <= '\u0A71')
2682 continue;
2683 // SPECIAL CASES
2684 byte shift = 4;
2685 switch (c) {
2686 case '\u0A33': case '\u0A36': case '\u0A16':
2687 case '\u0A17': case '\u0A5B': case '\u0A5E':
2688 shift = 0;
2689 break;
2691 if (c == '\u0A3E') // Skip
2692 fillIndex [0x16] = 0xC0;
2693 AddLetterMap (c, 0x16, shift);
2696 // Gujarati. orderedGujarati is from UCA
2697 fillIndex [0x17] = 0x4;
2698 // nonspacing marks
2699 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2700 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2701 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2702 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2703 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2704 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2705 // letters go first.
2706 for (int i = 0; i < orderedGujarati.Length; i++) {
2707 // SPECIAL CASE
2708 char c = orderedGujarati [i];
2709 if (Char.IsLetter (c)) {
2710 // SPECIAL CASES
2711 if (c == '\u0AB3' || c == '\u0A32')
2712 continue;
2713 if (c == '\u0A33') {
2714 AddCharMap ('\u0A32', 0x17, 0);
2715 AddCharMap ('\u0A33', 0x17, 4, 4);
2716 continue;
2718 if (c == '\u0A8B')
2719 AddCharMap ('\u0AE0', 0x17, 0, 5);
2720 AddCharMap (c, 0x17, 4);
2722 if (c == '\u0AB9')
2723 AddCharMap ('\u0AB3', 0x17, 6);
2726 // non-letters
2727 byte gujaratiShift = 4;
2728 fillIndex [0x17] = 0xC0;
2729 for (int i = 0; i < orderedGujarati.Length; i++) {
2730 char c = orderedGujarati [i];
2731 if (fillIndex [0x17] == 0xCC)
2732 gujaratiShift = 3;
2733 if (!Char.IsLetter (c)) {
2734 // SPECIAL CASES
2735 if (c == '\u0A82')
2736 AddCharMap ('\u0A81', 0x17, 2);
2737 if (c == '\u0AC2')
2738 fillIndex [0x17]++;
2739 AddLetterMap (c, 0x17, gujaratiShift);
2743 // Oriya
2744 fillIndex [0x1] = 03;
2745 fillIndex [0x18] = 02;
2746 for (int i = 0x0B00; i < 0x0B7F; i++) {
2747 switch (Char.GetUnicodeCategory ((char) i)) {
2748 case UnicodeCategory.NonSpacingMark:
2749 case UnicodeCategory.DecimalDigitNumber:
2750 AddLetterMap ((char) i, 0x1, 1);
2751 continue;
2753 AddLetterMapCore ((char) i, 0x18, 1, 0, true);
2756 // Tamil
2757 fillIndex [0x19] = 2;
2758 AddCharMap ('\u0BD7', 0x19, 0);
2759 fillIndex [0x19] = 0xA;
2760 // vowels
2761 for (int i = 0x0B82; i <= 0x0B94; i++)
2762 if (!IsIgnorable ((char) i))
2763 AddCharMap ((char) i, 0x19, 2);
2764 // special vowel
2765 fillIndex [0x19] = 0x28;
2766 // The array for Tamil consonants is a constant.
2767 // Windows have almost similar sequence to TAM from
2768 // tamilnet but a bit different in Grantha.
2769 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2770 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2771 // combining marks
2772 fillIndex [0x19] = 0x82;
2773 for (int i = 0x0BBE; i < 0x0BCD; i++)
2774 if (Char.GetUnicodeCategory ((char) i) ==
2775 UnicodeCategory.SpacingCombiningMark
2776 || i == 0x0BC0)
2777 AddLetterMap ((char) i, 0x19, 2);
2779 // Telugu
2780 fillIndex [0x1A] = 0x4;
2781 for (int i = 0x0C00; i < 0x0C62; i++) {
2782 if (i == 0x0C55 || i == 0x0C56)
2783 continue; // skip
2784 AddCharMap ((char) i, 0x1A, 3);
2785 char supp = (i == 0x0C0B) ? '\u0C60':
2786 i == 0x0C0C ? '\u0C61' : char.MinValue;
2787 if (supp == char.MinValue)
2788 continue;
2789 AddCharMap (supp, 0x1A, 3);
2792 // Kannada
2793 fillIndex [0x1B] = 4;
2794 for (int i = 0x0C80; i < 0x0CE5; i++) {
2795 if (i == 0x0CD5 || i == 0x0CD6)
2796 continue; // ignore
2797 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2798 continue; // shift after 0xCB9
2799 AddCharMap ((char) i, 0x1B, 3);
2800 if (i == 0x0CB9) {
2801 // SPECIAL CASES: but why?
2802 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2803 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2804 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2806 if (i == 0x0CB2)
2807 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2810 // Malayalam
2811 fillIndex [0x1C] = 2;
2812 fillIndex [0x1] = 3;
2813 for (int i = 0x0D02; i < 0x0D61; i++) {
2814 // FIXME: I avoided MSCompatUnicodeTable usage
2815 // here (it results in recursion). So check if
2816 // using NonSpacingMark makes sense or not.
2817 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2818 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2819 AddCharMap ((char) i, 0x1C, 1);
2820 else if (!IsIgnorable ((char) i))
2821 AddCharMap ((char) i, 1, 1);
2824 // Thai ... note that it breaks 0x1E wall after E2B!
2825 // Also, all Thai characters have level 2 value 3.
2826 fillIndex [0x1E] = 2;
2827 fillIndex [0x1] = 3;
2828 for (int i = 0xE40; i <= 0xE44; i++)
2829 AddCharMap ((char) i, 0x1E, 1, 3);
2830 for (int i = 0xE01; i < 0xE2B; i++)
2831 AddCharMap ((char) i, 0x1E, 6, 3);
2832 fillIndex [0x1F] = 5;
2833 for (int i = 0xE2B; i < 0xE30; i++)
2834 AddCharMap ((char) i, 0x1F, 6, 3);
2835 fillIndex [0x1F] = 0x1E;
2836 for (int i = 0xE30; i < 0xE3B; i++)
2837 AddCharMap ((char) i, 0x1F, 1, 3);
2838 // Some Thai characters remain.
2839 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2840 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2841 foreach (char c in specialThai)
2842 AddCharMap (c, 0x1F, 1, 3);
2844 for (int i = 0xE00; i < 0xE80; i++)
2845 if (Char.GetUnicodeCategory ((char) i) ==
2846 UnicodeCategory.NonSpacingMark)
2847 AddCharMap ((char) i, 1, 1);
2849 // Lao
2850 fillIndex [0x1F] = 2;
2851 fillIndex [0x1] = 3;
2852 for (int i = 0xE80; i < 0xEDF; i++) {
2853 if (IsIgnorable ((char) i))
2854 continue;
2855 else if (Char.IsLetter ((char) i))
2856 AddCharMap ((char) i, 0x1F, 1);
2857 else if (Char.GetUnicodeCategory ((char) i) ==
2858 UnicodeCategory.NonSpacingMark)
2859 AddCharMap ((char) i, 1, 1);
2862 // Georgian. orderedGeorgian is from UCA DUCET.
2863 fillIndex [0x21] = 5;
2864 for (int i = 0; i < orderedGeorgian.Length; i++) {
2865 char c = orderedGeorgian [i];
2866 if (map [(int) c].Defined)
2867 continue;
2868 AddCharMap (c, 0x21, 0);
2869 if (c < '\u10F6')
2870 AddCharMap ((char) (c - 0x30), 0x21, 0);
2871 fillIndex [0x21] += 5;
2874 // Japanese Kana.
2875 fillIndex [0x22] = 2;
2876 int kanaOffset = 0x3041;
2877 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2879 for (int gyo = 0; gyo < 9; gyo++) {
2880 for (int dan = 0; dan < 5; dan++) {
2881 if (gyo == 7 && dan % 2 == 1) {
2882 // 'ya'-gyo
2883 fillIndex [0x22]++;
2884 kanaOffset -= 2; // There is no space for yi and ye.
2885 continue;
2887 int cp = kanaOffset + dan * kanaLines [gyo];
2888 // small lines (a-gyo, ya-gyo)
2889 if (gyo == 0 || gyo == 7) {
2890 AddKanaMap (cp, 1); // small
2891 AddKanaMap (cp + 1, 1);
2893 else
2894 AddKanaMap (cp, kanaLines [gyo]);
2895 fillIndex [0x22]++;
2897 if (cp == 0x30AB) {
2898 // add small 'ka' (before normal one)
2899 AddKanaMap (0x30F5, 1);
2900 kanaOffset++;
2902 if (cp == 0x30B1) {
2903 // add small 'ke' (before normal one)
2904 AddKanaMap (0x30F6, 1);
2905 kanaOffset++;
2907 if (cp == 0x3061) {
2908 // add small 'Tsu' (before normal one)
2909 AddKanaMap (0x3063, 1);
2910 kanaOffset++;
2913 fillIndex [0x22] += 3;
2914 kanaOffset += 5 * kanaLines [gyo];
2917 // Wa-gyo is almost special, so I just manually add.
2918 AddLetterMap ((char) 0x308E, 0x22, 0);
2919 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2920 AddLetterMap ((char) 0x308F, 0x22, 0);
2921 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2922 fillIndex [0x22]++;
2923 AddLetterMap ((char) 0x3090, 0x22, 0);
2924 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2925 fillIndex [0x22] += 2;
2926 // no "Wu" in Japanese.
2927 AddLetterMap ((char) 0x3091, 0x22, 0);
2928 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2929 fillIndex [0x22]++;
2930 AddLetterMap ((char) 0x3092, 0x22, 0);
2931 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2932 // Nn
2933 fillIndex [0x22] = 0x80;
2934 AddLetterMap ((char) 0x3093, 0x22, 0);
2935 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2937 map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2938 map [0x30A6].Level1, 3);// voiced hiragana U
2939 map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2940 map [0x30A6].Level1, 3);// voiced katakana U
2942 map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2943 map [0x30AB].Level1, 0);// small katakana Ka
2944 map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2945 map [0x30B1].Level1, 0);// small katakana Ke
2946 // voiced Wa lines
2947 for (int i = 0x30F7; i < 0x30FB; i++)
2948 map [i] = new CharMapEntry (map [i - 8].Category,
2949 map [i - 8].Level1,
2952 // JIS Japanese square chars.
2953 fillIndex [0x22] = 0x97;
2954 jisJapanese.Sort (JISComparer.Instance);
2955 foreach (JISCharacter j in jisJapanese)
2956 if (0x3300 <= j.CP && j.CP <= 0x3357)
2957 AddCharMap ((char) j.CP, 0x22, 1);
2958 // non-JIS Japanese square chars.
2959 nonJisJapanese.Sort (NonJISComparer.Instance);
2960 foreach (NonJISCharacter j in nonJisJapanese)
2961 AddCharMap ((char) j.CP, 0x22, 1);
2963 // Bopomofo
2964 fillIndex [0x23] = 0x02;
2965 for (int i = 0x3105; i <= 0x312C; i++)
2966 AddCharMap ((char) i, 0x23, 1);
2968 // Estrangela: ancient Syriac
2969 fillIndex [0x24] = 0x0B;
2970 // FIXME: is 0x71E really alternative form?
2971 ArrayList syriacAlternatives = new ArrayList (
2972 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2973 for (int i = 0x0710; i <= 0x072C; i++) {
2974 if (i == 0x0711) // NonSpacingMark
2975 continue;
2976 if (syriacAlternatives.Contains (i))
2977 continue;
2978 AddCharMap ((char) i, 0x24, 4);
2979 // FIXME: why?
2980 if (i == 0x721)
2981 fillIndex [0x24]++;
2983 foreach (int cp in syriacAlternatives)
2984 map [cp] = new CharMapEntry (0x24,
2985 (byte) (map [cp - 1].Level1 + 2),
2987 // FIXME: Syriac NonSpacingMark should go here.
2989 // Thaana
2990 // FIXME: it turned out that it does not look like UCA
2991 fillIndex [0x24] = 0x6E;
2992 fillIndex [0x1] = 0xAC;
2993 for (int i = 0; i < orderedThaana.Length; i++) {
2994 char c = orderedThaana [i];
2995 if (IsIgnorableNonSpacing ((int) c))
2996 AddCharMap (c, 1, 1);
2997 AddCharMap (c, 0x24, 2);
2998 if (c == '\u0782') // SPECIAL CASE: why?
2999 fillIndex [0x24] += 2;
3001 #endregion
3003 // FIXME: Add more culture-specific letters (that are
3004 // not supported in Windows collation) here.
3006 // Surrogate ... they are computed.
3008 #region Hangul
3009 // Hangul.
3011 // Unlike UCA Windows Hangul sequence mixes Jongseong
3012 // with Choseong sequence as well as Jungseong,
3013 // adjusted to have the same primary weight for the
3014 // same base character. So it is impossible to compute
3015 // those sort keys.
3017 // Here I introduce an ordered sequence of mixed
3018 // 'commands' and 'characters' that is similar to
3019 // LDML text:
3020 // - ',' increases primary weight.
3021 // - [A B] means a range, increasing index
3022 // - {A B} means a range, without increasing index
3023 // - '=' is no operation (it means the characters
3024 // of both sides have the same weight).
3025 // - '>' inserts a Hangul Syllable block that
3026 // contains 0x24C (0x15 * 0x1C) characters.
3027 // - '<' decreases the index
3028 // - '0'-'9' means skip count
3029 // - whitespaces are ignored
3032 string hangulSequence =
3033 "\u1100=\u11A8 > \u1101=\u11A9 >"
3034 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
3035 + "<{\u1113 \u1116}, \u3165,"
3036 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
3037 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
3038 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
3039 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
3040 + "[\u11D1 \u11D2], \u11B2,"
3041 + "[\u11D3 \u11D5], \u11B3,"
3042 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
3043 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
3044 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
3045 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
3046 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
3047 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
3048 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
3049 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
3050 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
3051 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
3052 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
3053 + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
3054 + "\u11F1,, \u11F2,,,"
3055 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
3056 + "<\u114D, \u110D,, >"
3057 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
3058 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
3059 + "\u1110=\u11C0 > \u1111=\u11C1 >"
3060 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
3061 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
3062 + "[\u11F5 \u11F8]"
3065 byte hangulCat = 0x52;
3066 fillIndex [hangulCat] = 0x2;
3068 int syllableBlock = 0;
3069 for (int n = 0; n < hangulSequence.Length; n++) {
3070 char c = hangulSequence [n];
3071 int start, end;
3072 if (Char.IsWhiteSpace (c))
3073 continue;
3074 switch (c) {
3075 case '=':
3076 break; // NOP
3077 case ',':
3078 IncrementSequentialIndex (ref hangulCat);
3079 break;
3080 case '<':
3081 if (fillIndex [hangulCat] == 2)
3082 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
3083 fillIndex [hangulCat]--;
3084 break;
3085 case '>':
3086 IncrementSequentialIndex (ref hangulCat);
3087 for (int l = 0; l < 0x15; l++)
3088 for (int v = 0; v < 0x1C; v++) {
3089 AddCharMap (
3090 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
3091 IncrementSequentialIndex (ref hangulCat);
3093 syllableBlock++;
3094 break;
3095 case '[':
3096 start = hangulSequence [n + 1];
3097 end = hangulSequence [n + 3];
3098 for (int i = start; i <= end; i++) {
3099 AddCharMap ((char) i, hangulCat, 0);
3100 if (end > i)
3101 IncrementSequentialIndex (ref hangulCat);
3103 n += 4; // consumes 5 characters for this operation
3104 break;
3105 case '{':
3106 start = hangulSequence [n + 1];
3107 end = hangulSequence [n + 3];
3108 for (int i = start; i <= end; i++)
3109 AddCharMap ((char) i, hangulCat, 0);
3110 n += 4; // consumes 5 characters for this operation
3111 break;
3112 default:
3113 AddCharMap (c, hangulCat, 0);
3114 break;
3118 // Some Jamo NFKD.
3119 for (int i = 0x3200; i < 0x3300; i++) {
3120 if (IsIgnorable (i) || map [i].Defined)
3121 continue;
3122 int ch = 0;
3123 // w/ bracket
3124 if (decompLength [i] == 4 &&
3125 decompValues [decompIndex [i]] == '(')
3126 ch = decompIndex [i] + 1;
3127 // circled
3128 else if (decompLength [i] == 2 &&
3129 decompValues [decompIndex [i] + 1] == '\u1161')
3130 ch = decompIndex [i];
3131 else if (decompLength [i] == 1)
3132 ch = decompIndex [i];
3133 else
3134 continue;
3135 ch = decompValues [ch];
3136 if (ch < 0x1100 || 0x1200 < ch &&
3137 ch < 0xAC00 || 0xD800 < ch)
3138 continue;
3140 // SPECIAL CASE ?
3141 int offset = i < 0x3260 ? 1 : 0;
3142 if (0x326E <= i && i <= 0x3273)
3143 offset = 1;
3145 map [i] = new CharMapEntry (map [ch].Category,
3146 (byte) (map [ch].Level1 + offset),
3147 map [ch].Level2);
3148 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
3152 #endregion
3154 // Letterlike characters and CJK compatibility square
3155 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
3156 int [] counts = new int ['Z' - 'A' + 1];
3157 char [] namedChars = new char [sortableCharNames.Count];
3158 int nCharNames = 0;
3159 foreach (DictionaryEntry de in sortableCharNames) {
3160 counts [((string) de.Value) [0] - 'A']++;
3161 namedChars [nCharNames++] = (char) ((int) de.Key);
3163 nCharNames = 0; // reset
3164 for (int a = 0; a < counts.Length; a++) {
3165 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
3166 for (int i = 0; i < counts [a]; i++)
3167 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
3168 AddCharMap (namedChars [nCharNames++], 0xE, 1);
3171 // CJK unified ideograph.
3172 byte cjkCat = 0x9E;
3173 fillIndex [cjkCat] = 0x2;
3174 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
3175 if (!IsIgnorable (cp))
3176 AddCharMapGroupCJK ((char) cp, ref cjkCat);
3177 // CJK Extensions goes here.
3178 // LAMESPEC: With this Windows style CJK layout, it is
3179 // impossible to add more CJK ideograph i.e. 0x9FA6-
3180 // 0x9FBB can never be added w/o breaking compat.
3181 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
3182 if (!IsIgnorable (cp))
3183 AddCharMapGroupCJK ((char) cp, ref cjkCat);
3185 // PrivateUse ... computed.
3186 // remaining Surrogate ... computed.
3188 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
3189 // non-alphanumeric ASCII except for: + - < = > '
3190 for (int i = 0x21; i < 0x7F; i++) {
3191 // SPECIAL CASE: 02C6 looks regarded as
3192 // equivalent to '^', which does not conform
3193 // to Unicode standard character database.
3194 if (i == 0x005B)
3195 AddCharMap ('\u2045', 0x7, 0, 0x1C);
3196 if (i == 0x005D)
3197 AddCharMap ('\u2046', 0x7, 0, 0x1C);
3198 if (i == 0x005E)
3199 AddCharMap ('\u02C6', 0x7, 0, 3);
3200 if (i == 0x0060)
3201 AddCharMap ('\u02CB', 0x7, 0, 3);
3203 if (Char.IsLetterOrDigit ((char) i)
3204 || "+-<=>'".IndexOf ((char) i) >= 0)
3205 continue; // they are not added here.
3207 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3208 // Insert 3001 after ',' and 3002 after '.'
3209 if (i == 0x2C)
3210 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
3211 else if (i == 0x2E)
3212 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
3213 else if (i == 0x3A)
3214 AddCharMap ('\uFE30', 0x7, 1, 0);
3216 #endregion
3218 #region 07 - Punctuations and something else
3219 for (int i = 0xA0; i < char.MaxValue; i++) {
3220 if (IsIgnorable (i))
3221 continue;
3223 // FIXME: actually those reset should not be
3224 // done but here I put for easy goal.
3225 if (i == 0x05C3)
3226 fillIndex [0x7]++;
3227 if (i == 0x0700)
3228 fillIndex [0x7] = 0xE2;
3229 if (i == 0x2016)
3230 fillIndex [0x7] = 0x77;
3231 if (i == 0x3008)
3232 fillIndex [0x7] = 0x93;
3234 if (0x02C8 <= i && i <= 0x02CD)
3235 continue; // nonspacing marks
3237 // SPECIAL CASE: maybe they could be allocated
3238 // dummy NFKD mapping and no special processing
3239 // would be required here.
3240 if (i == 0x00AF)
3241 AddCharMap ('\u02C9', 0x7, 0, 3);
3242 if (i == 0x00B4)
3243 AddCharMap ('\u02CA', 0x7, 0, 3);
3244 if (i == 0x02C7)
3245 AddCharMap ('\u02D8', 0x7, 0, 3);
3247 // SPECIAL CASES:
3248 switch (i) {
3249 case 0xAB: // 08
3250 case 0xB7: // 0A
3251 case 0xBB: // 08
3252 case 0x02B9: // 01
3253 case 0x02BA: // 01
3254 case 0x2329: // 09
3255 case 0x232A: // 09
3256 continue;
3259 switch (Char.GetUnicodeCategory ((char) i)) {
3260 case UnicodeCategory.OtherPunctuation:
3261 case UnicodeCategory.ClosePunctuation:
3262 case UnicodeCategory.OpenPunctuation:
3263 case UnicodeCategory.ConnectorPunctuation:
3264 case UnicodeCategory.InitialQuotePunctuation:
3265 case UnicodeCategory.FinalQuotePunctuation:
3266 case UnicodeCategory.ModifierSymbol:
3267 // SPECIAL CASES: // 0xA
3268 if (0x2020 <= i && i <= 0x2031)
3269 continue;
3270 if (i == 0x3003) // added later
3271 continue;
3272 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3273 break;
3274 default:
3275 if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3276 goto case UnicodeCategory.OtherPunctuation;
3277 break;
3281 // Control pictures
3282 // FIXME: it should not need to reset level 1, but
3283 // it's for easy goal.
3284 fillIndex [0x7] = 0xB6;
3285 for (int i = 0x2400; i <= 0x2424; i++)
3286 AddCharMap ((char) i, 0x7, 1, 0);
3288 // FIXME: what are they?
3289 AddCharMap ('\u3003', 0x7, 1);
3290 AddCharMap ('\u3006', 0x7, 1);
3291 AddCharMap ('\u02D0', 0x7, 1);
3292 AddCharMap ('\u10FB', 0x7, 1);
3293 AddCharMap ('\u0950', 0x7, 1);
3294 AddCharMap ('\u093D', 0x7, 1);
3295 AddCharMap ('\u0964', 0x7, 1);
3296 AddCharMap ('\u0965', 0x7, 1);
3297 AddCharMap ('\u0970', 0x7, 1);
3299 #endregion
3301 #region category 08 - symbols
3302 fillIndex [0x8] = 2;
3303 // Here Windows mapping is not straightforward. It is
3304 // not based on computation but seems manual sorting.
3305 AddCharMapGroup ('+', 0x8, 1, 0); // plus
3306 AddCharMapGroup ('\u2212', 0x8, 1); // minus
3307 AddCharMapGroup ('\u229D', 0x8, 1); // minus
3308 AddCharMapGroup ('\u2297', 0x8, 1); // mul
3309 AddCharMapGroup ('\u2044', 0x8, 1); // div
3310 AddCharMapGroup ('\u2215', 0x8, 0); // div
3311 AddCharMapGroup ('\u2298', 0x8, 1); // div slash
3312 AddCharMapGroup ('\u2217', 0x8, 0); // mul
3313 AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper
3314 AddCharMapGroup ('\u2218', 0x8, 0); // ring
3315 AddCharMapGroup ('\u229A', 0x8, 1); // ring
3316 AddCharMapGroup ('\u2219', 0x8, 0); // bullet
3317 AddCharMapGroup ('\u2299', 0x8, 1); // dot oper
3318 AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus
3319 AddCharMapGroup ('\u003C', 0x8, 1); // <
3320 AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation
3321 AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation
3323 for (int cp = 0; cp < 0x2300; cp++) {
3324 if (cp == 0xAC) // SPECIAL CASE: skip
3325 continue;
3326 if (cp == 0x200) {
3327 cp = 0x2200; // skip to 2200
3328 fillIndex [0x8] = 0x21;
3330 if (cp == 0x2295)
3331 fillIndex [0x8] = 0x3;
3332 if (cp == 0x22A2)
3333 fillIndex [0x8] = 0xAB;
3334 if (cp == 0x22B2)
3335 fillIndex [0x8] = 0xB9;
3336 if (!map [cp].Defined &&
3337 // Char.GetUnicodeCategory ((char) cp) ==
3338 // UnicodeCategory.MathSymbol)
3339 Char.IsSymbol ((char) cp))
3340 AddCharMapGroup ((char) cp, 0x8, 1);
3341 // SPECIAL CASES: no idea why Windows sorts as such
3342 switch (cp) {
3343 case 0x3E:
3344 AddCharMap ('\u227B', 0x8, 1, 0);
3345 AddCharMap ('\u22B1', 0x8, 1, 0);
3346 break;
3347 case 0xB1:
3348 AddCharMapGroup ('\u00AB', 0x8, 1);
3349 AddCharMapGroup ('\u226A', 0x8, 1);
3350 AddCharMapGroup ('\u00BB', 0x8, 1);
3351 AddCharMapGroup ('\u226B', 0x8, 1);
3352 break;
3353 case 0xF7:
3354 AddCharMap ('\u01C0', 0x8, 1, 0);
3355 AddCharMap ('\u01C1', 0x8, 1, 0);
3356 AddCharMap ('\u01C2', 0x8, 1, 0);
3357 break;
3360 #endregion
3362 #region Hack!
3364 // Characters w/ diacritical marks (NFKD)
3365 for (int i = 0; i <= char.MaxValue; i++) {
3366 if (map [i].Defined || IsIgnorable (i))
3367 continue;
3368 if (decompIndex [i] == 0)
3369 continue;
3371 int start = decompIndex [i];
3372 int primaryChar = decompValues [start];
3373 int secondary = diacritical [i];
3374 bool skip = false;
3375 int length = decompLength [i];
3376 // special processing for parenthesized ones.
3377 if (length == 3 &&
3378 decompValues [start] == '(' &&
3379 decompValues [start + 2] == ')') {
3380 primaryChar = decompValues [start + 1];
3381 length = 1;
3384 if (map [primaryChar].Level1 == 0)
3385 continue;
3387 for (int l = 1; l < length; l++) {
3388 int c = decompValues [start + l];
3389 if (map [c].Level1 != 0)
3390 skip = true;
3391 secondary += diacritical [c];
3393 if (skip)
3394 continue;
3395 map [i] = new CharMapEntry (
3396 map [primaryChar].Category,
3397 map [primaryChar].Level1,
3398 (byte) secondary);
3402 // Diacritical weight adjustment
3404 // Arabic Hamzah
3405 diacritical [0x624] = 0x5;
3406 diacritical [0x626] = 0x7;
3407 diacritical [0x622] = 0x9;
3408 diacritical [0x623] = 0xA;
3409 diacritical [0x625] = 0xB;
3410 diacritical [0x649] = 0x5; // 'alif maqs.uurah
3411 diacritical [0x64A] = 0x7; // Yaa'
3413 for (int i = 0; i < char.MaxValue; i++) {
3414 byte mod = 0;
3415 byte cat = map [i].Category;
3416 switch (cat) {
3417 case 0xE: // Latin diacritics
3418 case 0x22: // Japanese: circled characters
3419 mod = diacritical [i];
3420 break;
3421 case 0x13: // Arabic
3422 if (i == 0x0621)
3423 break; // 0
3424 if (diacritical [i] == 0 && decompLength [i] != 0)
3425 diacritical [i] = map [decompValues [decompIndex [i]]].Level2;
3426 if (diacritical [i] == 0 && i >= 0xFE8D)
3427 mod = 0x8; // default for arabic
3428 break;
3430 if (0x52 <= cat && cat <= 0x7F) // Hangul
3431 mod = diacritical [i];
3432 if (mod > 0)
3433 map [i] = new CharMapEntry (
3434 cat, map [i].Level1, mod);
3437 // FIXME: this is halfly hack but those NonSpacingMark
3438 // characters and still undefined are likely to
3439 // be nonspacing.
3440 for (int i = 0; i < char.MaxValue; i++) {
3441 if (map [i].Defined ||
3442 IsIgnorable (i))
3443 continue;
3444 switch (i) {
3445 // SPECIAL CASES.
3446 case 0x02B9:
3447 case 0x02BA:
3448 break;
3449 default:
3450 if (Char.GetUnicodeCategory ((char) i) !=
3451 UnicodeCategory.NonSpacingMark)
3452 continue;
3453 break;
3455 if (diacritical [i] != 0)
3456 map [i] = new CharMapEntry (1, 1, diacritical [i]);
3457 else
3458 AddCharMap ((char) i, 1, 1);
3461 #endregion
// Invariant-culture casing rules; FillLetterNFKD() uses them to find the
// uppercase form of a character.
TextInfo ti = CultureInfo.InvariantCulture.TextInfo;

// Gives the still-undefined character i a map entry derived either from its
// invariant-culture uppercase form or from its decomposition.
//
// checkUpper: when true and the uppercase form is in category 0xF, i copies
//   that form's level 1 and level 2 weights (under category 0xF) instead of
//   being decomposed. A character that is its own uppercase form is left
//   untouched in that case.
// greekRemap: when true, a decomposed mark whose level 2 weight is 0xC
//   contributes 3 instead of 0xC.
//
// Nothing is stored when i is already defined, when decompIndex [i] is 0, or
// when any decomposed character after the first is not in category 1.
private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap)
{
	if (map [i].Defined)
		return;

	int upper = (int) ti.ToUpper ((char) i);
	if (checkUpper && map [upper].Category == 0xF) {
		if (upper == i)
			return;
		FillLetterNFKD (upper, checkUpper, greekRemap);
		map [i] = new CharMapEntry (0xF, map [upper].Level1, map [upper].Level2);
		return;
	}

	int first = decompIndex [i];
	if (first == 0)
		return;
	int primary = decompValues [first];
	// The leading character is resolved first; this happens even if i
	// itself is abandoned by the category check below.
	FillLetterNFKD (primary, checkUpper, greekRemap);

	// Sum up the level 2 weights of the remaining decomposed characters
	// (kept in a byte, as in the table itself).
	byte markTotal = 0;
	for (int pos = 1; pos < decompLength [i]; pos++) {
		int mark = decompValues [first + pos];
		if (map [mark].Category != 1)
			return;
		bool remapped = greekRemap && map [mark].Level2 == 0xC;
		markTotal += remapped ? (byte) 3 : map [mark].Level2;
	}

	int level2;
	if (diacritical [i] != 0) {
		// An explicit diacritical weight always wins.
		level2 = diacritical [i];
	} else {
		level2 = map [primary].Level2;
		if (markTotal > 0) {
			if (level2 == 0)
				level2 = 2;
			level2 += markTotal;
		}
	}

	map [i] = new CharMapEntry (
		map [primary].Category,
		map [primary].Level1,
		(byte) level2);
}
// Advances fillIndex [hangulCat] by one. When the slot comes back around to
// 0 the category itself is advanced and the new category's slot starts
// again at 0x2.
private void IncrementSequentialIndex (ref byte hangulCat)
{
	fillIndex [hangulCat]++;
	if (fillIndex [hangulCat] != 0)
		return;

	// overflown: continue in the next category.
	hangulCat++;
	fillIndex [hangulCat] = 0x2;
}
// Pins fillIndex [category] to alphaWeight, then maps c through
// AddLetterMap() followed by every code point that latinMap lists for c
// (if it lists any).
private void AddAlphaMap (char c, byte category, byte alphaWeight)
{
	fillIndex [category] = alphaWeight;
	AddLetterMap (c, category, 0);

	ArrayList related = latinMap [c] as ArrayList;
	if (related != null) {
		foreach (int cp in related)
			AddLetterMap ((char) cp, category, 0);
	}
}
// Maps `voices` consecutive characters starting at i, each together with
// the character 0x60 above it, into category 0x22. The first pair gets
// level 2 weight 0; the n-th following pair gets n + 2.
private void AddKanaMap (int i, byte voices)
{
	for (int n = 0; n < voices; n++) {
		char hiragana = (char) (i + n);
		char katakana = (char) (hiragana + 0x60);
		byte level2 = n == 0 ? (byte) 0 : (byte) (n + 2);

		AddLetterMapCore (hiragana, 0x22, 0, level2, false);
		AddLetterMapCore (katakana, 0x22, 0, level2, false);
	}
}
// Shorthand for AddLetterMapCore() with level2 = 0 and deferLevel2 = true.
private void AddLetterMap (char c, byte category, byte updateCount)
{
	const byte noLevel2 = 0;
	AddLetterMapCore (c, category, updateCount, noLevel2, true);
}
// Maps a letter together with its related forms, in this order:
//   1. the character ToSmallForm() yields for c, when it differs from c
//      (mapped with updateCount, so it may advance the fill index);
//   2. the invariant-culture lowercase of c, when it differs from c and is
//      not defined yet (recursively, with updateCount 0);
//   3. c itself, unless it is ignorable or already defined.
// fillIndex [category] is advanced by updateCount only when step 3 ran.
private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
	char small = ToSmallForm (c);
	if (small != c)
		AddCharMapGroup (small, category, updateCount, level2, deferLevel2);

	char lower = Char.ToLower (c, CultureInfo.InvariantCulture);
	if (lower != c && !map [(int) lower].Defined)
		AddLetterMapCore (lower, category, 0, level2, deferLevel2);

	if (IsIgnorable ((int) c) || map [(int) c].Defined)
		return;

	AddCharMapGroup (c, category, 0, level2, deferLevel2);
	fillIndex [category] += updateCount;
}
// Shorthand for the four-argument AddCharMap() with alt = 0.
// Returns whatever that overload returns.
private bool AddCharMap (char c, byte category, byte increment)
{
	const byte noAlt = 0;
	return AddCharMap (c, category, increment, noAlt);
}
// Stores a map entry for c using the current fillIndex [category] value and
// then advances that value by `increment`.
//
// For category 1 the fill index becomes the entry's third component and
// `alt` the second; for every other category it is the other way round.
//
// Returns false (and changes nothing) when c is ignorable or already
// defined, true otherwise.
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
	int cp = (int) c;
	if (IsIgnorable (cp) || map [cp].Defined)
		return false; // do nothing

	if (category == 1)
		map [cp] = new CharMapEntry (category, alt, fillIndex [category]);
	else
		map [cp] = new CharMapEntry (category, fillIndex [category], alt);

	fillIndex [category] += increment;
	return true;
}
// AddCharMapGroup() adds characters to the table in the order below
// ("+" marks a point where the weight increases):
//	(<small> +)
//	itself
//	<fraction>
//	<full> | <super> | <sub>
//	<circle> | <wide> (| <narrow>)
//	+
//	(vertical +)
//
// level2 is fixed (does not increase).
//
// The decomposition kinds listed here are the ones that receive the same
// weight as the character itself, in the order they are looked up.
int [] sameWeightItems = {
	DecompositionFraction,
	DecompositionFull,
	DecompositionSuper,
	DecompositionSub,
	DecompositionCircle,
	DecompositionWide,
	DecompositionNarrow,
};
// Shorthand for the five-argument AddCharMapGroup() with level2 = 0 and
// deferLevel2 = true (level 2 weights are taken from `diacritical`).
private void AddCharMapGroup (char c, byte category, byte updateCount)
{
	const byte noLevel2 = 0;
	AddCharMapGroup (c, category, updateCount, noLevel2, true);
}
// Shorthand for the five-argument AddCharMapGroup() with deferLevel2 = false
// (the given level2 is used as is).
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
{
	const bool useGivenLevel2 = false;
	AddCharMapGroup (c, category, updateCount, level2, useGivenLevel2);
}
// Maps c and the variants that nfkdMap records for it. Order of insertion:
//   1. the <small> variant (added with updateCount),
//   2. c itself,
//   3. every variant whose kind is listed in sameWeightItems,
//   4. fillIndex [category] is advanced by updateCount,
//   5. the <vertical> variant (added with updateCount).
//
// When deferLevel2 is true the level 2 weight is read from `diacritical`
// rather than taken from the level2 argument; the value carries over from
// one step to the next exactly as the steps above are executed.
// Does nothing when c is already defined.
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
	if (map [(int) c].Defined)
		return;

	if (deferLevel2)
		level2 = diacritical [(int) c];

	// char.MinValue doubles as "no such variant".
	char smallForm = char.MinValue;
	char verticalForm = char.MinValue;
	Hashtable variants = (Hashtable) nfkdMap [(int) c];
	if (variants != null) {
		object boxed = variants [(byte) DecompositionSmall];
		if (boxed != null)
			smallForm = (char) ((int) boxed);
		boxed = variants [(byte) DecompositionVertical];
		if (boxed != null)
			verticalForm = (char) ((int) boxed);
	}

	// <small> updates index
	if (smallForm != char.MinValue) {
		if (deferLevel2 && level2 == 0)
			level2 = diacritical [smallForm];
		AddCharMap (smallForm, category, updateCount, level2);
	}

	// itself
	AddCharMap (c, category, 0, level2);

	// variants sharing the weight of c
	if (variants != null) {
		for (int n = 0; n < sameWeightItems.Length; n++) {
			object boxed = variants [(byte) sameWeightItems [n]];
			if (boxed == null)
				continue;
			if (deferLevel2)
				level2 = diacritical [(int) boxed];
			AddCharMap ((char) ((int) boxed), category, 0, level2);
		}
	}

	// update index here.
	fillIndex [category] += updateCount;

	if (verticalForm != char.MinValue) {
		if (deferLevel2 && level2 == 0)
			level2 = diacritical [verticalForm];
		AddCharMap (verticalForm, category, updateCount, level2);
	}
}
// Maps c at the current position of `category` and then steps the
// sequential index once (which may move on to the next category).
private void AddCharMapCJK (char c, ref byte category)
{
	AddCharMap (c, category, 0, 0);
	IncrementSequentialIndex (ref category);

	// Special. I wonder why but Windows skips 9E F9.
	bool atSkippedSlot = category == 0x9E && fillIndex [category] == 0xF9;
	if (atSkippedSlot)
		IncrementSequentialIndex (ref category);
}
// Maps the ideograph c via AddCharMapCJK(), then a few hard-wired enclosed
// forms that Windows places right after specific ideographs, and finally
// the variants nfkdMap records for c under keys 0..0x12.
private void AddCharMapGroupCJK (char c, ref byte category)
{
	AddCharMapCJK (c, ref category);

	// LAMESPEC: on Windows some of the CJK characters in 3200-32B0 are
	// incorrectly mapped; Chinese and Japanese Kanji are mixed when those
	// characters are ordered. They are placed by hand here and filtered
	// out of the generic loop below.
	switch (c) {
	case '\u5B78':
		AddCharMapCJK ('\u32AB', ref category);
		AddCharMapCJK ('\u323B', ref category);
		break;
	case '\u52DE':
		AddCharMapCJK ('\u3298', ref category);
		AddCharMapCJK ('\u3238', ref category);
		break;
	case '\u5BEB':
		AddCharMapCJK ('\u32A2', ref category);
		break;
	case '\u91AB':
		// Especially this mapping order totally does
		// not make sense to me.
		AddCharMapCJK ('\u32A9', ref category);
		break;
	}

	Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
	if (nfkd == null)
		return;

	for (byte weight = 0; weight <= 0x12; weight++) {
		object boxed = nfkd [weight];
		if (boxed == null)
			continue;
		int variant = (int) boxed;

		// Special: they are ignored in this area.
		// FIXME: check if it is sane
		bool compatIdeograph = 0xF900 <= variant && variant <= 0xFAD9;

		// Already placed by the switch above.
		bool placedByHand =
			variant == 0x32A2 || variant == 0x3298 ||
			variant == 0x3238 || variant == 0x32A9 ||
			variant == 0x323B || variant == 0x32AB;

		if (!compatIdeograph && !placedByHand)
			AddCharMapCJK ((char) variant, ref category);
	}
}
// For now it is only for 0x7 category.
//
// Maps c together with every still-undefined character whose decomposition
// consists of c alone. Three scans over the code points are made:
//   1. <small> variants go first; if at least one was seen,
//      fillIndex [category] is advanced by updateCount afterwards;
//   2. <sub>, <super>, <wide> and <narrow> variants follow (no advance);
//   3. after c itself (added with updateCount), every remaining variant of
//      any other decomposition kind is added with updateCount.
// Does nothing when c is already defined.
private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
{
	if (map [(int) c].Defined)
		return;

	// Process in advance (lower primary weight)
	bool sawSmall = false;
	for (int cp = 0; cp < char.MaxValue; cp++) {
		if (!IsPendingVariantOf (cp, c))
			continue;
		if (decompType [cp] == DecompositionSmall) {
			sawSmall = true;
			AddCharMap ((char) cp, category, 0, level2);
		}
	}
	if (sawSmall)
		fillIndex [category] = (byte) (fillIndex [category] + updateCount);

	// Identical weight
	for (int cp = 0; cp < char.MaxValue; cp++) {
		if (!IsPendingVariantOf (cp, c))
			continue;
		byte kind = decompType [cp];
		if (kind == DecompositionSub || kind == DecompositionSuper ||
		    kind == DecompositionWide || kind == DecompositionNarrow)
			AddCharMap ((char) cp, category, 0, level2);
	}

	// itself
	AddCharMap (c, category, updateCount, level2);

	// Since nfkdMap is problematic to have two or more
	// NFKD to an identical character, here I iterate all.
	for (int cp = 0; cp < char.MaxValue; cp++) {
		if (!IsPendingVariantOf (cp, c))
			continue;
		byte kind = decompType [cp];
		bool coveredAbove =
			kind == DecompositionWide || kind == DecompositionNarrow ||
			kind == DecompositionSmall || kind == DecompositionSub ||
			kind == DecompositionSuper;
		if (!coveredAbove)
			AddCharMap ((char) cp, category, updateCount, level2);
	}
}

// True when cp has no map entry yet and its decomposition is exactly the
// single character c.
private bool IsPendingVariantOf (int cp, char c)
{
	return !map [cp].Defined &&
		decompLength [cp] == 1 &&
		(int) (decompValues [decompIndex [cp]]) == (int) c;
}
// Maps c and then every character whose decomposition ends with c, all at
// the same fill index; fillIndex [category] is advanced by updateCount once
// at the very end.
private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2)
{
	// itself
	AddCharMap (c, category, 0, level2);

	// Since nfkdMap is problematic to have two or more
	// NFKD to an identical character, here I iterate all.
	for (int cp = 0; cp < char.MaxValue; cp++) {
		int length = decompLength [cp];
		if (length != 0) {
			int lastIndex = decompIndex [cp] + length - 1;
			if ((int) (decompValues [lastIndex]) == (int) c)
				AddCharMap ((char) cp, category, 0, level2);
		}
	}

	fillIndex [category] += updateCount;
}
// Shorthand for ToDecomposed (c, DecompositionSmall, false): a character
// whose decomposition kind is <small> is replaced by the first character of
// its decomposition; any other character is returned unchanged.
char ToSmallForm (char c)
{
	const bool takeLast = false;
	return ToDecomposed (c, DecompositionSmall, takeLast);
}
// Returns c unchanged unless its decomposition kind equals d. Otherwise
// returns one character of its decomposition: the last one when `tail` is
// true, the first one when it is false.
char ToDecomposed (char c, byte d, bool tail)
{
	int cp = (int) c;
	if (decompType [cp] != d)
		return c;

	int offset = tail ? decompLength [cp] - 1 : 0;
	return (char) decompValues [decompIndex [cp] + offset];
}
// True when some entry of jisJapanese has the code point cp.
bool ExistsJIS (int cp)
{
	bool found = false;
	foreach (JISCharacter entry in jisJapanese) {
		if (entry.CP == cp) {
			found = true;
			break;
		}
	}
	return found;
}
3832 #endregion
3834 #region Level 3 properties (Case/Width)
// Level 3 weight as stored in the table: 0 stays 0, any other raw weight
// from ComputeLevel3WeightRaw() is shifted up by 2.
private byte ComputeLevel3Weight (char c)
{
	byte raw = ComputeLevel3WeightRaw (c);
	if (raw == 0)
		return raw;
	return (byte) (raw + 2);
}
// Raw level 3 (case/width) weight of c; the caller adds 2 to non-zero
// results to obtain the sortkey value.
//
// A series of hard-coded ranges and single characters is checked first, in
// order, and the first match decides. Everything else gets a bit mask built
// from its decomposition kind and the isSmallCapital / isUppercase tables.
private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
{
	// CJK compat
	if (c >= '\u3192' && c <= '\u319F')
		return 0;

	// They have <narrow> NFKD mapping, and on Windows
	// those narrow characters are regarded as "normal",
	// thus those characters themselves are regarded as
	// "wide". grep "<narrow>" and you can pick them up
	// (ignoring Kana, Hangul etc.)
	switch (c) {
	case '\u3002': case '\u300C': case '\u300D': case '\u3001':
	case '\u30FB': case '\u2502': case '\u2190': case '\u2191':
	case '\u2192': case '\u2193': case '\u25A0': case '\u25CB':
		return 1;
	}

	// Korean
	if (c >= '\u11A8' && c <= '\u11F9')
		return 2;
	if (c >= '\uFFA0' && c <= '\uFFDC')
		return 4;
	if (c >= '\u3130' && c <= '\u3164')
		return 5;
	if (c >= '\u3165' && c <= '\u318E')
		return 4;

	// Georgian Capital letters
	if (c >= '\u10A0' && c <= '\u10C5')
		return 0x10;

	// numbers
	if (c >= '\u2776' && c <= '\u277F')
		return 4;
	if (c >= '\u2780' && c <= '\u2789')
		return 8;
	// Only U+278A..U+2793 can still reach this test; the lower part of
	// the range has returned above.
	if (c >= '\u2776' && c <= '\u2793')
		return 0xC;
	if (c >= '\u2160' && c <= '\u216F')
		return 0x10;
	if (c >= '\u2181' && c <= '\u2182')
		return 0x10;

	// Hebrew letter symbols (U+2135..U+2138)
	if (c >= '\u2135' && c <= '\u2138')
		return 4;

	// Arabic presentation forms.
	// I believe that Windows has a bug on setting level 3
	// weight here. NFKD results in different values.
	if (c > '\uFE80' && c < '\uFF00') {
		// 2(Isolated)/8(Final)/0x18(Medial)
		byte form = decompType [(int) c];
		if (form == DecompositionIsolated)
			return 0; // 2;
		if (form == DecompositionFinal)
			return 8;
		if (form == DecompositionMedial)
			return 0x18;
		if (form == DecompositionInitial)
			return 0x10;
		// any other kind falls through to the checks below.
	}

	// I have no idea why those symbols have level 3 weight
	if (c == '\u2104' || c == '\u212B')
		return 0x18;
	// U+212B itself never gets here (handled just above).
	if (c >= '\u211E' && c <= '\u212B')
		return 0x10;

	// actually I dunno the reason why they have weights.
	switch (c) {
	case '\u01BC':
		return 0x10;
	case '\u06A9':
		return 0x20;
	case '\u06AA':
		return 0x28;
	// Gurmukhi
	case '\u0A39': case '\u0A59': case '\u0A5A':
	case '\u0A5B': case '\u0A5E':
		return 0x10;
	}

	// Generic case: start from a per-character base value and OR the
	// width/position and case bits into it.
	byte weight = 0;
	if (c == '\u03C2')
		weight = 8;
	else if (c == '\uFE42')
		weight = 0xA;

	// misc
	byte kind = decompType [(int) c];
	if (kind == DecompositionWide		// <wide>
	    || kind == DecompositionSub		// <sub>
	    || kind == DecompositionSuper)	// <super>
		weight |= kind;
	if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
		weight |= 8;
	if (isUppercase [(int) c]) // DerivedCoreProperties
		weight |= 0x10;

	return weight;
}
3960 #endregion
3962 #region IsIgnorable
3964 static bool IsIgnorable (int i)
3966 if (unicodeAge [i] >= 3.1)
3967 return true;
3968 switch (char.GetUnicodeCategory ((char) i)) {
3969 case UnicodeCategory.OtherNotAssigned:
3970 case UnicodeCategory.Format:
3971 return true;
3973 return false;
// Reports whether code point i (0..0xFFFF) is treated as ignorable.
//
// The decision is made in three steps, the first match winning:
//   1. individually listed code points (both "ignorable" and
//      "not ignorable despite the ranges below");
//   2. membership in one of the ranges of ignorableRanges;
//   3. the Unicode category: Format and OtherNotAssigned are ignorable,
//      everything else (including PrivateUse and Surrogate) is not.
//
// FIXME: In the future use DerivedAge.txt to examine character
// versions and set those ones that have higher version than
// 1.0 as ignorable.
static bool IsIgnorable (int i)
{
	switch (i) {
	case 0x0000:
	// I guess, those characters are added between
	// Unicode 1.0 (LCMapString) and Unicode 3.1
	// (UnicodeCategory), so they used to be
	// something like OtherNotAssigned as of Unicode 1.1.
	case 0x02DF: case 0x0387:
	case 0x03D7: case 0x03D8: case 0x03D9:
	case 0x03F3: case 0x03F4: case 0x03F5: case 0x03F6:
	case 0x0400: case 0x040D: case 0x0450: case 0x045D:
	case 0x0587: case 0x058A: case 0x05C4: case 0x0640:
	case 0x0653: case 0x0654: case 0x0655: case 0x066D:
	case 0x0B56:
	case 0x1E9B: case 0x202F: case 0x20AD:
	case 0x20AE: case 0x20AF:
	case 0x20E2: case 0x20E3:
	case 0x2139: case 0x213A: case 0x2183:
	case 0x2425: case 0x2426: case 0x2619:
	case 0x2670: case 0x2671: case 0x3007:
	case 0x3190: case 0x3191:
	case 0xFFFC: case 0xFFFD:
		return true;

	// Exceptional characters that the range table below would otherwise
	// catch. Originally those ranges are incorrect (they should not be
	// ignored) and most of these characters unfortunately lie in them.
	case 0x04D8: case 0x04D9:
	case 0x04E8: case 0x04E9:
	case 0x070F:
	case 0x3036: case 0x303F:
	case 0x337B: case 0xFB1E:
		return false;
	}

	for (int n = 0; n < ignorableRanges.Length; n += 2) {
		if (ignorableRanges [n] <= i && i <= ignorableRanges [n + 1])
			return true;
	}

	// Format and OtherNotAssigned are ignored by nature.
	UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
	return uc == UnicodeCategory.Format ||
		uc == UnicodeCategory.OtherNotAssigned;
}

// Inclusive (first, last) pairs of code point ranges that IsIgnorable()
// reports as ignorable.
static readonly int [] ignorableRanges = {
	0x0D82, 0x0DF4, // The whole Sinhala characters.
	0x0F00, 0x0FD1, // The whole Tibetan characters.
	0x1000, 0x1059, // The whole Myanmar characters.
	// The whole Ethiopic, Cherokee, Canadian Syllabics, Ogham, Runic,
	// Tagalog, Hanunoo, Philippine, Buhid, Tagbanwa, Khmer and
	// Mongolian characters.
	0x1200, 0x1DFF,
	0x1F00, 0x1FFF, // Greek extension characters.
	0x2800, 0x28FF, // The whole Braille characters.
	0x2E80, 0x2EF3, // CJK radical characters.
	0x2F00, 0x2FD5, // Kangxi radical characters.
	0x2FF0, 0x2FFB, // Ideographic description characters.
	0x31A0, 0x31B7, // Bopomofo letter and final
	0x25F0, 0x25F7, // White square with quadrant characters.
	0x32C0, 0x32CB, // Ideographic telegraph symbols.
	0x3358, 0x3370,
	0x33E0, 0x33FF,
	0xA000, 0xA48C, // The whole YI characters.
	0xA490, 0xA4C6,
	0xFB13, 0xFB17, // Armenian small ligatures
	0xFB1D, 0xFE2F, // hebrew, arabic, variation selector.
	0xFEF5, 0xFEFC, // Arabic ligatures.
	// FIXME: why are they excluded?
	0x01F6, 0x01F9,
	0x0218, 0x0233,
	0x02A9, 0x02AD,
	0x02EA, 0x02EE,
	0x0349, 0x036F,
	0x0488, 0x048F,
	0x04D0, 0x04FF,
	0x0500, 0x050F, // actually it matters only for 2.0
	0x06D6, 0x06ED,
	0x06FA, 0x06FE,
	0x2048, 0x204D,
	0x20E4, 0x20EA,
	0x213C, 0x214B,
	0x21EB, 0x21FF,
	0x22F2, 0x22FF,
	0x237B, 0x239A,
	0x239B, 0x23CF,
	0x24EB, 0x24FF,
	0x2596, 0x259F,
	0x25F8, 0x25FF,
	0x2672, 0x2689,
	0x2768, 0x2775,
	0x27D0, 0x27FF,
	0x2900, 0x2AFF,
	0x3033, 0x303F,
	0x31F0, 0x31FF,
	0x3250, 0x325F,
	0x32B1, 0x32BF,
	0x3371, 0x337B,
	0xFA30, 0xFA6A,
};
4105 // To check IsIgnorable sanity, try the driver below under MS.NET.
4108 public static void Main ()
4110 for (int i = 0; i <= char.MaxValue; i++)
4111 Dump (i, IsIgnorable (i));
4114 static void Dump (int i, bool ignore)
4116 switch (Char.GetUnicodeCategory ((char) i)) {
4117 case UnicodeCategory.PrivateUse:
4118 case UnicodeCategory.Surrogate:
4119 return; // check nothing
4122 string s1 = "";
4123 string s2 = new string ((char) i, 10);
4124 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
4125 if ((ret == 0) == ignore)
4126 return;
4127 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
4130 #endregion // IsIgnorable
4132 #region IsIgnorableSymbol
4133 static bool IsIgnorableSymbol (int i)
4135 if (IsIgnorable (i))
4136 return true;
4138 switch (i) {
4139 // *Letter
4140 case 0x00b5: case 0x01C0: case 0x01C1:
4141 case 0x01C2: case 0x01C3: case 0x01F6:
4142 case 0x01F7: case 0x01F8: case 0x01F9:
4143 case 0x02D0: case 0x02EE: case 0x037A:
4144 case 0x03D7: case 0x03F3:
4145 case 0x0400: case 0x040d:
4146 case 0x0450: case 0x045d:
4147 case 0x048C: case 0x048D:
4148 case 0x048E: case 0x048F:
4149 case 0x0587: case 0x0640: case 0x06E5:
4150 case 0x06E6: case 0x06FA: case 0x06FB:
4151 case 0x06FC: case 0x093D: case 0x0950:
4152 case 0x1E9B: case 0x2139: case 0x3006:
4153 case 0x3033: case 0x3034: case 0x3035:
4154 case 0xFE7E: case 0xFE7F:
4155 // OtherNumber
4156 case 0x16EE: case 0x16EF: case 0x16F0:
4157 // LetterNumber
4158 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
4159 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
4160 case 0x3038: // HANGZHOU NUMERAL TEN
4161 case 0x3039: // HANGZHOU NUMERAL TWENTY
4162 case 0x303a: // HANGZHOU NUMERAL THIRTY
4163 // OtherSymbol
4164 case 0x2117:
4165 case 0x327F:
4166 return true;
4167 // ModifierSymbol
4168 case 0x02B9: case 0x02BA: case 0x02C2:
4169 case 0x02C3: case 0x02C4: case 0x02C5:
4170 case 0x02C8: case 0x02CC: case 0x02CD:
4171 case 0x02CE: case 0x02CF: case 0x02D2:
4172 case 0x02D3: case 0x02D4: case 0x02D5:
4173 case 0x02D6: case 0x02D7: case 0x02DE:
4174 case 0x02E5: case 0x02E6: case 0x02E7:
4175 case 0x02E8: case 0x02E9:
4176 case 0x309B: case 0x309C:
4177 // OtherPunctuation
4178 case 0x055A: // American Apos
4179 case 0x05C0: // Hebrew Punct
4180 case 0x0E4F: // Thai FONGMAN
4181 case 0x0E5A: // Thai ANGKHANKHU
4182 case 0x0E5B: // Thai KHOMUT
4183 // CurencySymbol
4184 case 0x09F2: // Bengali Rupee Mark
4185 case 0x09F3: // Bengali Rupee Sign
4186 // MathSymbol
4187 case 0x221e: // INF.
4188 // OtherSymbol
4189 case 0x0482:
4190 case 0x09FA:
4191 case 0x0B70:
4192 return false;
4195 // *Letter
4196 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
4197 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
4198 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
4200 return true;
4202 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4203 switch (uc) {
4204 case UnicodeCategory.Surrogate:
4205 return false; // inconsistent
4207 case UnicodeCategory.SpacingCombiningMark:
4208 case UnicodeCategory.EnclosingMark:
4209 case UnicodeCategory.NonSpacingMark:
4210 case UnicodeCategory.PrivateUse:
4211 // NonSpacingMark
4212 if (0x064B <= i && i <= 0x0652) // Arabic
4213 return true;
4214 return false;
4216 case UnicodeCategory.Format:
4217 case UnicodeCategory.OtherNotAssigned:
4218 return true;
4220 default:
4221 bool use = false;
4222 // OtherSymbols
4223 if (
4224 // latin in a circle
4225 0x249A <= i && i <= 0x24E9
4226 || 0x2100 <= i && i <= 0x2132
4227 // Japanese
4228 || 0x3196 <= i && i <= 0x31A0
4229 // Korean
4230 || 0x3200 <= i && i <= 0x321C
4231 // Chinese/Japanese
4232 || 0x322A <= i && i <= 0x3243
4233 // CJK
4234 || 0x3260 <= i && i <= 0x32B0
4235 || 0x32D0 <= i && i <= 0x3357
4236 || 0x337B <= i && i <= 0x33DD
4238 use = !Char.IsLetterOrDigit ((char) i);
4239 if (use)
4240 return false;
4242 // This "Digit" rule is a mystery.
4243 // It filters some symbols out.
4244 if (Char.IsLetterOrDigit ((char) i))
4245 return false;
4246 if (Char.IsNumber ((char) i))
4247 return false;
4248 if (Char.IsControl ((char) i)
4249 || Char.IsSeparator ((char) i)
4250 || Char.IsPunctuation ((char) i))
4251 return true;
4252 if (Char.IsSymbol ((char) i))
4253 return true;
4255 // FIXME: should check more
4256 return false;
4260 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// Manual sanity-check driver for IsIgnorableSymbol(). For every UTF-16 code
// unit that is not a surrogate, it compares the IsIgnorableSymbol() verdict
// with what the invariant culture's CompareInfo does under
// CompareOptions.IgnoreSymbols, and prints one line per disagreement.
// NOTE(review): presumably this driver is meant to stay disabled in normal
// builds and only be enabled by hand when checking against MS.NET - confirm.
public static void Main ()
{
	const string baseline = "TEST ";
	CompareInfo invariant = CultureInfo.InvariantCulture.CompareInfo;

	for (int cp = 0; cp <= char.MaxValue; cp++) {
		char c = (char) cp;
		UnicodeCategory category = Char.GetUnicodeCategory (c);
		if (category == UnicodeCategory.Surrogate)
			continue;

		bool predicted = IsIgnorableSymbol (cp);

		// If appending the character leaves the comparison result at 0,
		// the runtime ignored it.
		int cmp = invariant.Compare (baseline, baseline + c, CompareOptions.IgnoreSymbols);
		bool ignoredByRuntime = cmp == 0;
		if (predicted == ignoredByRuntime)
			continue;

		Console.WriteLine ("{0} : {1:x}[{2}]({3})",
			predicted ? "should not ignore" :
			"should ignore",
			cp, c, category);
	}
}
4285 #endregion
4287 #region NonSpacing
// Decides whether code point i is treated as an ignorable non-spacing
// character. The checks run in this order: IsIgnorable(), an explicit
// per-character table, two sets of code point ranges, and finally the
// Unicode category of the character.
static bool IsIgnorableNonSpacing (int i)
{
	// Anything IsIgnorable() accepts is accepted here as well.
	if (IsIgnorable (i))
		return true;

	// Individual characters with a fixed answer.
	switch (i) {
	case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
	case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
	case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
		return true;
	case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
	case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
	case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
	case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
	case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
	case 0x0CCD: case 0x0E4E:
		return false;
	}

	// Ranges that are always reported as ignorable.
	bool inIgnorableRange =
		(0x02B9 <= i && i <= 0x02C5)
		|| (0x02CC <= i && i <= 0x02D7)
		|| (0x02E4 <= i && i <= 0x02EF)
		|| (0x20DD <= i && i <= 0x20E0);
	if (inIgnorableRange)
		return true;

	// Ranges that are never reported as ignorable, whatever their
	// Unicode category is.
	bool inExcludedRange =
		(0x064B <= i && i <= 0x0652)
		|| (0x0941 <= i && i <= 0x0948)
		|| (0x0AC1 <= i && i <= 0x0ACD)
		|| (0x0C3E <= i && i <= 0x0C4F)
		|| (0x0E31 <= i && i <= 0x0E3F);
	if (inExcludedRange)
		return false;

	// Everything else: ignorable exactly when it is a NonSpacingMark.
	UnicodeCategory category = Char.GetUnicodeCategory ((char) i);
	return category == UnicodeCategory.NonSpacingMark;
}
4326 // We can reuse IsIgnorableSymbol testcode
4327 // for IsIgnorableNonSpacing.
4328 #endregion
// One character map entry: a category byte plus level 1 and level 2 bytes.
// Defined is false on a default-initialized entry and true on any entry
// built through the three-argument constructor.
struct CharMapEntry
{
	public byte Category;
	public byte Level1;
	public byte Level2; // It is always single byte.
	public bool Defined;

	// Stores the three bytes and marks the entry as defined.
	public CharMapEntry (byte category, byte level1, byte level2)
	{
		this.Defined = true;
		this.Category = category;
		this.Level1 = level1;
		this.Level2 = level2;
	}
}
// Immutable pair of two integer codes for one character: CP and JIS.
// JISComparer orders instances by the JIS field.
class JISCharacter
{
	public readonly int CP;
	public readonly int JIS;

	// cp is stored in CP, cpJIS is stored in JIS.
	public JISCharacter (int cp, int cpJIS)
	{
		this.CP = cp;
		this.JIS = cpJIS;
	}
}
// Orders JISCharacter instances by ascending JIS value.
class JISComparer : IComparer
{
	public static readonly JISComparer Instance =
		new JISComparer ();

	// Both arguments must be JISCharacter instances; the casts throw
	// InvalidCastException for any other non-null object.
	// Returns a negative value, zero or a positive value when o1.JIS is
	// less than, equal to or greater than o2.JIS.
	public int Compare (object o1, object o2)
	{
		JISCharacter j1 = (JISCharacter) o1;
		JISCharacter j2 = (JISCharacter) o2;
		// CompareTo rather than "j1.JIS - j2.JIS": the subtraction can
		// wrap around for operands of opposite sign and then reports
		// the wrong ordering.
		return j1.JIS.CompareTo (j2.JIS);
	}
}
// Immutable pair of an integer code (CP) and a name string.
// NonJISComparer orders instances by the Name field.
class NonJISCharacter
{
	public readonly int CP;
	public readonly string Name;

	// cp is stored in CP, name is stored in Name.
	public NonJISCharacter (int cp, string name)
	{
		this.CP = cp;
		this.Name = name;
	}
}
// Orders NonJISCharacter instances by an ordinal comparison of their
// Name fields.
class NonJISComparer : IComparer
{
	public static readonly NonJISComparer Instance =
		new NonJISComparer ();

	// Both arguments must be NonJISCharacter instances; the casts throw
	// InvalidCastException for any other non-null object.
	public int Compare (object o1, object o2)
	{
		string leftName = ((NonJISCharacter) o1).Name;
		string rightName = ((NonJISCharacter) o2).Name;
		return string.CompareOrdinal (leftName, rightName);
	}
}
// Orders DictionaryEntry items by their decimal Value. Entries with equal
// values are ordered by their int Key.
class DecimalDictionaryValueComparer : IComparer
{
	public static readonly DecimalDictionaryValueComparer Instance
		= new DecimalDictionaryValueComparer ();

	// Singleton: use Instance.
	private DecimalDictionaryValueComparer ()
	{
	}

	// o1 and o2 must be boxed DictionaryEntry values whose Key is a boxed
	// int and whose Value is a boxed decimal; the unboxing casts throw
	// otherwise.
	// Returns a negative value, zero or a positive value following the
	// IComparer convention.
	public int Compare (object o1, object o2)
	{
		DictionaryEntry e1 = (DictionaryEntry) o1;
		DictionaryEntry e2 = (DictionaryEntry) o2;
		// FIXME: in case of 0, compare decomposition categories
		int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
		if (ret != 0)
			return ret;
		int i1 = (int) e1.Key;
		int i2 = (int) e2.Key;
		// CompareTo rather than "i1 - i2": the subtraction can wrap
		// around for keys of opposite sign and then reports the wrong
		// ordering.
		return i1.CompareTo (i2);
	}
}
// Orders DictionaryEntry items by their string Value. Entries with equal
// values are ordered by their int Key.
class StringDictionaryValueComparer : IComparer
{
	public static readonly StringDictionaryValueComparer Instance
		= new StringDictionaryValueComparer ();

	// Singleton: use Instance.
	private StringDictionaryValueComparer ()
	{
	}

	// o1 and o2 must be boxed DictionaryEntry values whose Key is a boxed
	// int and whose Value is a string (or null); the casts throw
	// otherwise.
	// Returns a negative value, zero or a positive value following the
	// IComparer convention.
	public int Compare (object o1, object o2)
	{
		DictionaryEntry e1 = (DictionaryEntry) o1;
		DictionaryEntry e2 = (DictionaryEntry) o2;
		// NOTE(review): String.Compare (string, string) uses the current
		// culture, so the resulting order may depend on the machine
		// running the tool - confirm whether that is intended.
		int ret = String.Compare ((string) e1.Value, (string) e2.Value);
		if (ret != 0)
			return ret;
		int i1 = (int) e1.Key;
		int i2 = (int) e2.Key;
		// CompareTo rather than "i1 - i2": the subtraction can wrap
		// around for keys of opposite sign and then reports the wrong
		// ordering.
		return i1.CompareTo (i2);
	}
}
// Orders boxed chars by the sort keys CollationElementTable reports for
// them. Keys are compared position by position (Primary, Secondary,
// Thirtiary, Quarternary, in that order); when all shared positions are
// equal, the character with fewer sort keys comes first.
class UCAComparer : IComparer
{
	public static readonly UCAComparer Instance
		= new UCAComparer ();

	// Singleton: use Instance.
	private UCAComparer ()
	{
	}

	// o1 and o2 must be boxed char values; the unboxing casts throw
	// otherwise.
	public int Compare (object o1, object o2)
	{
		char left = (char) o1;
		char right = (char) o2;

		int leftCount = CollationElementTable.GetSortKeyCount (left);
		int rightCount = CollationElementTable.GetSortKeyCount (right);
		// Only positions that both characters have can be compared.
		int shared = leftCount < rightCount ? leftCount : rightCount;

		for (int idx = 0; idx < shared; idx++) {
			SortKeyValue a = CollationElementTable.GetSortKey (left, idx);
			SortKeyValue b = CollationElementTable.GetSortKey (right, idx);

			// The first level that differs decides the result.
			int diff = a.Primary - b.Primary;
			if (diff == 0)
				diff = a.Secondary - b.Secondary;
			if (diff == 0)
				diff = a.Thirtiary - b.Thirtiary;
			if (diff == 0)
				diff = a.Quarternary - b.Quarternary;
			if (diff != 0)
				return diff;
		}
		return leftCount - rightCount;
	}
}
// A list of tailoring items for one LCID. Items are added through the
// Add*Map() methods and serialized, in insertion order, into a single
// char array by ItemToCharArray().
class Tailoring
{
	int lcid;
	int alias;
	bool frenchSort;
	ArrayList items = new ArrayList ();

	// Same as Tailoring (lcid, 0).
	public Tailoring (int lcid)
		: this (lcid, 0)
	{
	}

	public Tailoring (int lcid, int alias)
	{
		this.alias = alias;
		this.lcid = lcid;
	}

	public int LCID {
		get { return this.lcid; }
	}

	public int Alias {
		get { return this.alias; }
	}

	public bool FrenchSort {
		get { return this.frenchSort; }
		set { this.frenchSort = value; }
	}

	// Appends a DiacriticalMap item.
	public void AddDiacriticalMap (byte target, byte replace)
	{
		ITailoringMap map = new DiacriticalMap (target, replace);
		items.Add (map);
	}

	// Appends a SortKeyMap item. Only the first four bytes of sortkey
	// are used when the item is serialized.
	public void AddSortKeyMap (string source, byte [] sortkey)
	{
		ITailoringMap map = new SortKeyMap (source, sortkey);
		items.Add (map);
	}

	// Appends a ReplacementMap item.
	public void AddReplacementMap (string source, string replace)
	{
		ITailoringMap map = new ReplacementMap (source, replace);
		items.Add (map);
	}

	// Concatenates the ToCharArray() output of every item, in the order
	// the items were added.
	public char [] ItemToCharArray ()
	{
		ArrayList flattened = new ArrayList ();
		for (int n = 0; n < items.Count; n++) {
			ITailoringMap map = (ITailoringMap) items [n];
			flattened.AddRange (map.ToCharArray ());
		}
		return (char []) flattened.ToArray (typeof (char));
	}

	// Common shape of all tailoring items: each one serializes itself
	// into a char array whose first element identifies the item kind.
	interface ITailoringMap
	{
		char [] ToCharArray ();
	}

	// Serialized as three chars: kind (2), Target, Replace.
	class DiacriticalMap : ITailoringMap
	{
		public readonly byte Target;
		public readonly byte Replace;

		public DiacriticalMap (byte target, byte replace)
		{
			this.Target = target;
			this.Replace = replace;
		}

		public char [] ToCharArray ()
		{
			return new char [] {
				(char) 2, // kind:DiacriticalMap
				(char) Target,
				(char) Replace
			};
		}
	}

	// Serialized as: kind (1), the Source chars, a null char, the first
	// four SortKey bytes (one char each), and a trailing null char.
	class SortKeyMap : ITailoringMap
	{
		public readonly string Source;
		public readonly byte [] SortKey;

		public SortKeyMap (string source, byte [] sortkey)
		{
			this.Source = source;
			this.SortKey = sortkey;
		}

		public char [] ToCharArray ()
		{
			char [] ret = new char [Source.Length + 7];
			int pos = 0;
			ret [pos++] = (char) 1; // kind:SortKeyMap
			foreach (char c in Source)
				ret [pos++] = c;
			// Leave one zeroed slot as the null terminator.
			pos++;
			for (int k = 0; k < 4; k++)
				ret [pos++] = (char) SortKey [k];
			// The last slot stays zero.
			return ret;
		}
	}

	// Serialized as: kind (3), the Source chars, a null char, the
	// Replace chars, and a trailing null char.
	class ReplacementMap : ITailoringMap
	{
		public readonly string Source;
		public readonly string Replace;

		public ReplacementMap (string source, string replace)
		{
			this.Source = source;
			this.Replace = replace;
		}

		public char [] ToCharArray ()
		{
			char [] ret = new char [Source.Length + Replace.Length + 3];
			int pos = 0;
			ret [pos++] = (char) 3; // kind:ReplaceMap
			foreach (char c in Source)
				ret [pos++] = c;
			// Leave one zeroed slot as the null terminator.
			pos++;
			foreach (char c in Replace)
				ret [pos++] = c;
			// The last slot stays zero and terminates Replace.
			return ret;
		}
	}
}