class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

   1 //
   2 // create-mscompat-collation-table.cs : generates Windows-like sortkey tables.
   3 //
   4 // Author:
   5 //      Atsushi Enomoto  <atsushi@ximian.com>
   6 //
   7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
   8 //
   9 // Permission is hereby granted, free of charge, to any person obtaining
  10 // a copy of this software and associated documentation files (the
  11 // "Software"), to deal in the Software without restriction, including
  12 // without limitation the rights to use, copy, modify, merge, publish,
  13 // distribute, sublicense, and/or sell copies of the Software, and to
  14 // permit persons to whom the Software is furnished to do so, subject to
  15 // the following conditions:
  16 //
  17 // The above copyright notice and this permission notice shall be
  18 // included in all copies or substantial portions of the Software.
  19 //
  20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  27 //
  28
  29 //
  30 // There are two kinds of sort keys: those which are computed and those which
  31 // are laid out as an indexed array. Computed sort keys are:
  32 //
  33 //      - Surrogate
  34 //      - PrivateUse
  35 //
  36 // Though it is possible to "compute" level 3 weights, they are still dumped
  37 // to an array to avoid execution cost.
  38 //
  39 #define Binary
  40
  41 using System;
  42 using System.IO;
  43 using System.Collections;
  44 using System.Globalization;
  45 using System.Text;
  46 using System.Xml;
  47
  48 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
  49
  50 namespace Mono.Globalization.Unicode
  51 {
  52         internal class MSCompatSortKeyTableGenerator
  53         {
  54                 public static void Main (string [] args)
  55                 {
  56                         new MSCompatSortKeyTableGenerator ().Run (args);
  57                 }
  58
                // Numeric tags, one per kind of Unicode decomposition
                // (wide, sub, small, ... canonical).
                // NOTE(review): presumably these are the values stored in the
                // byte-typed decompType table declared further down -- confirm
                // against the parsing code.
                // The "fixed" markers come from the original author; their
                // exact meaning is not visible in this part of the file.
                const int DecompositionWide = 1; // fixed
                const int DecompositionSub = 2; // fixed
                const int DecompositionSmall = 3;
                const int DecompositionIsolated = 4;
                const int DecompositionInitial = 5;
                const int DecompositionFinal = 6;
                const int DecompositionMedial = 7;
                const int DecompositionNoBreak = 8;
                const int DecompositionVertical = 9;
                const int DecompositionFraction = 0xA;
                const int DecompositionFont = 0xB;
                const int DecompositionSuper = 0xC; // fixed
                // Full (0xE) is declared before Narrow (0xD); the values are
                // still distinct, only the declaration order is swapped.
                const int DecompositionFull = 0xE;
                const int DecompositionNarrow = 0xD;
                const int DecompositionCircle = 0xF;
                const int DecompositionSquare = 0x10;
                const int DecompositionCompat = 0x11;
                const int DecompositionCanonical = 0x12;
  77
  78                 TextWriter CSResult = Console.Out;
  79                 TextWriter CResult = TextWriter.Null;
  80
  81                 byte [] fillIndex = new byte [256]; // by category
  82                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
  83
  84                 char [] specialIgnore = new char [] {
  85                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
  86                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
  87                         };
  88
                // FIXME: need more love (as always)
                // Base letters: 'A'..'Z' followed by three extra code points
                // (U+0292, U+01BE, U+0298) -- 29 entries in total.
                char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
                        'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
                        'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
                        '\u0292', '\u01BE', '\u0298'};
                // Byte weights, also 29 entries.
                // NOTE(review): presumably alphaWeights [i] is the weight for
                // alphabets [i], since the lengths match -- confirm at the
                // use site.
                byte [] alphaWeights = new byte [] {
                        2, 9, 0xA, 0x1A, 0x21,
                        0x23, 0x25, 0x2C, 0x32, 0x35,
                        0x36, 0x48, 0x51, 0x70, 0x7C,
                        0x7E, 0x89, 0x8A, 0x91, 0x99,
                        0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
                        0xA9, 0xAA, 0xB3, 0xB4};
 101
 102                 bool [] isSmallCapital = new bool [char.MaxValue + 1];
 103                 bool [] isUppercase = new bool [char.MaxValue + 1];
 104
 105                 byte [] decompType = new byte [char.MaxValue + 1];
 106                 int [] decompIndex = new int [char.MaxValue + 1];
 107                 int [] decompLength = new int [char.MaxValue + 1];
 108                 int [] decompValues;
 109                 decimal [] decimalValue = new decimal [char.MaxValue + 1];
 110
 111                 byte [] diacritical = new byte [char.MaxValue + 1];
 112
 113                 string [] diacritics = new string [] {
 114                         // LATIN, CYRILLIC etc.
 115                         "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK",
 116                         "ABKHASIAN",
 117                         "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
 118                         "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
 119                         "WITH ACUTE;", "WITH GRAVE;",
 120                         //
 121                         "WITH DOT ABOVE;", " MIDDLE DOT;",
 122                         "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
 123                         "WITH DIALYTIKA;",
 124                         "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
 125                         "DIALYTIKA TONOS", "DIALYTIKA AND TONOS",
 126                         "ABKHASIAN CHE WITH DESCENDER",
 127                         "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
 128                         "WITH OGONEK;", "WITH CEDILLA;",
 129                         //
 130                         " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
 131                         "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
 132                         "STROKE OVERLAY",
 133                         " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
 134                         " DIAERESIS AND GRAVE;",
 135                         " BREVE AND ACUTE;",
 136                         " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
 137                         " MACRON AND ACUTE;",
 138                         " MACRON AND GRAVE;",
 139                         //
 140                         " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
 141                         " RING ABOVE AND ACUTE",
 142                         " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
 143                         " CIRCUMFLEX AND TILDE",
 144                         " TILDE AND DIAERESIS",
 145                         " STROKE AND ACUTE",
 146                         " BREVE AND TILDE",
 147                         " CEDILLA AND BREVE",
 148                         " OGONEK AND MACRON",
 149                         // 0x40
 150                         "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE",
 151                         "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
 152                         " DOUBLE GRAVE",
 153                         " INVERTED BREVE",
 154                         "ROMAN NUMERAL",
 155                         " PRECEDED BY APOSTROPHE",
 156                         "WITH HORN;",
 157                         " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
 158                         " PALATAL HOOK",
 159                         " DOT BELOW;",
 160                         " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK",
 161                         " RING BELOW", "LOW VERTICAL LINE",
 162                         //
 163                         " CIRCUMFLEX BELOW", "HORN AND ACUTE",
 164                         " BREVE BELOW;", " HORN AND GRAVE",
 165                         " LOW MACRON",
 166                         " TILDE BELOW",
 167                         " TOPBAR",
 168                         " DOT BELOW AND DOT ABOVE",
 169                         " RIGHT HALF RING", " HORN AND TILDE",
 170                         " CIRCUMFLEX AND DOT BELOW",
 171                         " BREVE AND DOT BELOW",
 172                         " DOT BELOW AND MACRON",
 173                         " TONE TWO",
 174                         " HORN AND HOOK ABOVE",
 175                         " HORN AND DOT",
 176                         // CIRCLED, PARENTHESIZED and so on
 177                         "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
 178                         "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
 179                         "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
 180                         };
 181                 byte [] diacriticWeights = new byte [] {
 182                         // LATIN.
 183                         3, 3, 3, 5, 5, 5, 5,
 184                         0xE, 0xF,
 185                         0xE, 0xF,
 186                         //
 187                         0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
 188                         0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
 189                         //
 190                         0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
 191                         0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
 192                         //
 193                         0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
 194                         0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
 195                         //
 196                         0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
 197                         0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59,
 198                         0x5A, 0x5A,
 199                         //
 200                         0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68,
 201                         0x69, 0x69, 0x6A, 0x6D, 0x6E,
 202                         0x87, 0x95, 0xAA,
 203                         // CIRCLED, PARENTHESIZED and so on.
 204                         0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
 205                         0xF3, 0xF3, 0xF3
 206                         };
 207
 208                 int [] numberSecondaryWeightBounds = new int [] {
 209                         0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
 210                         0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
 211                         0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
 212                         0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
 213                         0xE50, 0xE60, 0xED0, 0xEE0
 214                         };
 215
 216                 char [] orderedGurmukhi;
 217                 char [] orderedGujarati;
 218                 char [] orderedGeorgian;
 219                 char [] orderedThaana;
 220
 221                 static readonly char [] orderedTamilConsonants = new char [] {
 222                         // based on traditional Tamil consonants, except for
 223                         // Grantha (where Microsoft breaks traditionalism).
 224                         // http://www.angelfire.com/empire/thamizh/padanGaL
 225                         '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
 226                         '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
 227                         '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
 228                         '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
 229                         '\u0BB7', '\u0BB9'};
 230
 231                 // cp -> character name (only for some characters)
 232                 ArrayList sortableCharNames = new ArrayList ();
 233
 234                 // cp -> arrow value (int)
 235                 ArrayList arrowValues = new ArrayList ();
 236
 237                 // cp -> box value (int)
 238                 ArrayList boxValues = new ArrayList ();
 239
 240                 // cp -> level1 value
 241                 Hashtable arabicLetterPrimaryValues = new Hashtable ();
 242
 243                 // letterName -> cp
 244                 Hashtable arabicNameMap = new Hashtable ();
 245
 246                 // cp -> Hashtable [decompType] -> cp
 247                 Hashtable nfkdMap = new Hashtable ();
 248
 249                 // Latin letter -> ArrayList [int]
 250                 Hashtable latinMap = new Hashtable ();
 251
 252                 ArrayList jisJapanese = new ArrayList ();
 253                 ArrayList nonJisJapanese = new ArrayList ();
 254
 255                 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
 256                 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
 257                 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
 258                 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
 259                 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
 260
 261                 byte [] ignorableFlags = new byte [char.MaxValue + 1];
 262
 263                 static double [] unicodeAge = new double [char.MaxValue + 1];
 264
 265                 ArrayList tailorings = new ArrayList ();
 266
 267                 void Run (string [] args)
 268                 {
 269                         string dirname = args.Length == 0 ? "downloaded" : args [0];
 270                         ParseSources (dirname);
 271                         Console.Error.WriteLine ("parse done.");
 272
 273                         ModifyParsedValues ();
 274                         GenerateCore ();
 275                         Console.Error.WriteLine ("generation done.");
 276                         CResult = new StreamWriter ("collation-tables.h", false);
 277                         Serialize ();
 278                         CResult.Close ();
 279                         Console.Error.WriteLine ("serialization done.");
 280 /*
 281 StreamWriter sw = new StreamWriter ("agelog.txt");
 282 for (int i = 0; i < char.MaxValue; i++) {
 283 bool shouldBe = false;
 284 switch (Char.GetUnicodeCategory ((char) i)) {
 285 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
 286         shouldBe = true; break;
 287 }
 288 if (unicodeAge [i] >= 3.1)
 289         shouldBe = true;
 290 //if (IsIgnorable (i) != shouldBe)
 291 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
 292 }
 293 sw.Close ();
 294 */
 295                 }
 296
 297                 byte [] CompressArray (byte [] source, CodePointIndexer i)
 298                 {
 299                         return (byte []) CodePointIndexer.CompressArray  (
 300                                 source, typeof (byte), i);
 301                 }
 302
 303                 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
 304                 {
 305                         return (ushort []) CodePointIndexer.CompressArray  (
 306                                 source, typeof (ushort), i);
 307                 }
 308
                // Empty stub: the body does nothing and the argument is
                // ignored.
                // NOTE(review): no caller is visible in this part of the
                // file -- confirm whether it is still needed.
                void WriteByte (byte value)
                {

                }
 313
 314                 void Serialize ()
 315                 {
 316                         // Tailorings
 317                         SerializeTailorings ();
 318
 319                         byte [] categories = new byte [map.Length];
 320                         byte [] level1 = new byte [map.Length];
 321                         byte [] level2 = new byte [map.Length];
 322                         byte [] level3 = new byte [map.Length];
 323 // widthCompat is now removed from the mapping table.
 324 // If it turned out that it is still required, grep this source and uncomment
 325 // widthCompat related lines. FIXME: remove those lines in the future.
 326 //                      ushort [] widthCompat = new ushort [map.Length];
 327                         for (int i = 0; i < map.Length; i++) {
 328                                 categories [i] = map [i].Category;
 329                                 level1 [i] = map [i].Level1;
 330                                 level2 [i] = map [i].Level2;
 331                                 level3 [i] = ComputeLevel3Weight ((char) i);
 332 /*
 333                                 // For Japanese Half-width characters, don't
 334                                 // map widthCompat. It is IgnoreKanaType that
 335                                 // handles those width differences.
 336                                 if (0xFF6D <= i && i <= 0xFF9D)
 337                                         continue;
 338                                 switch (decompType [i]) {
 339                                 case DecompositionNarrow:
 340                                 case DecompositionWide:
 341                                 case DecompositionSuper:
 342                                 case DecompositionSub:
 343                                         // they are always 1 char
 344                                         widthCompat [i] = (ushort) decompValues [decompIndex [i]];
 345                                         break;
 346                                 }
 347 */
 348                         }
 349
 350                         // compress
 351                         ignorableFlags = CompressArray (ignorableFlags,
 352                                 UUtil.Ignorable);
 353                         categories = CompressArray (categories, UUtil.Category);
 354                         level1 = CompressArray (level1, UUtil.Level1);
 355                         level2 = CompressArray (level2, UUtil.Level2);
 356                         level3 = CompressArray (level3, UUtil.Level3);
 357 //                      widthCompat = (ushort []) CodePointIndexer.CompressArray (
 358 //                              widthCompat, typeof (ushort), UUtil.WidthCompat);
 359                         cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS);
 360                         cjkCHT = CompressArray (cjkCHT,UUtil.Cjk);
 361                         cjkJA = CompressArray (cjkJA, UUtil.Cjk);
 362                         cjkKO = CompressArray (cjkKO, UUtil.Cjk);
 363                         cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk);
 364
 365                         // Ignorables
 366                         CResult.WriteLine ("static const guint8  collation_table_ignorableFlags [] = {");
 367                         CSResult.WriteLine ("static readonly byte [] ignorableFlagsArr = new byte [] {");
 368 #if Binary
 369                         MemoryStream ms = new MemoryStream ();
 370                         BinaryWriter binary = new BinaryWriter (ms);
 371                         binary.Write (UUtil.ResourceVersion);
 372                         binary.Write (ignorableFlags.Length);
 373 #endif
 374                         for (int i = 0; i < ignorableFlags.Length; i++) {
 375                                 byte value = ignorableFlags [i];
 376                                 if (value < 10)
 377                                         CSResult.Write ("{0},", value);
 378                                 else
 379                                         CSResult.Write ("0x{0:X02},", value);
 380                                 CResult.Write ("{0},", value);
 381 #if Binary
 382                                 binary.Write (value);
 383 #endif
 384                                 if ((i & 0xF) == 0xF) {
 385                                         CSResult.WriteLine ("// {0:X04}",
 386                                                 UUtil.Ignorable.ToCodePoint (i - 0xF));
 387                                         CResult.WriteLine ();
 388                                 }
 389                         }
 390                         CResult.WriteLine ("0};");
 391                         CSResult.WriteLine ("};");
 392                         CSResult.WriteLine ();
 393
 394                         // Primary category
 395                         CResult.WriteLine ("static const guint8 collation_table_category [] = {");
 396                         CSResult.WriteLine ("static readonly byte [] categoriesArr = new byte [] {");
 397 #if Binary
 398                         binary.Write (categories.Length);
 399 #endif
 400                         for (int i = 0; i < categories.Length; i++) {
 401                                 byte value = categories [i];
 402                                 if (value < 10)
 403                                         CSResult.Write ("{0},", value);
 404                                 else
 405                                         CSResult.Write ("0x{0:X02},", value);
 406                                 CResult.Write ("{0},", value);
 407 #if Binary
 408                                 binary.Write (value);
 409 #endif
 410                                 if ((i & 0xF) == 0xF) {
 411                                         CSResult.WriteLine ("// {0:X04}",
 412                                                 UUtil.Category.ToCodePoint (i - 0xF));
 413                                         CResult.WriteLine ();
 414                                 }
 415                         }
 416                         CResult.WriteLine ("};");
 417                         CSResult.WriteLine ("};");
 418                         CSResult.WriteLine ();
 419
 420                         // Primary weight value
 421                         CResult.WriteLine ("static const guint8 collation_table_level1 [] = {");
 422                         CSResult.WriteLine ("static readonly byte [] level1Arr = new byte [] {");
 423 #if Binary
 424                         binary.Write (level1.Length);
 425 #endif
 426                         for (int i = 0; i < level1.Length; i++) {
 427                                 byte value = level1 [i];
 428                                 if (value < 10)
 429                                         CSResult.Write ("{0},", value);
 430                                 else
 431                                         CSResult.Write ("0x{0:X02},", value);
 432                                 CResult.Write ("{0},", value);
 433 #if Binary
 434                                 binary.Write (value);
 435 #endif
 436                                 if ((i & 0xF) == 0xF) {
 437                                         CSResult.WriteLine ("// {0:X04}",
 438                                                 UUtil.Level1.ToCodePoint (i - 0xF));
 439                                         CResult.WriteLine ();
 440                                 }
 441                         }
 442                         CResult.WriteLine ("0};");
 443                         CSResult.WriteLine ("};");
 444                         CSResult.WriteLine ();
 445
 446                         // Secondary weight
 447                         CResult.WriteLine ("static const guint8 collation_table_level2 [] = {");
 448                         CSResult.WriteLine ("static readonly byte [] level2Arr = new byte [] {");
 449 #if Binary
 450                         binary.Write (level2.Length);
 451 #endif
 452                         for (int i = 0; i < level2.Length; i++) {
 453                                 byte value = level2 [i];
 454                                 if (value < 10)
 455                                         CSResult.Write ("{0},", value);
 456                                 else
 457                                         CSResult.Write ("0x{0:X02},", value);
 458                                 CResult.Write ("{0},", value);
 459 #if Binary
 460                                 binary.Write (value);
 461 #endif
 462                                 if ((i & 0xF) == 0xF) {
 463                                         CSResult.WriteLine ("// {0:X04}",
 464                                                 UUtil.Level2.ToCodePoint (i - 0xF));
 465                                         CResult.WriteLine ();
 466                                 }
 467                         }
 468                         CResult.WriteLine ("0};");
 469                         CSResult.WriteLine ("};");
 470                         CSResult.WriteLine ();
 471
 472                         // Thirtiary weight
 473                         CResult.WriteLine ("static const guint8 collation_table_level3 [] = {");
 474                         CSResult.WriteLine ("static readonly byte [] level3Arr = new byte [] {");
 475 #if Binary
 476                         binary.Write (level3.Length);
 477 #endif
 478                         for (int i = 0; i < level3.Length; i++) {
 479                                 byte value = level3 [i];
 480                                 if (value < 10)
 481                                         CSResult.Write ("{0},", value);
 482                                 else
 483                                         CSResult.Write ("0x{0:X02},", value);
 484                                 CResult.Write ("{0},", value);
 485 #if Binary
 486                                 binary.Write (value);
 487 #endif
 488                                 if ((i & 0xF) == 0xF) {
 489                                         CSResult.WriteLine ("// {0:X04}",
 490                                                 UUtil.Level3.ToCodePoint (i - 0xF));
 491                                         CResult.WriteLine ();
 492                                 }
 493                         }
 494                         CResult.WriteLine ("0};");
 495                         CSResult.WriteLine ("};");
 496                         CSResult.WriteLine ();
 497
 498 /*
 499                         // Width insensitivity mappings
 500                         // (for now it is more lightweight than dumping the
 501                         // entire NFKD table).
 502                         CResult.WriteLine ("static const guint16* widthCompat [] = {");
 503                         CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {");
 504 #if Binary
 505                         binary.Write (widthCompat.Length);
 506 #endif
 507                         for (int i = 0; i < widthCompat.Length; i++) {
 508                                 ushort value = widthCompat [i];
 509                                 if (value < 10)
 510                                         CSResult.Write ("{0},", value);
 511                                 else
 512                                         CSResult.Write ("0x{0:X02},", value);
 513                                 CResult.Write ("{0},", value);
 514 #if Binary
 515                                 binary.Write (value);
 516 #endif
 517                                 if ((i & 0xF) == 0xF) {
 518                                         CSResult.WriteLine ("// {0:X04}",
 519                                                 UUtil.WidthCompat.ToCodePoint (i - 0xF));
 520                                         CResult.WriteLine ();
 521                                 }
 522                         }
 523                         CResult.WriteLine ("0};");
 524                         CSResult.WriteLine ("};");
 525                         CSResult.WriteLine ();
 526 */
 527
 528 #if Binary
 529                         using (FileStream fs = File.Create ("../resources/collation.core.bin")) {
 530                                 byte [] array = ms.ToArray ();
 531                                 fs.Write (array, 0, array.Length);
 532                         }
 533 #endif
 534
 535                         // CJK
 536                         SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
 537                         SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
 538                         SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
 539                         SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
 540                         SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
 541                 }
 542
 543                 void SerializeCJK (string name, ushort [] cjk, int max_unused)
 544                 {
 545 //                      CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, cjk.Length);
 546                         CSResult.WriteLine ("const int {0}ArrLength = {1};", name, cjk.Length);
 547
 548                         int len = cjk.Length;
 549                         CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name);
 550                         CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
 551                         // the actual length is *2
 552                         for (int i = 0; i < 4; i++, len /= 256) {
 553                                 CResult.Write ("{0},", len & 0xFF);
 554                                 CSResult.Write ("0x{0:X04},", len & 0xFF);
 555                         }
 556                         CResult.WriteLine ();
 557                         CSResult.WriteLine ();
 558 #if Binary
 559                         MemoryStream ms = new MemoryStream ();
 560                         BinaryWriter binary = new BinaryWriter (ms);
 561                         binary.Write (UUtil.ResourceVersion);
 562                         binary.Write (cjk.Length); // the actual size is *2.
 563 #endif
 564                         // category
 565                         for (int i = 0; i < cjk.Length; i++) {
 566 //                              if (i == max)
 567 //                                      break;
 568                                 byte value = (byte) (cjk [i] >> 8);
 569                                 if (value < 10)
 570                                         CSResult.Write ("{0},", value);
 571                                 else
 572                                         CSResult.Write ("0x{0:X02},", value);
 573                                 CResult.Write ("{0},", value);
 574 #if Binary
 575                                 binary.Write (value);
 576 #endif
 577                                 if ((i & 0xF) == 0xF) {
 578                                         CSResult.WriteLine ("// {0:X04}", i - 0xF);
 579                                         CResult.WriteLine ();
 580                                 }
 581                         }
 582
 583                         // level 1
 584                         for (int i = 0; i < cjk.Length; i++) {
 585 //                              if (i == max)
 586 //                                      break;
 587                                 byte value = (byte) (cjk [i] & 0xFF);
 588                                 if (value < 10)
 589                                         CSResult.Write ("{0},", value);
 590                                 else
 591                                         CSResult.Write ("0x{0:X02},", value);
 592                                 CResult.Write ("{0},", value);
 593 #if Binary
 594                                 binary.Write (value);
 595 #endif
 596                                 if ((i & 0xF) == 0xF) {
 597                                         CSResult.WriteLine ("// {0:X04}", i - 0xF);
 598                                         CResult.WriteLine ();
 599                                 }
 600                         }
 601
 602                         CResult.WriteLine ("0};");
 603                         CSResult.WriteLine ("};");
 604                         CSResult.WriteLine ();
 605 #if Binary
 606                         using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) {
 607                                 byte [] array = ms.ToArray ();
 608                                 fs.Write (array, 0, array.Length);
 609                         }
 610 #endif
 611                 }
 612
 613                 void SerializeCJK (string name, byte [] cjk, int max)
 614                 {
 615                         CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name);
 616                         CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
 617 #if Binary
 618                         MemoryStream ms = new MemoryStream ();
 619                         BinaryWriter binary = new BinaryWriter (ms);
 620                         binary.Write (UUtil.ResourceVersion);
 621 #endif
 622                         for (int i = 0; i < cjk.Length; i++) {
 623                                 if (i == max)
 624                                         break;
 625                                 byte value = cjk [i];
 626                                 if (value < 10)
 627                                         CSResult.Write ("{0},", value);
 628                                 else
 629                                         CSResult.Write ("0x{0:X02},", value);
 630                                 CResult.Write ("{0},", value);
 631 #if Binary
 632                                 binary.Write (value);
 633 #endif
 634                                 if ((i & 0xF) == 0xF) {
 635                                         CSResult.WriteLine ("// {0:X04}", i - 0xF);
 636                                         CResult.WriteLine ();
 637                                 }
 638                         }
 639                         CResult.WriteLine ("0};");
 640                         CSResult.WriteLine ("};");
 641                         CSResult.WriteLine ();
 642 #if Binary
 643                         using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) {
 644                                 byte [] array = ms.ToArray ();
 645                                 fs.Write (array, 0, array.Length);
 646                         }
 647 #endif
 648                 }
 649
 650                 void SerializeTailorings ()
 651                 {
 652                         Hashtable indexes = new Hashtable ();
 653                         Hashtable counts = new Hashtable ();
 654                         CResult.WriteLine ("static const guint16 collation_table_tailoring [] = {");
 655                         CSResult.WriteLine ("static char [] tailoringArr = new char [] {");
 656                         int count = 0;
 657 #if Binary
 658                         MemoryStream ms = new MemoryStream ();
 659                         BinaryWriter binary = new BinaryWriter (ms);
 660                         // Here we don't need to output resource version.
 661                         // This is cached.
 662 #endif
 663                         foreach (Tailoring t in tailorings) {
 664                                 if (t.Alias != 0)
 665                                         continue;
 666                                 CResult.Write ("/*{0}*/", t.LCID);
 667                                 CSResult.Write ("/*{0}*/", t.LCID);
 668                                 indexes.Add (t.LCID, count);
 669                                 char [] values = t.ItemToCharArray ();
 670                                 counts.Add (t.LCID, values.Length);
 671                                 foreach (char c in values) {
 672                                         CSResult.Write ("'\\x{0:X}', ", (int) c);
 673                                         CResult.Write ("{0},", (int) c);
 674                                         if (++count % 16 == 0) {
 675                                                 CSResult.WriteLine (" // {0:X04}", count - 16);
 676                                                 CResult.WriteLine ();
 677                                         }
 678 #if Binary
 679                                         binary.Write ((ushort) c);
 680 #endif
 681                                 }
 682                         }
 683                         CResult.WriteLine ("0};");
 684                         CSResult.WriteLine ("};");
 685
 686                         CResult.WriteLine ("static const guint32 collation_table_tailoring_infos [] = {");
 687                         CResult.WriteLine ("{0}, /*count*/", tailorings.Count);
 688                         CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
 689 #if Binary
 690                         byte [] rawdata = ms.ToArray ();
 691                         ms = new MemoryStream ();
 692                         binary = new BinaryWriter (ms);
 693                         binary.Write (UUtil.ResourceVersion);
 694                         binary.Write (tailorings.Count);
 695 #endif
 696                         foreach (Tailoring t in tailorings) {
 697                                 int target = t.Alias != 0 ? t.Alias : t.LCID;
 698                                 if (!indexes.ContainsKey (target)) {
 699                                         throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
 700                                         continue;
 701                                 }
 702                                 int idx = (int) indexes [target];
 703                                 int cnt = (int) counts [target];
 704                                 bool french = t.FrenchSort;
 705                                 if (t.Alias != 0)
 706                                         foreach (Tailoring t2 in tailorings)
 707                                                 if (t2.LCID == t.LCID)
 708                                                         french = t2.FrenchSort;
 709                                 CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
 710                                 CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 1 : 0);
 711 #if Binary
 712                                 binary.Write (t.LCID);
 713                                 binary.Write (idx);
 714                                 binary.Write (cnt);
 715                                 binary.Write (french);
 716 #endif
 717                         }
 718                         CResult.WriteLine ("0};");
 719                         CSResult.WriteLine ("};");
 720 #if Binary
 721                         binary.Write ((byte) 0xFF);
 722                         binary.Write ((byte) 0xFF);
 723                         binary.Write (rawdata.Length / 2);
 724                         binary.Write (rawdata, 0, rawdata.Length);
 725
 726
 727                         using (FileStream fs = File.Create ("../resources/collation.tailoring.bin")) {
 728                                 byte [] array = ms.ToArray ();
 729                                 fs.Write (array, 0, array.Length);
 730                         }
 731 #endif
 732                 }
 733
 734                 #region Parse
 735
 736                 void ParseSources (string dirname)
 737                 {
 738                         string unidata =
 739                                 dirname + "/UnicodeData.txt";
 740                         string derivedCoreProps =
 741                                 dirname + "/DerivedCoreProperties.txt";
 742                         string scripts =
 743                                 dirname + "/Scripts.txt";
 744                         string cp932 =
 745                                 dirname + "/CP932.TXT";
 746                         string derivedAge =
 747                                 dirname + "/DerivedAge.txt";
 748                         string chXML = dirname + "/common/collation/zh.xml";
 749                         string jaXML = dirname + "/common/collation/ja.xml";
 750                         string koXML = dirname + "/common/collation/ko.xml";
 751
 752                         ParseDerivedAge (derivedAge);
 753
 754                         FillIgnorables ();
 755
 756                         ParseJISOrder (cp932); // in prior to ParseUnidata()
 757                         ParseUnidata (unidata);
 758                         ModifyUnidata ();
 759                         ParseDerivedCoreProperties (derivedCoreProps);
 760                         ParseScripts (scripts);
 761                         ParseCJK (chXML, jaXML, koXML);
 762
 763                         ParseTailorings ("mono-tailoring-source.txt");
 764                 }
 765
 766                 void ParseTailorings (string filename)
 767                 {
 768                         Tailoring t = null;
 769                         int line = 0;
 770                         using (StreamReader sr = new StreamReader (filename)) {
 771                                 try {
 772                                         while (sr.Peek () >= 0) {
 773                                                 line++;
 774                                                 ProcessTailoringLine (ref t,
 775                                                         sr.ReadLine ().Trim ());
 776                                         }
 777                                 } catch (Exception) {
 778                                         Console.Error.WriteLine ("ERROR at line {0}", line);
 779                                         throw;
 780                                 }
 781                         }
 782                 }
 783
 784                 // For now this is enough.
 785                 string ParseTailoringSourceValue (string s)
 786                 {
 787                         StringBuilder sb = new StringBuilder ();
 788                         for (int i = 0; i < s.Length; i++) {
 789                                 if (i + 5 < s.Length &&
 790                                         s [i] == '\\' && s [i + 1] == 'u') {
 791                                         sb.Append (
 792                                                 (char) int.Parse (
 793                                                         s.Substring (i + 2, 4),
 794                                                         NumberStyles.HexNumber),
 795                                                 1);
 796                                         i += 5;
 797                                 }
 798                                 else
 799                                         sb.Append (s [i]);
 800                         }
 801                         return sb.ToString ();
 802                 }
 803
 804                 void ProcessTailoringLine (ref Tailoring t, string s)
 805                 {
 806                         int idx = s.IndexOf ('#');
 807                         if (idx > 0)
 808                                 s = s.Substring (0, idx).Trim ();
 809                         if (s.Length == 0 || s [0] == '#')
 810                                 return;
 811                         if (s [0] == '@') {
 812                                 idx = s.IndexOf ('=');
 813                                 if (idx > 0)
 814                                         t = new Tailoring (
 815                                                 int.Parse (s.Substring (1, idx - 1)),
 816                                                 int.Parse (s.Substring (idx + 1)));
 817                                 else
 818                                         t = new Tailoring (int.Parse (s.Substring (1)));
 819                                 tailorings.Add (t);
 820                                 return;
 821                         }
 822                         if (s.StartsWith ("*FrenchSort")) {
 823                                 t.FrenchSort = true;
 824                                 return;
 825                         }
 826                         string d = "*Diacritical";
 827                         if (s.StartsWith (d)) {
 828                                 idx = s.IndexOf ("->");
 829                                 t.AddDiacriticalMap (
 830                                         byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
 831                                                 NumberStyles.HexNumber),
 832                                         byte.Parse (s.Substring (idx + 2).Trim (),
 833                                                 NumberStyles.HexNumber));
 834                                 return;
 835                         }
 836                         idx = s.IndexOf (':');
 837                         if (idx > 0) {
 838                                 string source = s.Substring (0, idx).Trim ();
 839                                 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
 840                                 byte [] b = new byte [4];
 841                                 for (int i = 0; i < 4; i++) {
 842                                         if (l [i] == "*")
 843                                                 b [i] = 0;
 844                                         else
 845                                                 b [i] = byte.Parse (l [i],
 846                                                         NumberStyles.HexNumber);
 847                                 }
 848                                 t.AddSortKeyMap (ParseTailoringSourceValue (source),
 849                                         b);
 850                         }
 851                         idx = s.IndexOf ('=');
 852                         if (idx > 0)
 853                                 t.AddReplacementMap (
 854                                         ParseTailoringSourceValue (
 855                                                 s.Substring (0, idx).Trim ()),
 856                                         ParseTailoringSourceValue (
 857                                                 s.Substring (idx + 1).Trim ()));
 858                 }
 859
 860                 void ParseDerivedAge (string filename)
 861                 {
 862                         using (StreamReader file =
 863                                 new StreamReader (filename)) {
 864                                 while (file.Peek () >= 0) {
 865                                         string s = file.ReadLine ();
 866                                         int idx = s.IndexOf ('#');
 867                                         if (idx >= 0)
 868                                                 s = s.Substring (0, idx);
 869                                         idx = s.IndexOf (';');
 870                                         if (idx < 0)
 871                                                 continue;
 872
 873                                         string cpspec = s.Substring (0, idx);
 874                                         idx = cpspec.IndexOf ("..");
 875                                         NumberStyles nf = NumberStyles.HexNumber |
 876                                                 NumberStyles.AllowTrailingWhite;
 877                                         int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
 878                                         int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
 879                                         string value = s.Substring (cpspec.Length + 1).Trim ();
 880
 881                                         // FIXME: use index
 882                                         if (cp > char.MaxValue)
 883                                                 continue;
 884
 885                                         double v = double.Parse (value);
 886                                         for (int i = cp; i <= cpEnd; i++)
 887                                                 unicodeAge [i] = v;
 888                                 }
 889                         }
 890                         unicodeAge [0] = double.MaxValue; // never be supported
 891                 }
 892
 893                 void ParseUnidata (string filename)
 894                 {
 895                         ArrayList decompValues = new ArrayList ();
 896                         using (StreamReader unidata =
 897                                 new StreamReader (filename)) {
 898                                 for (int line = 1; unidata.Peek () >= 0; line++) {
 899                                         try {
 900                                                 ProcessUnidataLine (unidata.ReadLine (), decompValues);
 901                                         } catch (Exception) {
 902                                                 Console.Error.WriteLine ("**** At line " + line);
 903                                                 throw;
 904                                         }
 905                                 }
 906                         }
 907                         this.decompValues = (int [])
 908                                 decompValues.ToArray (typeof (int));
 909                 }
 910
 911                 char previousLatinTarget = char.MinValue;
 912                 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
 913
 914                 void ProcessUnidataLine (string s, ArrayList decompValues)
 915                 {
 916                         int idx = s.IndexOf ('#');
 917                         if (idx >= 0)
 918                                 s = s.Substring (0, idx);
 919                         idx = s.IndexOf (';');
 920                         if (idx < 0)
 921                                 return;
 922                         int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
 923                         string [] values = s.Substring (idx + 1).Split (';');
 924
 925                         // FIXME: use index
 926                         if (cp > char.MaxValue)
 927                                 return;
 928                         if (IsIgnorable (cp))
 929                                 return;
 930
 931                         string name = values [0];
 932
 933                         // SPECIAL CASE: rename some characters for diacritical
 934                         // remapping. FIXME: why are they different?
 935                         // FIXME: it's still not working.
 936                         if (cp == 0x018B || cp == 0x018C)
 937                                 name = name.Replace ("TOPBAR", "STROKE");
 938
 939                         // isSmallCapital
 940                         if (s.IndexOf ("SMALL CAPITAL") > 0)
 941                                 isSmallCapital [cp] = true;
 942
 943                         // latin mapping by character name
 944                         if (s.IndexOf ("LATIN") >= 0) {
 945                                 int lidx = s.IndexOf ("LETTER DOTLESS ");
 946                                 int offset = lidx + 15;
 947                                 if (lidx < 0) {
 948                                         lidx = s.IndexOf ("LETTER TURNED ");
 949                                         offset = lidx + 14;
 950                                 }
 951                                 if (lidx < 0) {
 952                                         lidx = s.IndexOf ("LETTER CAPITAL ");
 953                                         offset = lidx + 15;
 954                                 }
 955                                 if (lidx < 0) {
 956                                         lidx = s.IndexOf ("LETTER SCRIPT ");
 957                                         offset = lidx + 14;
 958                                 }
 959                                 if (lidx < 0) {
 960                                         lidx = s.IndexOf ("LETTER ");
 961                                         offset = lidx + 7;
 962                                 }
 963                                 char c = lidx > 0 ? s [offset] : char.MinValue;
 964                                 char n = s [offset + 1];
 965                                 char target = char.MinValue;
 966                                 if ('A' <= c && c <= 'Z' &&
 967                                         (n == ' ') || n == ';') {
 968                                         target = c;
 969                                         // FIXME: After 'Z', I cannot reset this state.
 970                                         previousLatinTarget = c == 'Z' ? char.MinValue : c;
 971                                 }
 972
 973                                 if (s.Substring (offset).StartsWith ("ALPHA"))
 974                                         target = 'A';
 975                                 else if (s.Substring (offset).StartsWith ("TONE SIX"))
 976                                         target = 'B';
 977                                 else if (s.Substring (offset).StartsWith ("OPEN O"))
 978                                         target = 'C';
 979                                 else if (s.Substring (offset).StartsWith ("ETH"))
 980                                         target = 'D';
 981                                 else if (s.Substring (offset).StartsWith ("SCHWA"))
 982                                         target = 'E';
 983                                 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
 984                                         target = 'O';
 985                                 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
 986                                         target = 'R';
 987                                 else if (s.Substring (offset).StartsWith ("TONE TWO"))
 988                                         target = 'S';
 989                                 else if (s.Substring (offset).StartsWith ("ESH"))
 990                                         target = 'S';
 991                                 else if (s.Substring (offset).StartsWith ("OUNCE"))
 992                                         target = 'Z';
 993
 994                                 // For remaining IPA chars, direct mapping is
 995                                 // much faster.
 996                                 switch (cp) {
 997                                 case 0x0166: case 0x0167:
 998                                         // Though they are 'T', they have different weight
 999                                         target = char.MinValue; break;
1000                                 case 0x0299: target = 'B'; break;
1001                                 case 0x029A: target = 'E'; break;
1002                                 case 0x029B: target = 'G'; break;
1003                                 case 0x029C: target = 'H'; break;
1004                                 case 0x029D: target = 'J'; break;
1005                                 case 0x029E: target = 'K'; break;
1006                                 case 0x029F: target = 'L'; break;
1007                                 case 0x02A0: target = 'Q'; break;
1008                                 case 0x02A7: target = 'T'; break;
1009                                 case 0x02A8: target = 'T'; break;
1010                                 }
1011
1012                                 if (target == char.MinValue)
1013                                         target = previousLatinTarget;
1014
1015                                 if (target != char.MinValue) {
1016                                         ArrayList entry = (ArrayList) latinMap [target];
1017                                         if (entry == null) {
1018                                                 entry = new ArrayList ();
1019                                                 latinMap [target] = entry;
1020                                         }
1021                                         entry.Add (cp);
1022                                         // FIXME: This secondary weight is hack.
1023                                         // They are here because they must not
1024                                         // be identical to the corresponding
1025                                         // ASCII latins.
1026                                         if (c != target && diacritical [cp] == 0) {
1027                                                 diacriticalOffset [c - 'A']++;
1028                                                 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
1029                                         }
1030                                 }
1031                         }
1032
1033                         // Arrow names
1034                         if (0x2000 <= cp && cp < 0x3000) {
1035                                 int value = 0;
1036                                 // SPECIAL CASES. FIXME: why?
1037                                 switch (cp) {
1038                                 case 0x21C5: value = -1; break; // E2
1039                                 case 0x261D: value = 1; break;
1040                                 case 0x27A6: value = 3; break;
1041                                 case 0x21B0: value = 7; break;
1042                                 case 0x21B1: value = 3; break;
1043                                 case 0x21B2: value = 7; break;
1044                                 case 0x21B4: value = 5; break;
1045                                 case 0x21B5: value = 7; break;
1046                                 case 0x21B9: value = -1; break; // E1
1047                                 case 0x21CF: value = 7; break;
1048                                 case 0x21D0: value = 3; break;
1049                                 }
1050                                 string [] arrowTargets = new string [] {
1051                                         "",
1052                                         "UPWARDS",
1053                                         "NORTH EAST",
1054                                         "RIGHTWARDS",
1055                                         "SOUTH EAST",
1056                                         "DOWNWARDS",
1057                                         "SOUTH WEST",
1058                                         "LEFTWARDS",
1059                                         "NORTH WEST",
1060                                         "LEFT RIGHT",
1061                                         "UP DOWN",
1062                                         };
1063                                 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
1064                                         s.IndexOf ("LEFTWARDS") >= 0)
1065                                         value = 0xE1 - 0xD8;
1066                                 else if (s.IndexOf ("UPWARDS") >= 0 &&
1067                                         s.IndexOf ("DOWNWARDS") >= 0)
1068                                         value = 0xE2 - 0xD8;
1069                                 else if (s.IndexOf ("ARROW") >= 0 &&
1070                                         s.IndexOf ("COMBINING") < 0 &&
1071                                         s.IndexOf ("CLOCKWISE") >= 0)
1072                                         value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
1073                                 if (value == 0)
1074                                         for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
1075                                                 if (s.IndexOf (arrowTargets [i]) > 0 &&
1076                                                         s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
1077                                                         s.IndexOf (" OVER") < 0
1078                                                 )
1079                                                         value = i;
1080                                 if (value > 0)
1081                                         arrowValues.Add (new DictionaryEntry (
1082                                                 cp, value));
1083                         }
1084
1085                         // Box names
1086                         if (0x2500 <= cp && cp < 0x2600) {
1087                                 int value = int.MinValue;
1088                                 // flags:
1089                                 // up:1 down:2 right:4 left:8 vert:16 horiz:32
1090                                 // [h,rl] [r] [l]
1091                                 // [v,ud] [u] [d]
1092                                 // [dr] [dl] [ur] [ul]
1093                                 // [vr,udr] [vl,vdl]
1094                                 // [hd,rld] [hu,rlu]
1095                                 // [hv,udrl,rlv,udh]
1096                                 ArrayList flags = new ArrayList (new int [] {
1097                                         32, 8 + 4, 8, 4,
1098                                         16, 1 + 2, 1, 2,
1099                                         4 + 2, 8 + 2, 4 + 1, 8 + 1,
1100                                         16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
1101                                         32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
1102                                         16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
1103                                         });
1104                                 byte [] offsets = new byte [] {
1105                                         0, 0, 1, 2,
1106                                         3, 3, 4, 5,
1107                                         6, 7, 8, 9,
1108                                         10, 10, 11, 11,
1109                                         12, 12, 13, 13,
1110                                         14, 14, 14, 14};
1111                                 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
1112                                         int flag = 0;
1113                                         if (s.IndexOf (" UP") >= 0)
1114                                                 flag |= 1;
1115                                         if (s.IndexOf (" DOWN") >= 0)
1116                                                 flag |= 2;
1117                                         if (s.IndexOf (" RIGHT") >= 0)
1118                                                 flag |= 4;
1119                                         if (s.IndexOf (" LEFT") >= 0)
1120                                                 flag |= 8;
1121                                         if (s.IndexOf (" VERTICAL") >= 0)
1122                                                 flag |= 16;
1123                                         if (s.IndexOf (" HORIZONTAL") >= 0)
1124                                                 flag |= 32;
1125
1126                                         int fidx = flags.IndexOf (flag);
1127                                         if (fidx >= 0)
1128                                                 value = offsets [fidx];
1129                                 } else if (s.IndexOf ("BLOCK") >= 0) {
1130                                         if (s.IndexOf ("ONE EIGHTH") >= 0)
1131                                                 value = 0x12;
1132                                         else if (s.IndexOf ("ONE QUARTER") >= 0)
1133                                                 value = 0x13;
1134                                         else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1135                                                 value = 0x14;
1136                                         else if (s.IndexOf ("HALF") >= 0)
1137                                                 value = 0x15;
1138                                         else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1139                                                 value = 0x16;
1140                                         else if (s.IndexOf ("THREE QUARTERS") >= 0)
1141                                                 value = 0x17;
1142                                         else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1143                                                 value = 0x18;
1144                                         else
1145                                                 value = 0x19;
1146                                 }
1147                                 else if (s.IndexOf ("SHADE") >= 0)
1148                                         value = 0x19;
1149                                 else if (s.IndexOf ("SQUARE") >= 0)
1150                                         value = 0xBC - 0xE5;
1151                                 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1152                                         value = 0xBE - 0xE5;
1153                                 else if (s.IndexOf ("RECTANGLE") >= 0)
1154                                         value = 0xBD - 0xE5;
1155                                 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1156                                         value = 0xBF - 0xE5;
1157                                 else if (s.IndexOf ("TRIANGLE") >= 0) {
1158                                         if (s.IndexOf ("UP-POINTING") >= 0)
1159                                                 value = 0xC0 - 0xE5;
1160                                         else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1161                                                 value = 0xC1 - 0xE5;
1162                                         else if (s.IndexOf ("DOWN-POINTING") >= 0)
1163                                                 value = 0xC2 - 0xE5;
1164                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1165                                                 value = 0xC3 - 0xE5;
1166                                 }
1167                                 else if (s.IndexOf ("POINTER") >= 0) {
1168                                         if (s.IndexOf ("RIGHT-POINTING") >= 0)
1169                                                 value = 0xC4 - 0xE5;
1170                                         else if (s.IndexOf ("LEFT-POINTING") >= 0)
1171                                                 value = 0xC5 - 0xE5;
1172                                 }
1173                                 else if (s.IndexOf ("DIAMOND") >= 0)
1174                                         value = 0xC6 - 0xE5;
1175                                 else if (s.IndexOf ("FISHEYE") >= 0)
1176                                         value = 0xC7 - 0xE5;
1177                                 else if (s.IndexOf ("LOZENGE") >= 0)
1178                                         value = 0xC8 - 0xE5;
1179                                 else if (s.IndexOf ("BULLSEYE") >= 0)
1180                                         value = 0xC9 - 0xE5;
1181                                 else if (s.IndexOf ("CIRCLE") >= 0) {
1182                                         if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1183                                                 value = 0xCA - 0xE5;
1184                                         else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1185                                                 value = 0xCB - 0xE5;
1186                                         else
1187                                                 value = 0xC9 - 0xE5;
1188                                 }
1189                                 else if (s.IndexOf ("BULLET") >= 0)
1190                                         value = 0xCC - 0xE5;
1191                                 if (0x25DA <= cp && cp <= 0x25E5)
1192                                         value = 0xCD + cp - 0x25DA - 0xE5;
1193
1194                                 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1195                                 switch (cp) {
1196                                 case 0x2571: value = 0xF; break;
1197                                 case 0x2572: value = 0x10; break;
1198                                 case 0x2573: value = 0x11; break;
1199                                 }
1200                                 if (value != int.MinValue)
1201                                         boxValues.Add (new DictionaryEntry (
1202                                                 cp, value));
1203                         }
1204
1205                         // For some characters store the name and sort later
1206                         // to determine sorting.
1207                         if (0x2100 <= cp && cp <= 0x213F &&
1208                                 Char.IsSymbol ((char) cp))
1209                                 sortableCharNames.Add (
1210                                         new DictionaryEntry (cp, name));
1211                         else if (0x3380 <= cp && cp <= 0x33DD)
1212                                 sortableCharNames.Add (new DictionaryEntry (
1213                                         cp, name.Substring (7)));
1214
1215                         if (Char.GetUnicodeCategory ((char) cp) ==
1216                                 UnicodeCategory.MathSymbol) {
1217                                 if (name.StartsWith ("CIRCLED "))
1218                                         diacritical [cp] = 0xEE;
1219                                 if (name.StartsWith ("SQUARED "))
1220                                         diacritical [cp] = 0xEF;
1221                         }
1222
1223                         // diacritical weights by character name
1224 if (diacritics.Length != diacriticWeights.Length)
1225 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1226                         for (int d = diacritics.Length - 1; d >= 0; d--) {
1227                                 if (s.IndexOf (diacritics [d]) > 0) {
1228                                         diacritical [cp] += diacriticWeights [d];
1229                                         if (s.IndexOf ("COMBINING") >= 0)
1230                                                 diacritical [cp] -= (byte) 2;
1231                                         break;
1232                                 }
1233                                 // also process "COMBINING blah" here
1234                                 // For now it is limited to cp < 0x0370
1235 //                              if (cp < 0x0300 || cp >= 0x0370)
1236 //                                      continue;
1237                                 string tmp = diacritics [d].TrimEnd (';');
1238                                 if (tmp.IndexOf ("WITH ") == 0)
1239                                         tmp = tmp.Substring (4);
1240                                 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1241                                 if (name == tmp) {
1242                                         diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1243                                         break;
1244                                 }
1245 //if (name == tmp)
1246 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1247                         }
1248                         // Two-step grep required for it.
1249                         if (s.IndexOf ("FULL STOP") > 0 &&
1250                                 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1251                                 diacritical [cp] |= 0xF4;
1252                         if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1253                                 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1254                                         s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1255
1256                         // Arabic letter name
1257                         if (0x0621 <= cp && cp <= 0x064A &&
1258                                 Char.GetUnicodeCategory ((char) cp)
1259                                 == UnicodeCategory.OtherLetter) {
1260                                 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1261                                 switch (cp) {
1262                                 case 0x0621:
1263                                 case 0x0624:
1264                                 case 0x0626:
1265                                         // hamza, waw, yeh ... special cases.
1266                                         value = 0x07;
1267                                         break;
1268                                 case 0x0649:
1269                                 case 0x064A:
1270                                         value = 0x77; // special cases.
1271                                         break;
1272                                 default:
1273                                         // Get primary letter name i.e.
1274                                         // XXX part of ARABIC LETTER XXX yyy
1275                                         // e.g. that of "TEH MARBUTA" is "TEH".
1276                                         string letterName =
1277                                                 (cp == 0x0640) ?
1278                                                 // 0x0640 is special: it does
1279                                                 // not start with ARABIC LETTER
1280                                                 name :
1281                                                 name.Substring (14);
1282                                         int tmpIdx = letterName.IndexOf (' ');
1283                                         letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1284 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1285                                         if (arabicNameMap.ContainsKey (letterName))
1286                                                 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1287                                         else
1288                                                 arabicNameMap [letterName] = cp;
1289                                         break;
1290                                 }
1291                                 arabicLetterPrimaryValues [cp] = value;
1292                         }
1293
1294                         // Japanese square letter
1295                         if (0x3300 <= cp && cp <= 0x3357)
1296                                 if (!ExistsJIS (cp))
1297                                         nonJisJapanese.Add (new NonJISCharacter (cp, name));
1298
1299                         // normalizationType
1300                         string decomp = values [4];
1301                         idx = decomp.IndexOf ('<');
1302                         if (idx >= 0) {
1303                                 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1304                                 case "full":
1305                                         decompType [cp] = DecompositionFull;
1306                                         break;
1307                                 case "sub":
1308                                         decompType [cp] = DecompositionSub;
1309                                         break;
1310                                 case "super":
1311                                         decompType [cp] = DecompositionSuper;
1312                                         break;
1313                                 case "small":
1314                                         decompType [cp] = DecompositionSmall;
1315                                         break;
1316                                 case "isolated":
1317                                         decompType [cp] = DecompositionIsolated;
1318                                         break;
1319                                 case "initial":
1320                                         decompType [cp] = DecompositionInitial;
1321                                         break;
1322                                 case "final":
1323                                         decompType [cp] = DecompositionFinal;
1324                                         break;
1325                                 case "medial":
1326                                         decompType [cp] = DecompositionMedial;
1327                                         break;
1328                                 case "noBreak":
1329                                         decompType [cp] = DecompositionNoBreak;
1330                                         break;
1331                                 case "compat":
1332                                         decompType [cp] = DecompositionCompat;
1333                                         break;
1334                                 case "fraction":
1335                                         decompType [cp] = DecompositionFraction;
1336                                         break;
1337                                 case "font":
1338                                         decompType [cp] = DecompositionFont;
1339                                         break;
1340                                 case "circle":
1341                                         decompType [cp] = DecompositionCircle;
1342                                         break;
1343                                 case "square":
1344                                         decompType [cp] = DecompositionSquare;
1345                                         break;
1346                                 case "wide":
1347                                         decompType [cp] = DecompositionWide;
1348                                         break;
1349                                 case "narrow":
1350                                         decompType [cp] = DecompositionNarrow;
1351                                         break;
1352                                 case "vertical":
1353                                         decompType [cp] = DecompositionVertical;
1354                                         break;
1355                                 default:
1356                                         throw new Exception ("Support NFKD type : " + decomp);
1357                                 }
1358                         }
1359                         else
1360                                 decompType [cp] = DecompositionCanonical;
1361                         decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1362                         if (decomp.Length > 0) {
1363
1364                                 string [] velems = decomp.Split (' ');
1365                                 int didx = decompValues.Count;
1366                                 decompIndex [cp] = didx;
1367                                 foreach (string v in velems)
1368                                         decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1369                                 decompLength [cp] = velems.Length;
1370
1371                                 // [decmpType] -> this_cp
1372                                 int targetCP = (int) decompValues [didx];
1373                                 // for "(x)" it specially maps to 'x' .
1374                                 // FIXME: check if it is sane
1375                                 if (velems.Length == 3 &&
1376                                         (int) decompValues [didx] == '(' &&
1377                                         (int) decompValues [didx + 2] == ')')
1378                                         targetCP = (int) decompValues [didx + 1];
1379                                 // special: 0x215F "1/"
1380                                 else if (cp == 0x215F)
1381                                         targetCP = '1';
1382                                 else if (velems.Length > 1 &&
1383                                         (targetCP < 0x4C00 || 0x9FBB < targetCP))
1384                                         // skip them, except for CJK ideograph compat
1385                                         targetCP = 0;
1386
1387                                 if (targetCP != 0) {
1388                                         Hashtable entry = (Hashtable) nfkdMap [targetCP];
1389                                         if (entry == null) {
1390                                                 entry = new Hashtable ();
1391                                                 nfkdMap [targetCP] = entry;
1392                                         }
1393                                         entry [(byte) decompType [cp]] = cp;
1394                                 }
1395                         }
1396                         // numeric values
1397                         if (values [5].Length > 0)
1398                                 decimalValue [cp] = decimal.Parse (values [5]);
1399                         else if (values [6].Length > 0)
1400                                 decimalValue [cp] = decimal.Parse (values [6]);
1401                         else if (values [7].Length > 0) {
1402                                 string decstr = values [7];
1403                                 idx = decstr.IndexOf ('/');
1404                                 if (cp == 0x215F) // special. "1/"
1405                                         decimalValue [cp] = 0x1;
1406                                 else if (idx > 0)
1407                                         // m/n
1408                                         decimalValue [cp] =
1409                                                 decimal.Parse (decstr.Substring (0, idx))
1410                                                 / decimal.Parse (decstr.Substring (idx + 1));
1411                                 else if (decstr [0] == '(' &&
1412                                         decstr [decstr.Length - 1] == ')')
1413                                         // (n)
1414                                         decimalValue [cp] =
1415                                                 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1416                                 else if (decstr [decstr.Length - 1] == '.')
1417                                         // n.
1418                                         decimalValue [cp] =
1419                                                 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1420                                 else
1421                                         decimalValue [cp] = decimal.Parse (decstr);
1422                         }
1423                 }
1424
// Feeds every line of the file at `filename` to
// ProcessDerivedCorePropLine (which currently records only the
// "Uppercase" property). When a line cannot be read or processed,
// its 1-based number is written to stderr and the exception is
// rethrown unchanged.
void ParseDerivedCoreProperties (string filename)
{
        using (StreamReader reader = new StreamReader (filename)) {
                int lineNumber = 0;
                while (reader.Peek () >= 0) {
                        lineNumber++;
                        try {
                                string text = reader.ReadLine ();
                                ProcessDerivedCorePropLine (text);
                        } catch (Exception) {
                                // Report where parsing stopped, then let the
                                // original exception propagate.
                                Console.Error.WriteLine ("**** At line " + lineNumber);
                                throw;
                        }
                }
        }
}
1440
// Handles one line of a Unicode property data file of the form
//      CP ; Property # comment
//      CP1..CP2 ; Property # comment
// Lines without a ';' (blank or comment-only) are ignored.
// Only the "Uppercase" property is recorded (into isUppercase);
// every other property name is silently skipped.
// Throws FormatException (from int.Parse) when the code point
// field is not valid hexadecimal.
void ProcessDerivedCorePropLine (string s)
{
        // Strip the trailing comment, if any.
        int idx = s.IndexOf ('#');
        if (idx >= 0)
                s = s.Substring (0, idx);

        // The code point spec and the property name are separated by ';'.
        idx = s.IndexOf (';');
        if (idx < 0)
                return;
        string cpspec = s.Substring (0, idx);
        string value = s.Substring (idx + 1).Trim ();

        // The spec is either a single code point or a "first..last" range.
        int rangeSep = cpspec.IndexOf ("..");
        NumberStyles nf = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;
        int cp = int.Parse (rangeSep < 0 ? cpspec : cpspec.Substring (0, rangeSep), nf);
        int cpEnd = rangeSep < 0 ? cp : int.Parse (cpspec.Substring (rangeSep + 2), nf);

        // FIXME: use index
        // Entries that start outside the BMP are dropped.
        if (cp > char.MaxValue)
                return;
        // A range that starts inside the BMP but ends beyond it must
        // not run past U+FFFF either, so clamp its upper bound.
        if (cpEnd > char.MaxValue)
                cpEnd = char.MaxValue;

        switch (value) {
        case "Uppercase":
                for (int x = cp; x <= cpEnd; x++)
                        isUppercase [x] = true;
                break;
        }
}
1468
// Reads a Unicode script data file whose lines have the form
//      CP ; Script # comment
//      CP1..CP2 ; Script # comment
// and collects the non-ignorable characters of the Gurmukhi,
// Gujarati, Georgian and Thaana scripts. Each collection is then
// sorted with UCAComparer.Instance and stored in the matching
// ordered* array. Lines without a ';' and all other scripts are
// skipped.
void ParseScripts (string filename)
{
        ArrayList gurmukhi = new ArrayList ();
        ArrayList gujarati = new ArrayList ();
        ArrayList georgian = new ArrayList ();
        ArrayList thaana = new ArrayList ();

        // Loop-invariant, so computed once.
        NumberStyles nf = NumberStyles.HexNumber |
                NumberStyles.AllowTrailingWhite;

        using (StreamReader file =
                new StreamReader (filename)) {
                while (file.Peek () >= 0) {
                        string s = file.ReadLine ();

                        // Strip the trailing comment, if any.
                        int idx = s.IndexOf ('#');
                        if (idx >= 0)
                                s = s.Substring (0, idx);
                        idx = s.IndexOf (';');
                        if (idx < 0)
                                continue;

                        string cpspec = s.Substring (0, idx);
                        string value = s.Substring (idx + 1).Trim ();

                        // Single code point or "first..last" range.
                        idx = cpspec.IndexOf ("..");
                        int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
                        int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);

                        // FIXME: use index
                        // Entries that start outside the BMP are dropped.
                        if (cp > char.MaxValue)
                                continue;
                        // Clamp ranges that end beyond the BMP; otherwise
                        // the (char) cast below would silently wrap around
                        // and add unrelated characters.
                        if (cpEnd > char.MaxValue)
                                cpEnd = char.MaxValue;

                        // Pick the list for this script; skip other scripts.
                        ArrayList target;
                        switch (value) {
                        case "Gurmukhi":
                                target = gurmukhi;
                                break;
                        case "Gujarati":
                                target = gujarati;
                                break;
                        case "Georgian":
                                target = georgian;
                                break;
                        case "Thaana":
                                target = thaana;
                                break;
                        default:
                                continue;
                        }

                        for (int x = cp; x <= cpEnd; x++)
                                if (!IsIgnorable (x))
                                        target.Add ((char) x);
                }
        }

        gurmukhi.Sort (UCAComparer.Instance);
        gujarati.Sort (UCAComparer.Instance);
        georgian.Sort (UCAComparer.Instance);
        thaana.Sort (UCAComparer.Instance);
        orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
        orderedGujarati = (char []) gujarati.ToArray (typeof (char));
        orderedGeorgian = (char []) georgian.ToArray (typeof (char));
        orderedThaana = (char []) thaana.ToArray (typeof (char));
}
1532
// Passes every line of the file at `filename` to
// ProcessJISOrderLine. Any exception raised while opening, reading
// or processing the file is rethrown after the current 1-based line
// number has been written to stderr.
void ParseJISOrder (string filename)
{
        // Number of the line about to be processed.
        int current = 1;
        try {
                using (StreamReader reader =
                        new StreamReader (filename)) {
                        while (reader.Peek () >= 0) {
                                ProcessJISOrderLine (reader.ReadLine ());
                                current++;
                        }
                }
        } catch (Exception) {
                Console.Error.WriteLine ("---- line {0}", current);
                throw;
        }
}
1547
// Characters accepted as the separator between the two columns of
// a JIS order line.
char [] ws = new char [] {'\t', ' '};

// Handles one line of the JIS order file. A data line consists of
// two "0x"-prefixed hexadecimal columns separated by tab or space:
// the JIS code followed by the code point. Everything after '#' is
// a comment. Blank lines, comment-only lines and lines with a
// single column are ignored; each data line is appended to
// jisJapanese as a JISCharacter.
// Throws FormatException (from int.Parse) when a column is not
// valid hexadecimal.
void ProcessJISOrderLine (string s)
{
        int idx = s.IndexOf ('#');
        if (idx >= 0)
                s = s.Substring (0, idx);
        // Trim whether or not a comment was present. Otherwise a
        // whitespace-only or indented line would make IndexOfAny
        // return 0 and Substring (2, -2) below would throw.
        s = s.Trim ();
        if (s.Length == 0)
                return;
        idx = s.IndexOfAny (ws);
        if (idx < 0)
                return;
        // They start with "0x" so cut them out.
        int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
        int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
        jisJapanese.Add (new JISCharacter (cp, jis));
}
1565
1566                 void ParseCJK (string zhXML, string jaXML, string koXML)
1567                 {
1568                         XmlDocument doc = new XmlDocument ();
1569                         doc.XmlResolver = null;
1570                         int v;
1571                         string s;
1572                         string category;
1573                         int offset;
1574                         ushort [] arr;
1575
1576                         // Chinese Simplified
1577                         category = "chs";
1578                         arr = cjkCHS;
1579                         offset = 0;//char.MaxValue - arr.Length;
1580                         doc.Load (zhXML);
1581                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1582                         v = 0x8008;
1583                         foreach (char c in s) {
1584                                 if (c < '\u3100')
1585                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1586                                 else {
1587                                         arr [(int) c - offset] = (ushort) v++;
1588                                         if (v % 256 == 0)
1589                                                 v += 2;
1590                                 }
1591                         }
1592
1593                         // Chinese Traditional
1594                         category = "cht";
1595                         arr = cjkCHT;
1596                         offset = 0;//char.MaxValue - arr.Length;
1597                         s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1598                         v = 0x8002;
1599                         foreach (char c in s) {
1600                                 if (c < '\u4E00')
1601                                         Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1602                                 else {
1603                                         arr [(int) c - offset] = (ushort) v++;
1604                                         if (v % 256 == 0)
1605                                                 v += 2;
1606                                 }
1607                         }
1608
1609                         // Japanese
1610                         category = "ja";
1611                         arr = cjkJA;
1612                         offset = 0;//char.MaxValue - arr.Length;
1613
1614                         // SPECIAL CASES
1615                         arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1616                         arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1617                         arr [0x337E] = 0x8005;
1618                         arr [0x337D] = 0x8006;
1619                         arr [0x337C] = 0x8007;
1620
1621                         v = 0x8008;
1622                         foreach (JISCharacter jc in jisJapanese) {
1623                                 if (jc.JIS < 0x8800)
1624                                         continue;
1625                                 char c = (char) jc.CP;
1626
1627                                 if (c < '\u4E00')
1628                                         // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1629                                         continue;
1630                                 else {
1631                                         arr [(int) c - offset] = (ushort) v++;
1632                                         if (v % 256 == 0)
1633                                                 v += 2;
1634
1635                                         // SPECIAL CASES:
1636                                         if (c == '\u662D') // U+337C
1637                                                 continue;
1638                                         if (c == '\u5927') // U+337D
1639                                                 continue;
1640                                         if (c == '\u5E73') // U+337B
1641                                                 continue;
1642                                         if (c == '\u660E') // U+337E
1643                                                 continue;
1644                                         if (c == '\u9686') // U+F9DC
1645                                                 continue;
1646
1647                                         // FIXME: there are still remaining
1648                                         // characters after U+FA0C.
1649 //                                      for (int k = 0; k < char.MaxValue; k++) {
1650                                         for (int k = 0; k < '\uFA0D'; k++) {
1651                                                 if (decompIndex [k] == 0 || IsIgnorable (k))
1652                                                         continue;
1653                                                 if (decompValues [decompIndex [k]] == c /*&&
1654                                                         decompLength [k] == 1*/ ||
1655                                                         decompLength [k] == 3 &&
1656                                                         decompValues [decompIndex [k] + 1] == c) {
1657                                                         arr [k - offset] = (ushort) v++;
1658                                                         if (v % 256 == 0)
1659                                                                 v += 2;
1660                                                 }
1661                                         }
1662                                 }
1663                         }
1664
1665                         // Korean
1666                         // Korean weight is somewhat complex. It first shifts
1667                         // Hangul category from 52-x to 80-x (they are anyways
1668                         // computed). CJK ideographs are placed at secondary
1669                         // weight, like XX YY 01 zz 01, where XX and YY are
1670                         // corresponding "reset" value and zz is 41,43,45...
1671                         //
1672                         // Unlike chs,cht and ja, Korean value is a combined
1673                         // ushort which is computed as category
1674                         //
1675                         category = "ko";
1676                         arr = cjkKO;
1677                         offset = 0;//char.MaxValue - arr.Length;
1678                         doc.Load (koXML);
1679                         foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1680                                 XmlElement sc = (XmlElement) reset.NextSibling;
1681                                 // compute "category" and "level 1" for the
1682                                 // target "reset" Hangul syllable
1683                                 char rc = reset.InnerText [0];
1684                                 int ri = ((int) rc - 0xAC00) + 1;
1685                                 ushort p = (ushort)
1686                                         ((ri / 254) * 256 + (ri % 254) + 2);
1687                                 // Place the characters after the target.
1688                                 s = sc.InnerText;
1689                                 v = 0x41;
1690                                 foreach (char c in s) {
1691                                         arr [(int) c - offset] = p;
1692                                         cjkKOlv2 [(int) c - offset] = (byte) v;
1693                                         v += 2;
1694                                 }
1695                         }
1696                 }
1697
1698                 #endregion
1699
1700                 #region Generation
1701
1702                 void FillIgnorables ()
1703                 {
                             // Populates ignorableFlags for every UTF-16 code
                             // unit (0 through char.MaxValue inclusive).
                             // Entries whose Unicode category is
                             // OtherNotAssigned are skipped and left as they
                             // were. For the rest, the following bits are
                             // OR-ed in according to the helper predicates:
                             //   1 : IsIgnorable (i)
                             //   2 : IsIgnorableSymbol (i)
                             //   4 : IsIgnorableNonSpacing (i)
                             // The three checks are independent of each other,
                             // so a single entry may end up with several bits.
1704                         for (int i = 0; i <= char.MaxValue; i++) {
1705                                 if (Char.GetUnicodeCategory ((char) i) ==
1706                                         UnicodeCategory.OtherNotAssigned)
1707                                         continue;
1708                                 if (IsIgnorable (i))
1709                                         ignorableFlags [i] |= 1;
1710                                 if (IsIgnorableSymbol (i))
1711                                         ignorableFlags [i] |= 2;
1712                                 if (IsIgnorableNonSpacing (i))
1713                                         ignorableFlags [i] |= 4;
1714                         }
1715                 }
1716
1717                 void ModifyUnidata ()
1718                 {
                             // Applies hard-coded overrides to the parsed
                             // Unicode data tables (isUppercase, decompType,
                             // decompIndex, decompLength, diacritical and
                             // decompValues).
                             //
                             // The local decompValues below is a growable copy
                             // that shadows the this.decompValues field for the
                             // rest of the method; it is converted back to
                             // int[] and stored in the field on the last line.
1719                         ArrayList decompValues = new ArrayList (this.decompValues);
1720
1721                         // Hebrew uppercase letters.
1722                         foreach (int i in new int []
1723                                 {0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6})
1724                                 isUppercase [i] = true;
1725
1726
1727                         // Modify some decomposition equivalence:
                             // type, index and length are all reset to 0 for
                             // U+FE31..U+FE34 and for U+037E.
                             // NOTE(review): a zero decompIndex appears to mean
                             // "no decomposition" (the CJK fill loop earlier in
                             // this file skips such entries) - confirm.
1728                         for (int i = 0xFE31; i <= 0xFE34; i++) {
1729                                 decompType [i] = 0;
1730                                 decompIndex [i] = 0;
1731                                 decompLength [i] = 0;
1732                         }
1733                         decompType [0x037E] = 0;
1734                         decompIndex [0x037E] = 0;
1735                         decompLength [0x037E] = 0;
1736
                             // Fixed diacritical weights for a few ranges
                             // (both bounds inclusive).
1737                         // Hangzhou numbers
1738                         for (int i = 0x3021; i <= 0x3029; i++)
1739                                 diacritical [i] = 0x4E;
1740                         // Korean parens numbers
1741                         for (int i = 0x3200; i <= 0x321C; i++)
1742                                 diacritical [i] = 0xA;
1743                         for (int i = 0x3260; i <= 0x327B; i++)
1744                                 diacritical [i] = 0xC;
1745
1746                         // LAMESPEC: these remappings should not be done.
1747                         // Windows has incorrect CJK compat mappings.
                             // Each line overwrites the first decomposition
                             // value of an existing entry; for U+323B and
                             // U+3238 the decomposition length is also forced
                             // to 1.
1748                         decompValues [decompIndex [0x32A9]] = 0x91AB;
1749                         decompLength [0x323B] = 1;
1750                         decompValues [decompIndex [0x323B]] = 0x5B78;
1751                         decompValues [decompIndex [0x32AB]] = 0x5B78;
1752                         decompValues [decompIndex [0x32A2]] = 0x5BEB;
1753                         decompLength [0x3238] = 1;
1754                         decompValues [decompIndex [0x3238]] = 0x52DE;
1755                         decompValues [decompIndex [0x3298]] = 0x52DE;
1756
1757                         // LAMESPEC: custom remapping (these are not bugs, but they are non-standard)
                             // U+FA0C gets a new single-value decomposition:
                             // Count is read before Add (), so the stored index
                             // is the slot of the value appended right after.
1758                         decompIndex [0xFA0C] = decompValues.Count;
1759                         decompValues.Add ((int) 0x5140);
1760                         decompLength [0xFA0C] = 1;
                             // U+F929: index and length are both reset to 0.
1761                         decompIndex [0xF929] = decompLength [0xF929] = 0;
1762
                             // U+F92C: replace the first decomposition value.
1763                         decompValues [decompIndex [0xF92C]] = 0x90DE;
1764
                             // U+2125 gets a new single-value decomposition to
                             // U+005A, appended the same way as for U+FA0C, and
                             // its type is set to DecompositionFont (a constant
                             // defined outside this method).
1765                         decompIndex [0x2125] = decompValues.Count;
1766                         decompValues.Add ((int) 0x005A);
1767                         decompLength [0x2125] = 1;
1768                         decompType [0x2125] = DecompositionFont;
1769
                             // Write the (possibly grown) list back to the field.
1770                         this.decompValues = decompValues.ToArray (typeof (int)) as int [];
1771                 }
1772
1773                 void ModifyParsedValues ()
1774                 {
                             // Applies hard-coded corrections to values that
                             // were parsed earlier: it overwrites entries of
                             // the diacritical[] weight table for specific code
                             // points, then renames or removes some entries of
                             // sortableCharNames. Statements run in order, so a
                             // later assignment to the same index wins.
                             //
1775                         // Sometimes STROKE doesn't work well
1776                         diacritical [0xD8] = diacritical [0xF8] = 0x21;
1777                         diacritical [0x141] = diacritical [0x142] = 0x1F;
1778                         // FIXME: why?
1779                         diacritical [0xAA] = diacritical [0xBA] = 3;
1780                         diacritical [0xD0] = diacritical [0xF0] = 0x68;
1781                         diacritical [0x131] = 3;
1782                         diacritical [0x138] = 3;
1783                         // TOPBAR does not work as an identifier for the weight
1784                         diacritical [0x182] = diacritical [0x183] = 0x68; // B
1785                         diacritical [0x18B] = diacritical [0x18C] = 0x1E; // D
1786                         // TONE TWO
1787                         diacritical [0x1A7] = diacritical [0x1A8] = 0x87;
1788                         // TONE SIX
1789                         diacritical [0x184] = diacritical [0x185] = 0x87;
1790                         // OPEN E
1791                         diacritical [0x190] = diacritical [0x25B] = 0x7B;
1792                         // There are many letters w/ diacritical weight 0x7B
1793                         diacritical [0x0192] = diacritical [0x0194] =
1794                         diacritical [0x0195] = diacritical [0x0196] =
1795                         diacritical [0x019C] = diacritical [0x019E] =
1796                         diacritical [0x01A6] = diacritical [0x01B1] =
1797                         diacritical [0x01B2] = diacritical [0x01BF] = 0x7B;
1798                         // ... as well as 0x7C
1799                         diacritical [0x01A2] = diacritical [0x01A3] = 0x7C;
1800
1801                         // <font> NFKD characters seem to have diacritical
1802                         // weights 3,4,5... but the order does not appear to
1803                         // follow code points and it is unknown how they are sorted.
1804                         diacritical [0x210E] = 3;
1805                         diacritical [0x210F] = 0x68;
1806                         diacritical [0x2110] = 4;
1807                         diacritical [0x2111] = 5;
1808                         diacritical [0x2112] = 4;
1809                         diacritical [0x2113] = 4;
1810                         diacritical [0x211B] = 4;
1811                         diacritical [0x211C] = 5;
1812
1813                         // Some Cyrillic diacritical weights. They seem to be
1814                         // based on old character names, so it's quicker to
1815                         // set them directly here.
1816                         // FIXME: the reason for most of these values is unknown
1817                         diacritical [0x0496] = diacritical [0x0497] = 7;
1818                         diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1819                         diacritical [0x049A] = diacritical [0x049B] = 0x17;
1820                         diacritical [0x049C] = diacritical [0x049D] = 9;
1821                         diacritical [0x049E] = diacritical [0x049F] = 4;
1822                         diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1823                         diacritical [0x04A2] = diacritical [0x04A3] = 7;
1824                         diacritical [0x04A4] = diacritical [0x04A5] = 8;
1825                         diacritical [0x04AA] = diacritical [0x04AB] = 0x1A; // ES CEDILLA?
1826                         diacritical [0x04AC] = diacritical [0x04AD] = 7; // RIGHT DESCENDER? but U+4B2
1827                         diacritical [0x04AE] = diacritical [0x04AF] = 0xB; // STRAIGHT U?
1828                         diacritical [0x04B2] = diacritical [0x04B3] = 0x17; // RIGHT DESCENDER? but U+4AC
1829                         diacritical [0x04B4] = diacritical [0x04B5] = 3;
1830                         diacritical [0x04B6] = 8;
1831                         diacritical [0x04B7] = 7;
1832                         diacritical [0x04B8] = diacritical [0x04B9] = 9;
1833                         diacritical [0x04BA] = diacritical [0x04BB] = 9;
1834
1835                         // number, secondary weights
                             // numberSecondaryWeightBounds is consumed as
                             // consecutive (start, end) pairs with an exclusive
                             // end. The first pair gets weight 0x38 and each
                             // following pair gets the next value. Inside a
                             // range only code points for which Char.IsNumber ()
                             // is true are assigned.
                             // NOTE(review): numarr [i + 1] assumes the array
                             // has an even length - confirm at its definition.
1836                         byte weight = 0x38;
1837                         int [] numarr = numberSecondaryWeightBounds;
1838                         for (int i = 0; i < numarr.Length; i += 2, weight++)
1839                                 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1840                                         if (Char.IsNumber ((char) cp))
1841                                                 diacritical [cp] = weight;
1842
                             // The two loops below use an exclusive upper bound
                             // and assign every code point in the range
                             // unconditionally.
1843                         // Gurmukhi special letters' diacritical weight
1844                         for (int i = 0x0A50; i < 0x0A60; i++)
1845                                 diacritical [i] = 4;
1846                         // Oriya special letters' diacritical weight
1847                         for (int i = 0x0B5C; i < 0x0B60; i++)
1848                                 diacritical [i] = 6;
1849
1850                         // Update name part of named characters
                             // Each element is unboxed as a DictionaryEntry
                             // whose Key is unboxed as the int code point.
                             // Listed code points are either given a replacement
                             // name or removed from the list; all other entries
                             // are left unchanged.
1851                         for (int i = 0; i < sortableCharNames.Count; i++) {
1852                                 DictionaryEntry de =
1853                                         (DictionaryEntry) sortableCharNames [i];
1854                                 int cp = (int) de.Key;
1855                                 string renamed = null;
1856                                 switch (cp) {
1857                                 case 0x2101: renamed = "A_1"; break;
1858                                 case 0x33C3: renamed = "A_2"; break;
1859                                 case 0x2105: renamed = "C_1"; break;
1860                                 case 0x2106: renamed = "C_2"; break;
1861                                 case 0x211E: renamed = "R1"; break;
1862                                 case 0x211F: renamed = "R2"; break;
1863                                 // Remove some of them!
                                     // After RemoveAt () the following element
                                     // moves into slot i, so i is decremented to
                                     // revisit that slot; "continue" resumes the
                                     // enclosing for loop, not the switch.
1864                                 case 0x2103:
1865                                 case 0x2109:
1866                                 case 0x2116:
1867                                 case 0x2117:
1868                                 case 0x2118:
1869                                 case 0x2125:
1870                                 case 0x2127:
1871                                 case 0x2129:
1872                                 case 0x212E:
1873                                 case 0x2132:
1874                                         sortableCharNames.RemoveAt (i);
1875                                         i--;
1876                                         continue;
1877                                 }
                                     // Replace the entry in place, keeping the
                                     // same key but with the new name as value.
1878                                 if (renamed != null)
1879                                         sortableCharNames [i] =
1880                                                 new DictionaryEntry (cp, renamed);
1881                         }
1882                 }
1883
1884                 void GenerateCore ()
1885                 {
1886                         UnicodeCategory uc;
1887
1888                         #region Specially ignored // 01
1889                         // This will raise "Defined" flag up.
1890                         // FIXME: Check If it is really fine. Actually for
1891                         // Japanese voice marks this code does remapping.
1892                         foreach (char c in specialIgnore)
1893                                 map [(int) c] = new CharMapEntry (0, 0, 0);
1894                         #endregion
1895
1896                         #region Extenders (FF FF)
1897                         fillIndex [0xFF] = 0xFF;
1898                         char [] specialBiggest = new char [] {
1899                                 '\u3005', '\u3031', '\u3032', '\u309D',
1900                                 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1901                                 '\uFE7C', '\uFE7D', '\uFF70'};
1902                         foreach (char c in specialBiggest)
1903                                 AddCharMap (c, 0xFF, 0);
1904                         #endregion
1905
1906                         #region Variable weights
1907                         // Controls : 06 03 - 06 3D
1908                         fillIndex [0x6] = 3;
1909                         for (int i = 0; i < 65536; i++) {
1910                                 if (IsIgnorable (i))
1911                                         continue;
1912                                 char c = (char) i;
1913                                 uc = Char.GetUnicodeCategory (c);
1914                                 // NEL is whitespace but not ignored here.
1915                                 if (uc == UnicodeCategory.Control &&
1916                                         !Char.IsWhiteSpace (c) || c == '\u0085')
1917                                         AddCharMap (c, 6, 1);
1918                         }
1919
1920                         // Apostrophe 06 80
1921                         fillIndex [0x6] = 0x80;
1922                         AddCharMap ('\'', 6, 0);
1923                         AddCharMap ('\uFF07', 6, 1);
1924                         AddCharMap ('\uFE63', 6, 1);
1925
1926                         // SPECIAL CASE: fill FE32 here in prior to be added
1927                         // at 2013. Windows does not always respect NFKD.
1928                         map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1929
1930                         // Hyphen/Dash : 06 81 - 06 90
1931                         for (int i = 0; i < char.MaxValue; i++) {
1932                                 if (!IsIgnorable (i) &&
1933                                         Char.GetUnicodeCategory ((char) i) ==
1934                                         UnicodeCategory.DashPunctuation) {
1935                                         AddCharMapGroup2 ((char) i, 6, 1, 0);
1936                                         if (i == 0x2011) {
1937                                                 // SPECIAL: add 2027 and 2043
1938                                                 // Maybe they are regarded the
1939                                                 // same hyphens in "central"
1940                                                 // position.
1941                                                 AddCharMap ('\u2027', 6, 1);
1942                                                 AddCharMap ('\u2043', 6, 1);
1943                                         }
1944                                 }
1945                         }
1946                         // They are regarded as primarily equivalent to '-'
1947                         map [0x208B] = new CharMapEntry (6, 0x82, 0);
1948                         map [0x207B] = new CharMapEntry (6, 0x82, 0);
1949                         map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1950
1951                         // Arabic variable weight chars 06 A0 -
1952                         fillIndex [6] = 0xA0;
1953                         // vowels
1954                         for (int i = 0x64B; i <= 0x650; i++)
1955                                 AddArabicCharMap ((char) i, 6, 1, 0);
1956                         // sukun
1957                         AddCharMapGroup ('\u0652', 6, 1, 0);
1958                         // shadda
1959                         AddCharMapGroup ('\u0651', 6, 1, 0);
1960                         #endregion
1961
1962
1963                         #region Nonspacing marks // 01
1964                         // FIXME: 01 03 - 01 B6 ... annoyance :(
1965
1966                         // Combining diacritical marks: 01 DC -
1967
1968                         fillIndex [0x1] = 0x41;
1969                         for (int i = 0x030E; i <= 0x0326; i++)
1970                                 if (!IsIgnorable (i))
1971                                         AddCharMap ((char) i, 0x1, 1);
1972                         for (int i = 0x0329; i <= 0x0334; i++)
1973                                 if (!IsIgnorable (i))
1974                                         AddCharMap ((char) i, 0x1, 1);
1975                         fillIndex [0x1]++;
1976                         for (int i = 0x0339; i <= 0x0341; i++)
1977                                 if (!IsIgnorable (i))
1978                                         AddCharMap ((char) i, 0x1, 1);
1979                         fillIndex [0x1] = 0x74;
1980                         for (int i = 0x0346; i <= 0x0348; i++)
1981                                 if (!IsIgnorable (i))
1982                                         AddCharMap ((char) i, 0x1, 1);
1983                         for (int i = 0x02BE; i <= 0x02BF; i++)
1984                                 if (!IsIgnorable (i))
1985                                         AddCharMap ((char) i, 0x1, 1);
1986                         for (int i = 0x02C1; i <= 0x02C5; i++)
1987                                 if (!IsIgnorable (i))
1988                                         AddCharMap ((char) i, 0x1, 1);
1989                         for (int i = 0x02CE; i <= 0x02CF; i++)
1990                                 if (!IsIgnorable (i))
1991                                         AddCharMap ((char) i, 0x1, 1);
1992                         fillIndex [0x1]++;
1993                         for (int i = 0x02D1; i <= 0x02D3; i++)
1994                                 if (!IsIgnorable (i))
1995                                         AddCharMap ((char) i, 0x1, 1);
1996                         AddCharMap ('\u02DE', 0x1, 1);
1997                         for (int i = 0x02E4; i <= 0x02E9; i++)
1998                                 if (!IsIgnorable (i))
1999                                         AddCharMap ((char) i, 0x1, 1);
2000
2001
2002                         // FIXME: needs more love here (it should eliminate
2003                         // all the hacky code above).
2004                         for (int i = 0x0300; i < 0x0370; i++)
2005                                 if (!IsIgnorable (i) && diacritical [i] != 0
2006                                         && !map [i].Defined)
2007                                         map [i] = new CharMapEntry (
2008                                                 0x1, 0x1, diacritical [i]);
2009
2010                         // Cyrillic and Armenian nonspacing mark
2011                         fillIndex [0x1] = 0x94;
2012                         for (int i = 0x400; i < 0x580; i++)
2013                                 if (!IsIgnorable (i) &&
2014                                         Char.GetUnicodeCategory ((char) i) ==
2015                                         UnicodeCategory.NonSpacingMark)
2016                                         AddCharMap ((char) i, 1, 1);
2017
2018                         fillIndex [0x1] = 0x8D;
2019                         // syriac dotted nonspacing marks (1)
2020                         AddCharMap ('\u0740', 0x1, 1);
2021                         AddCharMap ('\u0741', 0x1, 1);
2022                         AddCharMap ('\u0742', 0x1, 1);
2023                         // syriac oblique nonspacing marks
2024                         AddCharMap ('\u0747', 0x1, 1);
2025                         AddCharMap ('\u0748', 0x1, 1);
2026                         // syriac dotted nonspacing marks (2)
2027                         fillIndex [0x1] = 0x94; // this reset is mandatory
2028                         AddCharMap ('\u0732', 0x1, 1);
2029                         AddCharMap ('\u0735', 0x1, 1);
2030                         AddCharMap ('\u0738', 0x1, 1);
2031                         AddCharMap ('\u0739', 0x1, 1);
2032                         AddCharMap ('\u073C', 0x1, 1);
2033                         // SPECIAL CASES: superscripts
2034                         AddCharMap ('\u073F', 0x1, 1);
2035                         AddCharMap ('\u0711', 0x1, 1);
2036                         // syriac "DOTS"
2037                         for (int i = 0x0743; i <= 0x0746; i++)
2038                                 AddCharMap ((char) i, 0x1, 1);
2039                         for (int i = 0x0730; i <= 0x0780; i++)
2040                                 if (!map [i].Defined &&
2041                                         Char.GetUnicodeCategory ((char) i) ==
2042                                         UnicodeCategory.NonSpacingMark)
2043                                         AddCharMap ((char) i, 0x1, 1);
2044
2045                         // LAMESPEC: It should not stop at '\u20E1'. There are
2046                         // a few more characters (that however results in
2047                         // overflow of level 2 unless we start before 0xDD).
2048                         fillIndex [0x1] = 0xDD;
2049                         for (int i = 0x20D0; i <= 0x20DC; i++)
2050                                 AddCharMap ((char) i, 0x1, 1);
2051                         fillIndex [0x1] = 0xEC;
2052                         for (int i = 0x20DD; i <= 0x20E1; i++)
2053                                 AddCharMap ((char) i, 0x1, 1);
2054                         fillIndex [0x1] = 0x4;
2055                         AddCharMap ('\u0CD5', 0x1, 1);
2056                         AddCharMap ('\u0CD6', 0x1, 1);
2057                         AddCharMap ('\u093C', 0x1, 1);
2058                         for (int i = 0x302A; i <= 0x302D; i++)
2059                                 AddCharMap ((char) i, 0x1, 1);
2060                         AddCharMap ('\u0C55', 0x1, 1);
2061                         AddCharMap ('\u0C56', 0x1, 1);
2062
2063                         fillIndex [0x1] = 0x50; // I wonder how they are sorted
2064                         for (int i = 0x02D4; i <= 0x02D7; i++)
2065                                 AddCharMap ((char) i, 0x1, 1);
2066
2067                         // They are not part of Nonspacing marks, but have
2068                         // only diacritical weight.
2069                         for (int i = 0x3099; i <= 0x309C; i++)
2070                                 map [i] = new CharMapEntry (1, 1, 1);
2071                         map [0xFF9E] = new CharMapEntry (1, 1, 1);
2072                         map [0xFF9F] = new CharMapEntry (1, 1, 2);
2073                         map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
2074                         map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
2075                         for (int i = 0x30FC; i <= 0x30FE; i++)
2076                                 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
2077
2078                         fillIndex [0x1] = 0xA;
2079                         for (int i = 0x0951; i <= 0x0954; i++)
2080                                 AddCharMap ((char) i, 0x1, 2);
2081
2082                         #endregion
2083
2084
2085                         #region Whitespaces // 07 03 -
2086                         fillIndex [0x7] = 0x2;
2087                         AddCharMap (' ', 0x7, 2);
2088                         AddCharMap ('\u00A0', 0x7, 1);
2089                         for (int i = 9; i <= 0xD; i++)
2090                                 AddCharMap ((char) i, 0x7, 1);
2091                         for (int i = 0x2000; i <= 0x200B; i++)
2092                                 AddCharMap ((char) i, 0x7, 1);
2093
2094                         fillIndex [0x7] = 0x17;
2095                         AddCharMapGroup ('\u2028', 0x7, 1, 0);
2096                         AddCharMapGroup ('\u2029', 0x7, 1, 0);
2097
2098                         // Characters which used to represent layout control.
2099                         // LAMESPEC: Windows developers seem to have thought
2100                         // that those characters are kind of whitespaces,
2101                         // while they aren't.
2102                         AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
2103                         AddCharMap ('\u2423', 0x7, 1, 0); // open box
2104
2105                         #endregion
2106
2107                         // category 09 - continued symbols from 08
2108                         fillIndex [0x9] = 2;
2109                         // misc tech mark
2110                         for (int cp = 0x2300; cp <= 0x237A; cp++)
2111                                 AddCharMap ((char) cp, 0x9, 1, 0);
2112
2113                         // arrows
2114                         byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
2115                         foreach (DictionaryEntry de in arrowValues) {
2116                                 int idx = (int) de.Value;
2117                                 int cp = (int) de.Key;
2118                                 if (map [cp].Defined)
2119                                         continue;
2120                                 fillIndex [0x9] = (byte) (0xD8 + idx);
2121                                 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
2122                                 arrowLv2 [idx]++;
2123                         }
2124                         // boxes
2125                         byte [] boxLv2 = new byte [128];
2126                         // 0-63 will be used for those offsets are positive,
2127                         // and 64-127 are for negative ones.
2128                         for (int i = 0; i < boxLv2.Length; i++)
2129                                 boxLv2 [i] = 3;
2130                         foreach (DictionaryEntry de in boxValues) {
2131                                 int cp = (int) de.Key;
2132                                 int off = (int) de.Value;
2133                                 if (map [cp].Defined)
2134                                         continue;
2135                                 if (off < 0) {
2136                                         fillIndex [0x9] = (byte) (0xE5 + off);
2137                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
2138                                 }
2139                                 else {
2140                                         fillIndex [0x9] = (byte) (0xE5 + off);
2141                                         AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
2142                                 }
2143                         }
2144                         // Some special characters (slanted)
2145                         fillIndex [0x9] = 0xF4;
2146                         AddCharMap ('\u2571', 0x9, 3);
2147                         AddCharMap ('\u2572', 0x9, 3);
2148                         AddCharMap ('\u2573', 0x9, 3);
2149
2150                         // FIXME: implement 0A
2151                         #region Symbols
2152                         fillIndex [0xA] = 2;
2153                         // byte currency symbols
2154                         for (int cp = 0; cp < 0x100; cp++) {
2155                                 uc = Char.GetUnicodeCategory ((char) cp);
2156                                 if (!IsIgnorable (cp) &&
2157                                         uc == UnicodeCategory.CurrencySymbol &&
2158                                         cp != '$')
2159                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
2160                         }
2161                         // byte other symbols
2162                         for (int cp = 0; cp < 0x100; cp++) {
2163                                 if (cp == 0xA6)
2164                                         continue; // SPECIAL: skip FIXME: why?
2165                                 uc = Char.GetUnicodeCategory ((char) cp);
2166                                 if (!IsIgnorable (cp) &&
2167                                         uc == UnicodeCategory.OtherSymbol ||
2168                                         cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
2169                                         AddCharMapGroup ((char) cp, 0xA, 1, 0);
2170                         }
2171                         // U+30FB here
2172                         AddCharMapGroup ('\u30FB', 0xA, 1, 0);
2173
2174                         for (int cp = 0x2020; cp <= 0x2031; cp++)
2175                                 if (Char.IsPunctuation ((char) cp))
2176                                         AddCharMap ((char) cp, 0xA, 1, 0);
2177                         // SPECIAL CASES: why?
2178                         AddCharMap ('\u203B', 0xA, 1, 0);
2179                         AddCharMap ('\u2040', 0xA, 1, 0);
2180                         AddCharMap ('\u2041', 0xA, 1, 0);
2181                         AddCharMap ('\u2042', 0xA, 1, 0);
2182
2183                         for (int cp = 0x20A0; cp <= 0x20AB; cp++)
2184                                 AddCharMap ((char) cp, 0xA, 1, 0);
2185
2186                         // 3004 is skipped at first...
2187                         for (int cp = 0x3010; cp <= 0x3040; cp++)
2188                                 if (Char.IsSymbol ((char) cp))
2189                                         AddCharMap ((char) cp, 0xA, 1, 0);
2190                         // SPECIAL CASES: added here
2191                         AddCharMap ('\u3004', 0xA, 1, 0);
2192                         AddCharMap ('\u327F', 0xA, 1, 0);
2193
2194                         for (int cp = 0x2600; cp <= 0x2613; cp++)
2195                                 AddCharMap ((char) cp, 0xA, 1, 0);
2196                         // Dingbats
2197                         for (int cp = 0x2620; cp <= 0x2770; cp++)
2198                                 if (Char.IsSymbol ((char) cp))
2199                                         AddCharMap ((char) cp, 0xA, 1, 0);
2200                         // OCR
2201                         for (int i = 0x2440; i < 0x2460; i++)
2202                                 AddCharMap ((char) i, 0xA, 1, 0);
2203
2204                         // SPECIAL CASES: why?
2205                         AddCharMap ('\u0E3F', 0xA, 1, 0);
2206                         AddCharMap ('\u2117', 0xA, 1, 0);
2207                         AddCharMap ('\u20AC', 0xA, 1, 0);
2208                         #endregion
2209
2210                         #region Numbers // 0C 02 - 0C E1
2211                         fillIndex [0xC] = 2;
2212
2213                         // 9F8 : Bengali "one less than the denominator"
2214                         AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2215
2216                         ArrayList numbers = new ArrayList ();
2217                         for (int i = 0; i < 65536; i++)
2218                                 if (!IsIgnorable (i) &&
2219                                         Char.IsNumber ((char) i) &&
2220                                         (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2221                                         numbers.Add (i);
2222
2223                         ArrayList numberValues = new ArrayList ();
2224                         foreach (int i in numbers)
2225                                 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2226                         // SPECIAL CASE: Cyrillic Thousand sign
2227                         numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2228                         numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2229
2230 //foreach (DictionaryEntry de in numberValues)
2231 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2232
2233                         // FIXME: fillIndex adjustment lines are too
2234                         // complicated. It must be simpler.
2235                         decimal prevValue = -1;
2236                         foreach (DictionaryEntry de in numberValues) {
2237                                 int cp = (int) de.Key;
2238                                 decimal currValue = (decimal) de.Value;
2239                                 bool addnew = false;
2240                                 if (prevValue < currValue &&
2241                                         prevValue - (int) prevValue == 0 &&
2242                                         prevValue >= 1) {
2243
2244                                         addnew = true;
2245                                         // Process Hangzhou and Roman numbers
2246
2247                                         // There are some SPECIAL cases.
2248                                         if (currValue != 4) // no increment for 4
2249                                                 fillIndex [0xC]++;
2250
2251                                         int xcp;
2252                                         if (currValue <= 13) {
2253                                                 if (currValue == 4)
2254                                                         fillIndex [0xC]++;
2255                                                 // SPECIAL CASE
2256                                                 if (currValue == 11)
2257                                                         AddCharMap ('\u0BF0', 0xC, 1);
2258                                                 xcp = (int) prevValue + 0x2160 - 1;
2259                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2260                                                 xcp = (int) prevValue + 0x2170 - 1;
2261                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2262                                                 fillIndex [0xC]++;
2263                                         }
2264                                         if (currValue < 12)
2265                                                 fillIndex [0xC]++;
2266                                         if (currValue <= 10) {
2267                                                 xcp = (int) prevValue + 0x3021 - 1;
2268                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2269                                                 fillIndex [0xC]++;
2270                                         }
2271                                 }
2272                                 if (prevValue < currValue)
2273                                         prevValue = currValue;
2274                                 if (map [cp].Defined)
2275                                         continue;
2276                                 // Hangzhou and Roman numerals are added later
2277                                 // (code is above)
2278                                 if (0x3021 <= cp && cp < 0x302A
2279                                         || 0x2160 <= cp && cp < 0x216C
2280                                         || 0x2170 <= cp && cp < 0x217C)
2281                                         continue;
2282
2283                                 if (cp == 0x215B) // FIXME: why?
2284                                         fillIndex [0xC] += 2;
2285                                 else if (cp == 0x3021) // FIXME: why?
2286                                         fillIndex [0xC]++;
2287                                 if (addnew || cp <= '9') {
2288                                         int mod = (int) currValue - 1;
2289                                         int xcp;
2290                                         if (1 <= currValue && currValue <= 11) {
2291                                                 xcp = mod + 0x2776;
2292                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2293                                                 xcp = mod + 0x2780;
2294                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2295                                                 xcp = mod + 0x278A;
2296                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2297                                         }
2298                                         if (1 <= currValue && currValue <= 20) {
2299                                                 xcp = mod + 0x2460;
2300                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2301                                                 xcp = mod + 0x2474;
2302                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2303                                                 xcp = mod + 0x2488;
2304                                                 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2305                                         }
2306                                 }
2307                                 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2308                                         fillIndex [0xC]++;
2309                                 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2310
2311                                 switch (cp) {
2312                                 // Maybe Bengali digit numbers do not increase
2313                                 // indexes, but 0x09E6 does.
2314                                 case 0x09E7: case 0x09E8: case 0x09E9:
2315                                 case 0x09EA:
2316                                 // SPECIAL CASES
2317                                 case 0x0BF0: case 0x2180: case 0x2181:
2318                                         break;
2319                                 // SPECIAL CASE
2320                                 case 0x0BF1:
2321                                         fillIndex [0xC]++;
2322                                         break;
2323                                 default:
2324                                         if (currValue < 11 || currValue == 1000)
2325                                                 fillIndex [0xC]++;
2326                                         break;
2327                                 }
2328
2329                                 // Add special cases that are not regarded as
2330                                 // numbers in UnicodeCategory speak.
2331                                 if (cp == '5') {
2332                                         // TONE FIVE
2333                                         AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2334                                         AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2335                                 }
2336                                 else if (cp == '2' || cp == '6') // FIXME: why?
2337                                         fillIndex [0xC]++;
2338                         }
2339
2340                         // 221E: infinity
2341                         fillIndex [0xC] = 0xFF;
2342                         AddCharMap ('\u221E', 0xC, 1);
2343                         #endregion
2344
2345                         #region Letters and NonSpacing Marks (general)
2346
2347                         // ASCII Latin alphabets
2348                         for (int i = 0; i < alphabets.Length; i++)
2349                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2350
2351                         // non-ASCII Latin alphabets
2352                         // FIXME: there are no characters that are placed
2353                         // *after* "alphabets" array items. This is nothing
2354                         // more than a hack that creates dummy weights for
2355                         // primary characters.
2356                         for (int i = 0x0080; i < 0x0300; i++) {
2357                                 if (!Char.IsLetter ((char) i))
2358                                         continue;
2359                                 // Latin letters that have an NFKD mapping are
2360                                 // not added as independent primary characters.
2361                                 if (decompIndex [i] != 0)
2362                                         continue;
2363                                 // SPECIAL CASES:
2364                                 // 1. some alphabets have primarily
2365                                 //   equivalent ASCII alphabets.
2366                                 // 2. some have independent primary weights,
2367                                 //   but inside a-to-z range.
2368                                 // 3. there are some expanded characters that
2369                                 //   are not part of Unicode Standard NFKD.
2370                                 // 4. some characters are letters per IsLetter
2371                                 //   but not in sortkeys (maybe a Unicode version
2372                                 //   difference caused it).
2373                                 switch (i) {
2374                                 // 1. skipping them does not make sense
2375 //                              case 0xD0: case 0xF0: case 0x131: case 0x138:
2376 //                              case 0x184: case 0x185: case 0x186: case 0x189:
2377 //                              case 0x18D: case 0x18E: case 0x18F: case 0x190:
2378 //                              case 0x194: case 0x195: case 0x196: case 0x19A:
2379 //                              case 0x19B: case 0x19C:
2380                                 // 2. skipping them does not make sense
2381 //                              case 0x14A: // Ng
2382 //                              case 0x14B: // ng
2383                                 // 3.
2384                                 case 0xC6: // AE
2385                                 case 0xE6: // ae
2386                                 case 0xDE: // Icelandic Thorn (capital)
2387                                 case 0xFE: // Icelandic Thorn (small)
2388                                 case 0xDF: // German ss
2389                                 case 0xFF: // U+00FF is y with diaeresis, not German ss
2390                                 // 4.
2391                                 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2392                                 // not classified yet
2393 //                              case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2394 //                              case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2395 //                              case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2396 //                              case 0x1DD:
2397                                         continue;
2398                                 }
2399                                 AddCharMapGroup ((char) i, 0xE, 1, 0);
2400                         }
2401
2402                         // IPA extensions
2403                         // FIXME: this does not produce values equivalent to
2404                         // Windows, but is safer for comparison.
2405                         char [] ipaArray = new char [0x300 - 0x250 + 0x20];
2406                         for (int i = 0x40; i < 0x60; i++)
2407                                 if (Char.IsLetter ((char) i))
2408                                         ipaArray [i - 0x40] = (char) (i);
2409                         for (int i = 0x250; i < 0x300; i++)
2410                                 if (Char.IsLetter ((char) i))
2411                                         ipaArray [i - 0x250 + 0x20] = (char) i;
2412                         Array.Sort (ipaArray, UCAComparer.Instance);
2413                         int targetASCII = 0;
2414                         byte latinDiacritical = 0x7B;
2415                         foreach (char c in ipaArray) {
2416                                 if (c <= 'Z') {
2417                                         targetASCII = c;
2418                                         latinDiacritical = 0x7B;
2419                                 }
2420                                 else
2421                                         map [(int) c] = new CharMapEntry (
2422                                                 0xE,
2423                                                 map [targetASCII].Level1,
2424                                                 latinDiacritical++);
2425                         }
2426
2427                         // Greek and Coptic
2428
2429                         // FIXME: this is (mysterious and) incomplete.
2430                         for (int i = 0x0380; i < 0x0400; i++)
2431                                 if (diacritical [i] == 0 &&
2432                                         decompLength [i] == 1 &&
2433                                         decompType [i] == DecompositionCompat)
2434                                         diacritical [i] = 3;
2435
2436                         fillIndex [0xF] = 2;
2437                         for (int i = 0x0391; i < 0x03AA; i++)
2438                                 if (i != 0x03A2)
2439                                         AddCharMap ((char) i, 0xF, 1,
2440                                                 diacritical [i]);
2441                         fillIndex [0xF] = 2;
2442                         for (int i = 0x03B1; i < 0x03CA; i++)
2443                                 if (i != 0x03C2)
2444                                         AddCharMap ((char) i, 0xF, 1,
2445                                                 diacritical [i]);
2446                         // Final Sigma
2447                         map [0x03C2] = new CharMapEntry (0xF,
2448                                 map [0x03C3].Level1, map [0x03C3].Level2);
2449
2450                         fillIndex [0xF] = 0x40;
2451                         for (int i = 0x03DA; i < 0x03F0; i++)
2452                                 AddCharMap ((char) i, 0xF,
2453                                         (byte) (i % 2 == 0 ? 0 : 2),
2454                                         diacritical [i]);
2455
2456                         // NFKD
2457                         for (int i = 0x0386; i <= 0x0400; i++)
2458                                 FillLetterNFKD (i, true, true);
2459
2460                         // Cyrillic.
2461                         // Cyrillic letters are sorted like Latin letters i.e.
2462                         // containing culture-specific letters between the
2463                         // standard Cyrillic sequence.
2464                         //
2465                         // We can't use UCA here; it has different sorting.
2466                         char [] orderedCyrillic = new char [] {
2467                                 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2468                                 '\u0452', // DJE for Serbocroatian
2469                                 '\u0435',
2470                                 '\u0454', // IE for Ukrainian
2471                                 '\u0436', '\u0437',
2472                                 '\u0455', // DZE
2473                                 '\u0438',
2474                                 '\u0456', // Byelorussian-Ukrainian I
2475                                 '\u0457', // YI
2476                                 '\u0439',
2477                                 '\u0458', // JE
2478                                 '\u043A', '\u043B',
2479                                 '\u0459', // LJE
2480                                 '\u043C', '\u043D',
2481                                 '\u045A', // NJE
2482                                 '\u043E',
2483                                 // 4E9 goes here.
2484                                 '\u043F', '\u0440', '\u0441', '\u0442',
2485                                 '\u045B', // TSHE for Serbocroatian
2486                                 '\u0443',
2487                                 '\u045E', // Short U for Byelorussian
2488                                 '\u04B1', // Straight U w/ stroke (diacritical!)
2489                                 '\u0444', '\u0445', '\u0446', '\u0447',
2490                                 '\u045F', // DZHE
2491                                 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2492                                 '\u044D', '\u044E', '\u044F'};
2493
2494                         // Some characters here map to basic Cyrillic
2495                         // letters. See UnicodeData.txt character names for
2496                         // the sources. Here I simply declare an equiv. array.
2497                         // Its entries correspond to every other code point
2498                         // from U+0490 on (U+0491 etc., the small letters, are skipped).
2499                         char [] cymap_src = new char [] {
2500                                 '\u0433', '\u0433', '\u0433', '\u0436',
2501                                 '\u0437', '\u043A', '\u043A', '\u043A',
2502                                 '\u043A', '\u043D', '\u043D', '\u043F',
2503                                 '\u0445', '\u0441', '\u0442', '\u0443',
2504                                 '\u0443', '\u0445', '\u0446', '\u0447',
2505                                 '\u0447', '\u0432', '\u0435', '\u0435',
2506                                 '\u0406', '\u0436', '\u043A', '\u043D',
2507                                 '\u0447', '\u0435'};
2508
2509                         fillIndex [0x10] = 0x8D;
2510                         for (int i = 0x0460; i < 0x0481; i++) {
2511                                 if (Char.IsLetter ((char) i)) {
2512                                         if (i == 0x0476)
2513                                                 // U+476/477 have the same
2514                                                 // primary weight as U+474/475.
2515                                                 fillIndex [0x10] -= 3;
2516                                         AddLetterMap ((char) i, 0x10, 3);
2517                                 }
2518                         }
2519
2520                         fillIndex [0x10] = 0x6;
2521                         for (int i = 0; i < orderedCyrillic.Length; i++) {
2522                                 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2523                                 if (!IsIgnorable ((int) c) &&
2524                                         Char.IsLetter (c) &&
2525                                         !map [c].Defined) {
2526                                         AddLetterMap (c, 0x10, 0);
2527                                         fillIndex [0x10] += 3;
2528                                 }
2529                         }
2530
2531                         // NFKD
2532                         for (int i = 0x0401; i <= 0x045F; i++)
2533                                 FillLetterNFKD (i, false, false);
2534
2535                         for (int i = 0; i < cymap_src.Length; i++) {
2536                                 char c = cymap_src [i];
2537                                 fillIndex [0x10] = map [c].Level1;
2538                                 int c2 = 0x0490 + i * 2;
2539                                 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2540                         }
2541
2542                         // Armenian
2543                         fillIndex [0x11] = 0x3;
2544                         fillIndex [0x1] = 0x98;
2545                         for (int i = 0x0531; i < 0x0586; i++) {
2546                                 if (i == 0x0559 || i == 0x55A)
2547                                         AddCharMap ((char) i, 1, 1);
2548                                 if (Char.IsLetter ((char) i))
2549                                         AddLetterMap ((char) i, 0x11, 1);
2550                         }
2551
2552                         // Hebrew
2553                         // -Letters
2554                         fillIndex [0x12] = 0x2;
2555                         for (int i = 0x05D0; i < 0x05FF; i++)
2556                                 if (Char.IsLetter ((char) i)) {
2557                                         if (isUppercase [i]) {
2558                                                 fillIndex [0x12]--;
2559                                                 AddLetterMap ((char) i, 0x12, 2);
2560                                         }
2561                                         else
2562                                                 AddLetterMap ((char) i, 0x12, 1);
2563                                 }
2564                         // -Accents
2565                         fillIndex [0x1] = 0x3;
2566                         for (int i = 0x0591; i <= 0x05C2; i++) {
2567                                 if (i == 0x05A3 || i == 0x05BB)
2568                                         fillIndex [0x1]++;
2569                                 if (i != 0x05BE)
2570                                         AddCharMap ((char) i, 0x1, 1);
2571                         }
2572
2573                         // Arabic
2574                         fillIndex [0x1] = 0x8E;
2575                         fillIndex [0x13] = 0x3;
2576                         for (int i = 0x0621; i <= 0x064A; i++) {
2577                                 // Abjad
2578                                 if (Char.GetUnicodeCategory ((char) i)
2579                                         != UnicodeCategory.OtherLetter) {
2580                                         // FIXME: arabic nonspacing marks are
2581                                         // in different order.
2582                                         AddCharMap ((char) i, 0x1, 1);
2583                                         continue;
2584                                 }
2585 //                              map [i] = new CharMapEntry (0x13,
2586 //                                      (byte) arabicLetterPrimaryValues [i], 1);
2587                                 fillIndex [0x13] =
2588                                         (byte) arabicLetterPrimaryValues [i];
2589                                 byte formDiacritical = 8; // default
2590                                 // SPECIAL CASES:
2591                                 switch (i) {
2592                                 case 0x0622: formDiacritical = 9; break;
2593                                 case 0x0623: formDiacritical = 0xA; break;
2594                                 case 0x0624: formDiacritical = 5; break;
2595                                 case 0x0625: formDiacritical = 0xB; break;
2596                                 case 0x0626: formDiacritical = 7; break;
2597                                 case 0x0649: formDiacritical = 5; break;
2598                                 case 0x064A: formDiacritical = 7; break;
2599                                 }
2600 //                              AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2601                                 AddArabicCharMap ((char) i, 0x13, 1, formDiacritical);
2602                         }
2603                         for (int i = 0x0670; i < 0x0673; i++)
2604                                 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2605                         fillIndex [0x13] = 0x84;
2606                         for (int i = 0x0674; i < 0x06D6; i++)
2607                                 if (Char.IsLetter ((char) i))
2608                                         AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2609
2610                         // Devanagari
2611
2612                         // FIXME: this could be fixed in more decent way
2613                         for (int i = 0x0958; i <= 0x095F; i++)
2614                                 diacritical [i] = 8;
2615
2616                         // FIXME: it does seem straight codepoint mapping.
2617                         fillIndex [0x14] = 04;
2618                         for (int i = 0x0901; i < 0x0905; i++)
2619                                 if (!IsIgnorable (i))
2620                                         AddLetterMap ((char) i, 0x14, 2);
2621                         fillIndex [0x14] = 0xB;
2622                         for (int i = 0x0905; i < 0x093A; i++) {
2623                                 if (i == 0x0928)
2624                                         AddCharMap ('\u0929', 0x14, 0, 8);
2625                                 if (i == 0x0930)
2626                                         AddCharMap ('\u0931', 0x14, 0, 8);
2627                                 if (i == 0x0933)
2628                                         AddCharMap ('\u0934', 0x14, 0, 8);
2629                                 if (Char.IsLetter ((char) i))
2630                                         AddLetterMap ((char) i, 0x14, 4);
2631                                 if (i == 0x090B)
2632                                         AddCharMap ('\u0960', 0x14, 4);
2633                                 if (i == 0x090C)
2634                                         AddCharMap ('\u0961', 0x14, 4);
2635                         }
2636                         fillIndex [0x14] = 0xDA;
2637                         for (int i = 0x093E; i < 0x0945; i++)
2638                                 if (!IsIgnorable (i))
2639                                         AddLetterMap ((char) i, 0x14, 2);
2640                         fillIndex [0x14] = 0xEC;
2641                         for (int i = 0x0945; i < 0x094F; i++)
2642                                 if (!IsIgnorable (i))
2643                                         AddLetterMap ((char) i, 0x14, 2);
2644
2645                         // Bengali
2646                         // -Letters
2647                         fillIndex [0x15] = 02;
2648                         for (int i = 0x0980; i < 0x9FF; i++) {
2649                                 if (IsIgnorable (i))
2650                                         continue;
2651                                 if (i == 0x09E0)
2652                                         fillIndex [0x15] = 0x3B;
2653                                 switch (Char.GetUnicodeCategory ((char) i)) {
2654                                 case UnicodeCategory.NonSpacingMark:
2655                                 case UnicodeCategory.DecimalDigitNumber:
2656                                 case UnicodeCategory.OtherNumber:
2657                                         continue;
2658                                 }
2659                                 AddLetterMap ((char) i, 0x15, 1);
2660                         }
2661                         // -Signs
2662                         fillIndex [0x1] = 0x3;
2663                         for (int i = 0x0981; i < 0x0A00; i++)
2664                                 if (Char.GetUnicodeCategory ((char) i) ==
2665                                         UnicodeCategory.NonSpacingMark)
2666                                         AddCharMap ((char) i, 0x1, 1);
2667
2668                         // Gurmukhi. orderedGurmukhi is from UCA
2669                         // FIXME: it does not look equivalent to UCA.
2670                         fillIndex [0x16] = 04;
2671                         fillIndex [0x1] = 3;
2672                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
2673                                 char c = orderedGurmukhi [i];
2674                                 if (IsIgnorable ((int) c))
2675                                         continue;
2676                                 if (IsIgnorableNonSpacing (c)) {
2677                                         AddLetterMap (c, 0x1, 1);
2678                                         continue;
2679                                 }
2680                                 if (c == '\u0A3C' || c == '\u0A4D' ||
2681                                         '\u0A66' <= c && c <= '\u0A71')
2682                                         continue;
2683                                 // SPECIAL CASES
2684                                 byte shift = 4;
2685                                 switch (c) {
2686                                 case '\u0A33': case '\u0A36': case '\u0A16':
2687                                 case '\u0A17': case '\u0A5B': case '\u0A5E':
2688                                         shift = 0;
2689                                         break;
2690                                 }
2691                                 if (c == '\u0A3E') // Skip
2692                                         fillIndex [0x16] = 0xC0;
2693                                 AddLetterMap (c, 0x16, shift);
2694                         }
2695
2696                         // Gujarati. orderedGujarati is from UCA
2697                         fillIndex [0x17] = 0x4;
2698                         // nonspacing marks
2699                         map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2700                         map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2701                         map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2702                         map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2703                         map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2704                         map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2705                         // letters go first.
2706                         for (int i = 0; i < orderedGujarati.Length; i++) {
2707                                 // SPECIAL CASE
2708                                 char c = orderedGujarati [i];
2709                                 if (Char.IsLetter (c)) {
2710                                         // SPECIAL CASES
2711                                         if (c == '\u0AB3' || c == '\u0A32')
2712                                                 continue;
2713                                         if (c == '\u0A33') {
2714                                                 AddCharMap ('\u0A32', 0x17, 0);
2715                                                 AddCharMap ('\u0A33', 0x17, 4, 4);
2716                                                 continue;
2717                                         }
2718                                         if (c == '\u0A8B')
2719                                                 AddCharMap ('\u0AE0', 0x17, 0, 5);
2720                                         AddCharMap (c, 0x17, 4);
2721
2722                                         if (c == '\u0AB9')
2723                                                 AddCharMap ('\u0AB3', 0x17, 6);
2724                                 }
2725                         }
2726                         // non-letters
2727                         byte gujaratiShift = 4;
2728                         fillIndex [0x17] = 0xC0;
2729                         for (int i = 0; i < orderedGujarati.Length; i++) {
2730                                 char c = orderedGujarati [i];
2731                                 if (fillIndex [0x17] == 0xCC)
2732                                         gujaratiShift = 3;
2733                                 if (!Char.IsLetter (c)) {
2734                                         // SPECIAL CASES
2735                                         if (c == '\u0A82')
2736                                                 AddCharMap ('\u0A81', 0x17, 2);
2737                                         if (c == '\u0AC2')
2738                                                 fillIndex [0x17]++;
2739                                         AddLetterMap (c, 0x17, gujaratiShift);
2740                                 }
2741                         }
2742
2743                         // Oriya
2744                         fillIndex [0x1] = 03;
2745                         fillIndex [0x18] = 02;
2746                         for (int i = 0x0B00; i < 0x0B7F; i++) {
2747                                 switch (Char.GetUnicodeCategory ((char) i)) {
2748                                 case UnicodeCategory.NonSpacingMark:
2749                                 case UnicodeCategory.DecimalDigitNumber:
2750                                         AddLetterMap ((char) i, 0x1, 1);
2751                                         continue;
2752                                 }
2753                                 AddLetterMapCore ((char) i, 0x18, 1, 0, true);
2754                         }
2755
2756                         // Tamil
2757                         fillIndex [0x19] = 2;
2758                         AddCharMap ('\u0BD7', 0x19, 0);
2759                         fillIndex [0x19] = 0xA;
2760                         // vowels
2761                         for (int i = 0x0B82; i <= 0x0B94; i++)
2762                                 if (!IsIgnorable ((char) i))
2763                                         AddCharMap ((char) i, 0x19, 2);
2764                         // special vowel
2765                         fillIndex [0x19] = 0x28;
2766                         // The array for Tamil consonants is a constant.
2767                         // Windows has almost the same sequence as TAM from
2768                         // tamilnet, but differs slightly in Grantha.
2769                         for (int i = 0; i < orderedTamilConsonants.Length; i++)
2770                                 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2771                         // combining marks
2772                         fillIndex [0x19] = 0x82;
2773                         for (int i = 0x0BBE; i < 0x0BCD; i++)
2774                                 if (Char.GetUnicodeCategory ((char) i) ==
2775                                         UnicodeCategory.SpacingCombiningMark
2776                                         || i == 0x0BC0)
2777                                         AddLetterMap ((char) i, 0x19, 2);
2778
2779                         // Telugu
2780                         fillIndex [0x1A] = 0x4;
2781                         for (int i = 0x0C00; i < 0x0C62; i++) {
2782                                 if (i == 0x0C55 || i == 0x0C56)
2783                                         continue; // skip
2784                                 AddCharMap ((char) i, 0x1A, 3);
2785                                 char supp = (i == 0x0C0B) ? '\u0C60':
2786                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
2787                                 if (supp == char.MinValue)
2788                                         continue;
2789                                 AddCharMap (supp, 0x1A, 3);
2790                         }
2791
2792                         // Kannada
2793                         fillIndex [0x1B] = 4;
2794                         for (int i = 0x0C80; i < 0x0CE5; i++) {
2795                                 if (i == 0x0CD5 || i == 0x0CD6)
2796                                         continue; // ignore
2797                                 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2798                                         continue; // shift after 0xCB9
2799                                 AddCharMap ((char) i, 0x1B, 3);
2800                                 if (i == 0x0CB9) {
2801                                         // SPECIAL CASES: but why?
2802                                         AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2803                                         AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2804                                         AddCharMap ('\u0CDE', 0x1B, 3); // FA
2805                                 }
2806                                 if (i == 0x0CB2)
2807                                         AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2808                         }
2809
2810                         // Malayalam
2811                         fillIndex [0x1C] = 2;
2812                         fillIndex [0x1] = 3;
2813                         for (int i = 0x0D02; i < 0x0D61; i++) {
2814                                 // FIXME: I avoided MSCompatUnicodeTable usage
2815                                 // here (it results in recursion). So check if
2816                                 // using NonSpacingMark makes sense or not.
2817                                 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2818 //                              if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2819                                         AddCharMap ((char) i, 0x1C, 1);
2820                                 else if (!IsIgnorable ((char) i))
2821                                         AddCharMap ((char) i, 1, 1);
2822                         }
2823
2824                         // Thai ... note that it breaks 0x1E wall after E2B!
2825                         // Also, all Thai characters have level 2 value 3.
2826                         fillIndex [0x1E] = 2;
2827                         fillIndex [0x1] = 3;
2828                         for (int i = 0xE40; i <= 0xE44; i++)
2829                                 AddCharMap ((char) i, 0x1E, 1, 3);
2830                         for (int i = 0xE01; i < 0xE2B; i++)
2831                                 AddCharMap ((char) i, 0x1E, 6, 3);
2832                         fillIndex [0x1F] = 5;
2833                         for (int i = 0xE2B; i < 0xE30; i++)
2834                                 AddCharMap ((char) i, 0x1F, 6, 3);
2835                         fillIndex [0x1F] = 0x1E;
2836                         for (int i = 0xE30; i < 0xE3B; i++)
2837                                 AddCharMap ((char) i, 0x1F, 1, 3);
2838                         // some Thai characters remain.
2839                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
2840                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2841                         foreach (char c in specialThai)
2842                                 AddCharMap (c, 0x1F, 1, 3);
2843
2844                         for (int i = 0xE00; i < 0xE80; i++)
2845                                 if (Char.GetUnicodeCategory ((char) i) ==
2846                                         UnicodeCategory.NonSpacingMark)
2847                                         AddCharMap ((char) i, 1, 1);
2848
2849                         // Lao
2850                         fillIndex [0x1F] = 2;
2851                         fillIndex [0x1] = 3;
2852                         for (int i = 0xE80; i < 0xEDF; i++) {
2853                                 if (IsIgnorable ((char) i))
2854                                         continue;
2855                                 else if (Char.IsLetter ((char) i))
2856                                         AddCharMap ((char) i, 0x1F, 1);
2857                                 else if (Char.GetUnicodeCategory ((char) i) ==
2858                                         UnicodeCategory.NonSpacingMark)
2859                                         AddCharMap ((char) i, 1, 1);
2860                         }
2861
2862                         // Georgian. orderedGeorgian is from UCA DUCET.
2863                         fillIndex [0x21] = 5;
2864                         for (int i = 0; i < orderedGeorgian.Length; i++) {
2865                                 char c = orderedGeorgian [i];
2866                                 if (map [(int) c].Defined)
2867                                         continue;
2868                                 AddCharMap (c, 0x21, 0);
2869                                 if (c < '\u10F6')
2870                                         AddCharMap ((char) (c - 0x30), 0x21, 0);
2871                                 fillIndex [0x21] += 5;
2872                         }
2873
2874                         // Japanese Kana.
2875                         fillIndex [0x22] = 2;
2876                         int kanaOffset = 0x3041;
2877                         byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2878
2879                         for (int gyo = 0; gyo < 9; gyo++) {
2880                                 for (int dan = 0; dan < 5; dan++) {
2881                                         if (gyo == 7 && dan % 2 == 1) {
2882                                                 // 'ya'-gyo
2883                                                 fillIndex [0x22]++;
2884                                                 kanaOffset -= 2; // There is no space for yi and ye.
2885                                                 continue;
2886                                         }
2887                                         int cp = kanaOffset + dan * kanaLines [gyo];
2888                                         // small lines (a-gyo, ya-gyo)
2889                                         if (gyo == 0 || gyo == 7) {
2890                                                 AddKanaMap (cp, 1); // small
2891                                                 AddKanaMap (cp + 1, 1);
2892                                         }
2893                                         else
2894                                                 AddKanaMap (cp, kanaLines [gyo]);
2895                                         fillIndex [0x22]++;
2896
2897                                         if (cp == 0x30AB) {
2898                                                 // add small 'ka' (before normal one)
2899                                                 AddKanaMap (0x30F5, 1);
2900                                                 kanaOffset++;
2901                                         }
2902                                         if (cp == 0x30B1) {
2903                                                 // add small 'ke' (before normal one)
2904                                                 AddKanaMap (0x30F6, 1);
2905                                                 kanaOffset++;
2906                                         }
2907                                         if (cp == 0x3061) {
2908                                                 // add small 'Tsu' (before normal one)
2909                                                 AddKanaMap (0x3063, 1);
2910                                                 kanaOffset++;
2911                                         }
2912                                 }
2913                                 fillIndex [0x22] += 3;
2914                                 kanaOffset += 5 * kanaLines [gyo];
2915                         }
2916
2917                         // Wa-gyo is almost entirely special, so it is just added manually.
2918                         AddLetterMap ((char) 0x308E, 0x22, 0);
2919                         AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2920                         AddLetterMap ((char) 0x308F, 0x22, 0);
2921                         AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2922                         fillIndex [0x22]++;
2923                         AddLetterMap ((char) 0x3090, 0x22, 0);
2924                         AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2925                         fillIndex [0x22] += 2;
2926                         // no "Wu" in Japanese.
2927                         AddLetterMap ((char) 0x3091, 0x22, 0);
2928                         AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2929                         fillIndex [0x22]++;
2930                         AddLetterMap ((char) 0x3092, 0x22, 0);
2931                         AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2932                         // Nn
2933                         fillIndex [0x22] = 0x80;
2934                         AddLetterMap ((char) 0x3093, 0x22, 0);
2935                         AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2936
2937                         map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2938                                 map [0x30A6].Level1, 3);// voiced hiragana U
2939                         map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2940                                 map [0x30A6].Level1, 3);// voiced katakana U
2941
2942                         map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2943                                 map [0x30AB].Level1, 0);// small katakana Ka
2944                         map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2945                                 map [0x30B1].Level1, 0);// small katakana Ke
2946                         // voiced Wa lines
2947                         for (int i = 0x30F7; i < 0x30FB; i++)
2948                                 map [i] = new CharMapEntry (map [i - 8].Category,
2949                                         map [i - 8].Level1,
2950                                         3);
2951
2952                         // JIS Japanese square chars.
2953                         fillIndex [0x22] = 0x97;
2954                         jisJapanese.Sort (JISComparer.Instance);
2955                         foreach (JISCharacter j in jisJapanese)
2956                                 if (0x3300 <= j.CP && j.CP <= 0x3357)
2957                                         AddCharMap ((char) j.CP, 0x22, 1);
2958                         // non-JIS Japanese square chars.
2959                         nonJisJapanese.Sort (NonJISComparer.Instance);
2960                         foreach (NonJISCharacter j in nonJisJapanese)
2961                                 AddCharMap ((char) j.CP, 0x22, 1);
2962
2963                         // Bopomofo
2964                         fillIndex [0x23] = 0x02;
2965                         for (int i = 0x3105; i <= 0x312C; i++)
2966                                 AddCharMap ((char) i, 0x23, 1);
2967
2968                         // Estrangela: ancient Syriac
2969                         fillIndex [0x24] = 0x0B;
2970                         // FIXME: is 0x71E really alternative form?
2971                         ArrayList syriacAlternatives = new ArrayList (
2972                                 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2973                         for (int i = 0x0710; i <= 0x072C; i++) {
2974                                 if (i == 0x0711) // NonSpacingMark
2975                                         continue;
2976                                 if (syriacAlternatives.Contains (i))
2977                                         continue;
2978                                 AddCharMap ((char) i, 0x24, 4);
2979                                 // FIXME: why?
2980                                 if (i == 0x721)
2981                                         fillIndex [0x24]++;
2982                         }
2983                         foreach (int cp in syriacAlternatives)
2984                                 map [cp] = new CharMapEntry (0x24,
2985                                         (byte) (map [cp - 1].Level1 + 2),
2986                                         0);
2987                         // FIXME: Syriac NonSpacingMark should go here.
2988
2989                         // Thaana
2990                         // FIXME: it turned out that it does not look like UCA
2991                         fillIndex [0x24] = 0x6E;
2992                         fillIndex [0x1] = 0xAC;
2993                         for (int i = 0; i < orderedThaana.Length; i++) {
2994                                 char c = orderedThaana [i];
2995                                 if (IsIgnorableNonSpacing ((int) c))
2996                                         AddCharMap (c, 1, 1);
2997                                 AddCharMap (c, 0x24, 2);
2998                                 if (c == '\u0782') // SPECIAL CASE: why?
2999                                         fillIndex [0x24] += 2;
3000                         }
3001                         #endregion
3002
3003                         // FIXME: Add more culture-specific letters (that are
3004                         // not supported in Windows collation) here.
3005
3006                         // Surrogate ... they are computed.
3007
3008                         #region Hangul
3009                         // Hangul.
3010                         //
3011                         // Unlike UCA, the Windows Hangul sequence mixes Jongseong
3012                         // with the Choseong sequence as well as Jungseong,
3013                         // adjusted to have the same primary weight for the
3014                         // same base character. So it is impossible to compute
3015                         // those sort keys.
3016                         //
3017                         // Here I introduce an ordered sequence of mixed
3018                         // 'commands' and 'characters' that is similar to
3019                         // LDML text:
3020                         //      - ',' increases primary weight.
3021                         //      - [A B] means a range, increasing index
3022                         //      - {A B} means a range, without increasing index
3023                         //      - '=' is no operation (it means the characters
3024                         //        of both sides have the same weight).
3025                         //      - '>' inserts a Hangul Syllable block that
3026                         //        contains 0x24C (0x15 * 0x1C) characters.
3027                         //      - '<' decreases the index
3028                         //      - '0'-'9' means skip count
3029                         //      - whitespaces are ignored
3030                         //
3031
3032                         string hangulSequence =
3033                           "\u1100=\u11A8 > \u1101=\u11A9 >"
3034                         + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
3035                         + "<{\u1113 \u1116}, \u3165,"
3036                                 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
3037                                 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
3038                         + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
3039                         + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
3040                                 + "[\u11D1 \u11D2], \u11B2,"
3041                                 + "[\u11D3 \u11D5], \u11B3,"
3042                                 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
3043                                 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
3044                         + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
3045                         + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
3046                         + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
3047                                 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
3048                                 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
3049                         + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
3050                                 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
3051                         + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
3052                                 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
3053                         + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
3054                                 + "\u11F1,, \u11F2,,,"
3055                                 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
3056                         + "<\u114D, \u110D,,  >"
3057                         + "<{\u114E \u1151},, \u110E=\u11BE,,  >"
3058                         + "<{\u1152 \u1155},,, \u110F=\u11BF >"
3059                         + "\u1110=\u11C0 > \u1111=\u11C1 >"
3060                         + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
3061                         + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
3062                                 + "[\u11F5 \u11F8]"
3063                         ;
3064
3065                         byte hangulCat = 0x52;
3066                         fillIndex [hangulCat] = 0x2;
3067
3068                         int syllableBlock = 0;
3069                         for (int n = 0; n < hangulSequence.Length; n++) {
3070                                 char c = hangulSequence [n];
3071                                 int start, end;
3072                                 if (Char.IsWhiteSpace (c))
3073                                         continue;
3074                                 switch (c) {
3075                                 case '=':
3076                                         break; // NOP
3077                                 case ',':
3078                                         IncrementSequentialIndex (ref hangulCat);
3079                                         break;
3080                                 case '<':
3081                                         if (fillIndex [hangulCat] == 2)
3082                                                 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
3083                                         fillIndex [hangulCat]--;
3084                                         break;
3085                                 case '>':
3086                                         IncrementSequentialIndex (ref hangulCat);
3087                                         for (int l = 0; l < 0x15; l++)
3088                                                 for (int v = 0; v < 0x1C; v++) {
3089                                                         AddCharMap (
3090                                                                 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
3091                                                         IncrementSequentialIndex (ref hangulCat);
3092                                                 }
3093                                         syllableBlock++;
3094                                         break;
3095                                 case '[':
3096                                         start = hangulSequence [n + 1];
3097                                         end = hangulSequence [n + 3];
3098                                         for (int i = start; i <= end; i++) {
3099                                                 AddCharMap ((char) i, hangulCat, 0);
3100                                                 if (end > i)
3101                                                         IncrementSequentialIndex (ref hangulCat);
3102                                         }
3103                                         n += 4; // consumes 5 characters for this operation
3104                                         break;
3105                                 case '{':
3106                                         start = hangulSequence [n + 1];
3107                                         end = hangulSequence [n + 3];
3108                                         for (int i = start; i <= end; i++)
3109                                                 AddCharMap ((char) i, hangulCat, 0);
3110                                         n += 4; // consumes 5 characters for this operation
3111                                         break;
3112                                 default:
3113                                         AddCharMap (c, hangulCat, 0);
3114                                         break;
3115                                 }
3116                         }
3117
3118                         // Some Jamo NFKD.
3119                         for (int i = 0x3200; i < 0x3300; i++) {
3120                                 if (IsIgnorable (i) || map [i].Defined)
3121                                         continue;
3122                                 int ch = 0;
3123                                 // w/ bracket
3124                                 if (decompLength [i] == 4 &&
3125                                         decompValues [decompIndex [i]] == '(')
3126                                         ch = decompIndex [i] + 1;
3127                                 // circled
3128                                 else if (decompLength [i] == 2 &&
3129                                         decompValues [decompIndex [i] + 1] == '\u1161')
3130                                         ch = decompIndex [i];
3131                                 else if (decompLength [i] == 1)
3132                                         ch = decompIndex [i];
3133                                 else
3134                                         continue;
3135                                 ch = decompValues [ch];
3136                                 if (ch < 0x1100 || 0x1200 < ch &&
3137                                         ch < 0xAC00 || 0xD800 < ch)
3138                                         continue;
3139
3140                                 // SPECIAL CASE ?
3141                                 int offset = i < 0x3260 ? 1 : 0;
3142                                 if (0x326E <= i && i <= 0x3273)
3143                                         offset = 1;
3144
3145                                 map [i] = new CharMapEntry (map [ch].Category,
3146                                         (byte) (map [ch].Level1 + offset),
3147                                         map [ch].Level2);
3148 //                                      Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
3149                         }
3150
3151
3152                         #endregion
3153
3154                         // Letterlike characters and CJK compatibility square
3155                         sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
3156                         int [] counts = new int ['Z' - 'A' + 1];
3157                         char [] namedChars = new char [sortableCharNames.Count];
3158                         int nCharNames = 0;
3159                         foreach (DictionaryEntry de in sortableCharNames) {
3160                                 counts [((string) de.Value) [0] - 'A']++;
3161                                 namedChars [nCharNames++] = (char) ((int) de.Key);
3162                         }
3163                         nCharNames = 0; // reset
3164                         for (int a = 0; a < counts.Length; a++) {
3165                                 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
3166                                 for (int i = 0; i < counts [a]; i++)
3167 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
3168                                         AddCharMap (namedChars [nCharNames++], 0xE, 1);
3169                         }
3170
3171                         // CJK unified ideograph.
3172                         byte cjkCat = 0x9E;
3173                         fillIndex [cjkCat] = 0x2;
3174                         for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
3175                                 if (!IsIgnorable (cp))
3176                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
3177                         // CJK Extensions go here.
3178                         // LAMESPEC: With this Windows style CJK layout, it is
3179                         // impossible to add more CJK ideograph i.e. 0x9FA6-
3180                         // 0x9FBB can never be added w/o breaking compat.
3181                         for (int cp = 0xF900; cp <= 0xFA2D; cp++)
3182                                 if (!IsIgnorable (cp))
3183                                         AddCharMapGroupCJK ((char) cp, ref cjkCat);
3184
3185                         // PrivateUse ... computed.
3186                         // remaining Surrogate ... computed.
3187
3188                         #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
3189                         // non-alphanumeric ASCII except for: + - < = > '
3190                         for (int i = 0x21; i < 0x7F; i++) {
3191                                 // SPECIAL CASE: 02C6 seems to be regarded as
3192                                 // equivalent to '^', which does not conform
3193                                 // to the Unicode standard character database.
3194                                 if (i == 0x005B)
3195                                         AddCharMap ('\u2045', 0x7, 0, 0x1C);
3196                                 if (i == 0x005D)
3197                                         AddCharMap ('\u2046', 0x7, 0, 0x1C);
3198                                 if (i == 0x005E)
3199                                         AddCharMap ('\u02C6', 0x7, 0, 3);
3200                                 if (i == 0x0060)
3201                                         AddCharMap ('\u02CB', 0x7, 0, 3);
3202
3203                                 if (Char.IsLetterOrDigit ((char) i)
3204                                         || "+-<=>'".IndexOf ((char) i) >= 0)
3205                                         continue; // they are not added here.
3206
3207                                 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3208                                 // Insert 3001 after ',' and 3002 after '.'
3209                                 if (i == 0x2C)
3210                                         AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
3211                                 else if (i == 0x2E)
3212                                         AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
3213                                 else if (i == 0x3A)
3214                                         AddCharMap ('\uFE30', 0x7, 1, 0);
3215                         }
3216                         #endregion
3217
3218                         #region 07 - Punctuations and something else
3219                         for (int i = 0xA0; i < char.MaxValue; i++) {
3220                                 if (IsIgnorable (i))
3221                                         continue;
3222
3223                                 // FIXME: actually these resets should not be
3224                                 // done, but they are put here as an easy shortcut.
3225                                 if (i == 0x05C3)
3226                                         fillIndex [0x7]++;
3227                                 if (i == 0x0700)
3228                                         fillIndex [0x7] = 0xE2;
3229                                 if (i == 0x2016)
3230                                         fillIndex [0x7] = 0x77;
3231                                 if (i == 0x3008)
3232                                         fillIndex [0x7] = 0x93;
3233
3234                                 if (0x02C8 <= i && i <= 0x02CD)
3235                                         continue; // nonspacing marks
3236
3237                                 // SPECIAL CASE: maybe they could be allocated
3238                                 // dummy NFKD mapping and no special processing
3239                                 // would be required here.
3240                                 if (i == 0x00AF)
3241                                         AddCharMap ('\u02C9', 0x7, 0, 3);
3242                                 if (i == 0x00B4)
3243                                         AddCharMap ('\u02CA', 0x7, 0, 3);
3244                                 if (i == 0x02C7)
3245                                         AddCharMap ('\u02D8', 0x7, 0, 3);
3246
3247                                 // SPECIAL CASES:
3248                                 switch (i) {
3249                                 case 0xAB: // 08
3250                                 case 0xB7: // 0A
3251                                 case 0xBB: // 08
3252                                 case 0x02B9: // 01
3253                                 case 0x02BA: // 01
3254                                 case 0x2329: // 09
3255                                 case 0x232A: // 09
3256                                         continue;
3257                                 }
3258
3259                                 switch (Char.GetUnicodeCategory ((char) i)) {
3260                                 case UnicodeCategory.OtherPunctuation:
3261                                 case UnicodeCategory.ClosePunctuation:
3262                                 case UnicodeCategory.OpenPunctuation:
3263                                 case UnicodeCategory.ConnectorPunctuation:
3264                                 case UnicodeCategory.InitialQuotePunctuation:
3265                                 case UnicodeCategory.FinalQuotePunctuation:
3266                                 case UnicodeCategory.ModifierSymbol:
3267                                         // SPECIAL CASES: // 0xA
3268                                         if (0x2020 <= i && i <= 0x2031)
3269                                                 continue;
3270                                         if (i == 0x3003) // added later
3271                                                 continue;
3272                                         AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3273                                         break;
3274                                 default:
3275                                         if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3276                                                 goto case UnicodeCategory.OtherPunctuation;
3277                                         break;
3278                                 }
3279                         }
3280
3281                         // Control pictures
3282                         // FIXME: it should not need to reset level 1, but
3283                         // it's for easy goal.
3284                         fillIndex [0x7] = 0xB6;
3285                         for (int i = 0x2400; i <= 0x2424; i++)
3286                                 AddCharMap ((char) i, 0x7, 1, 0);
3287
3288                         // FIXME: what are they?
3289                         AddCharMap ('\u3003', 0x7, 1);
3290                         AddCharMap ('\u3006', 0x7, 1);
3291                         AddCharMap ('\u02D0', 0x7, 1);
3292                         AddCharMap ('\u10FB', 0x7, 1);
3293                         AddCharMap ('\u0950', 0x7, 1);
3294                         AddCharMap ('\u093D', 0x7, 1);
3295                         AddCharMap ('\u0964', 0x7, 1);
3296                         AddCharMap ('\u0965', 0x7, 1);
3297                         AddCharMap ('\u0970', 0x7, 1);
3298
3299                         #endregion
3300
3301                         #region category 08 - symbols
3302                         fillIndex [0x8] = 2;
3303                         // Here Windows mapping is not straightforward. It is
3304                         // not based on computation but seems manual sorting.
3305                         AddCharMapGroup ('+', 0x8, 1, 0); // plus
3306                         AddCharMapGroup ('\u2212', 0x8, 1); // minus
3307                         AddCharMapGroup ('\u229D', 0x8, 1); // minus
3308                         AddCharMapGroup ('\u2297', 0x8, 1); // mul
3309                         AddCharMapGroup ('\u2044', 0x8, 1); // div
3310                         AddCharMapGroup ('\u2215', 0x8, 0); // div
3311                         AddCharMapGroup ('\u2298', 0x8, 1); // div slash
3312                         AddCharMapGroup ('\u2217', 0x8, 0); // mul
3313                         AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper
3314                         AddCharMapGroup ('\u2218', 0x8, 0); // ring
3315                         AddCharMapGroup ('\u229A', 0x8, 1); // ring
3316                         AddCharMapGroup ('\u2219', 0x8, 0); // bullet
3317                         AddCharMapGroup ('\u2299', 0x8, 1); // dot oper
3318                         AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus
3319                         AddCharMapGroup ('\u003C', 0x8, 1); // <
3320                         AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation
3321                         AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation
3322
3323                         for (int cp = 0; cp < 0x2300; cp++) {
3324                                 if (cp == 0xAC) // SPECIAL CASE: skip
3325                                         continue;
3326                                 if (cp == 0x200) {
3327                                         cp = 0x2200; // skip to 2200
3328                                         fillIndex [0x8] = 0x21;
3329                                 }
3330                                 if (cp == 0x2295)
3331                                         fillIndex [0x8] = 0x3;
3332                                 if (cp == 0x22A2)
3333                                         fillIndex [0x8] = 0xAB;
3334                                 if (cp == 0x22B2)
3335                                         fillIndex [0x8] = 0xB9;
3336                                 if (!map [cp].Defined &&
3337 //                                      Char.GetUnicodeCategory ((char) cp) ==
3338 //                                      UnicodeCategory.MathSymbol)
3339                                         Char.IsSymbol ((char) cp))
3340                                         AddCharMapGroup ((char) cp, 0x8, 1);
3341                                 // SPECIAL CASES: no idea why Windows sorts as such
3342                                 switch (cp) {
3343                                 case 0x3E:
3344                                         AddCharMap ('\u227B', 0x8, 1, 0);
3345                                         AddCharMap ('\u22B1', 0x8, 1, 0);
3346                                         break;
3347                                 case 0xB1:
3348                                         AddCharMapGroup ('\u00AB', 0x8, 1);
3349                                         AddCharMapGroup ('\u226A', 0x8, 1);
3350                                         AddCharMapGroup ('\u00BB', 0x8, 1);
3351                                         AddCharMapGroup ('\u226B', 0x8, 1);
3352                                         break;
3353                                 case 0xF7:
3354                                         AddCharMap ('\u01C0', 0x8, 1, 0);
3355                                         AddCharMap ('\u01C1', 0x8, 1, 0);
3356                                         AddCharMap ('\u01C2', 0x8, 1, 0);
3357                                         break;
3358                                 }
3359                         }
3360                         #endregion
3361
3362                         #region Hack!
3363
3364                         // Characters w/ diacritical marks (NFKD)
3365                         for (int i = 0; i <= char.MaxValue; i++) {
3366                                 if (map [i].Defined || IsIgnorable (i))
3367                                         continue;
3368                                 if (decompIndex [i] == 0)
3369                                         continue;
3370
3371                                 int start = decompIndex [i];
3372                                 int primaryChar = decompValues [start];
3373                                 int secondary = diacritical [i];
3374                                 bool skip = false;
3375                                 int length = decompLength [i];
3376                                 // special processing for parenthesized ones.
3377                                 if (length == 3 &&
3378                                         decompValues [start] == '(' &&
3379                                         decompValues [start + 2] == ')') {
3380                                         primaryChar = decompValues [start + 1];
3381                                         length = 1;
3382                                 }
3383
3384                                 if (map [primaryChar].Level1 == 0)
3385                                         continue;
3386
3387                                 for (int l = 1; l < length; l++) {
3388                                         int c = decompValues [start + l];
3389                                         if (map [c].Level1 != 0)
3390                                                 skip = true;
3391                                         secondary += diacritical [c];
3392                                 }
3393                                 if (skip)
3394                                         continue;
3395                                 map [i] = new CharMapEntry (
3396                                         map [primaryChar].Category,
3397                                         map [primaryChar].Level1,
3398                                         (byte) secondary);
3399
3400                         }
3401
3402                         // Diacritical weight adjustment
3403
3404                         // Arabic Hamzah
3405                         diacritical [0x624] = 0x5;
3406                         diacritical [0x626] = 0x7;
3407                         diacritical [0x622] = 0x9;
3408                         diacritical [0x623] = 0xA;
3409                         diacritical [0x625] = 0xB;
3410                         diacritical [0x649] = 0x5; // 'alif maqs.uurah
3411                         diacritical [0x64A] = 0x7; // Yaa'
3412
3413                         for (int i = 0; i < char.MaxValue; i++) {
3414                                 byte mod = 0;
3415                                 byte cat = map [i].Category;
3416                                 switch (cat) {
3417                                 case 0xE: // Latin diacritics
3418                                 case 0x22: // Japanese: circled characters
3419                                         mod = diacritical [i];
3420                                         break;
3421                                 case 0x13: // Arabic
3422                                         if (i == 0x0621)
3423                                                 break; // 0
3424                                         if (diacritical [i] == 0 && decompLength [i] != 0)
3425                                                 diacritical [i] = map [decompValues [decompIndex [i]]].Level2;
3426                                         if (diacritical [i] == 0 && i >= 0xFE8D)
3427                                                 mod = 0x8; // default for arabic
3428                                         break;
3429                                 }
3430                                 if (0x52 <= cat && cat <= 0x7F) // Hangul
3431                                         mod = diacritical [i];
3432                                 if (mod > 0)
3433                                         map [i] = new CharMapEntry (
3434                                                 cat, map [i].Level1, mod);
3435                         }
3436
3437                         // FIXME: this is half a hack, but characters that are
3438                         // NonSpacingMark and still undefined at this point
3439                         // are likely to be nonspacing.
3440                         for (int i = 0; i < char.MaxValue; i++) {
3441                                 if (map [i].Defined ||
3442                                         IsIgnorable (i))
3443                                         continue;
3444                                 switch (i) {
3445                                 // SPECIAL CASES.
3446                                 case 0x02B9:
3447                                 case 0x02BA:
3448                                         break;
3449                                 default:
3450                                         if (Char.GetUnicodeCategory ((char) i) !=
3451                                         UnicodeCategory.NonSpacingMark)
3452                                                 continue;
3453                                         break;
3454                                 }
3455                                 if (diacritical [i] != 0)
3456                                         map [i] = new CharMapEntry (1, 1, diacritical [i]);
3457                                 else
3458                                         AddCharMap ((char) i, 1, 1);
3459                         }
3460
3461                         #endregion
3462                 }
3463
3464                 TextInfo ti = CultureInfo.InvariantCulture.TextInfo; // invariant-culture casing; FillLetterNFKD() calls ToUpper() on it
3465
3466                 // Gives letter i a sort key if it has none yet. With checkUpper, a letter
3467                 // whose upper-case form is in category 0xF takes over that form's weights;
3468                 // otherwise the key comes from the first character of the decomposition
3469                 // of i, with level 2 grown by the level 2 weights of the remaining ones.
3470                 private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap)
3471                 {
3472                         if (map [i].Defined)
3473                                 return;
3474
3475                         int upper = (int) ti.ToUpper ((char) i);
3476                         if (checkUpper && map [upper].Category == 0xF) {
3477                                 if (upper == i)
3478                                         return;
3479                                 FillLetterNFKD (upper, checkUpper, greekRemap);
3480                                 map [i] = new CharMapEntry (0xF, map [upper].Level1, map [upper].Level2);
3481                                 return;
3482                         }
3483
3484                         int first = decompIndex [i];
3485                         if (first == 0)
3486                                 return;
3487                         int baseChar = decompValues [first];
3488                         FillLetterNFKD (baseChar, checkUpper, greekRemap);
3489
3490                         // Add up level 2 of the trailing characters (with greekRemap a
3491                         // 0xC counts as 3). Give up if one is outside category 1.
3492                         int weight = map [baseChar].Level2;
3493                         byte markSum = 0;
3494                         int end = first + decompLength [i];
3495                         for (int pos = first + 1; pos < end; pos++) {
3496                                 int mark = decompValues [pos];
3497                                 if (map [mark].Category != 1)
3498                                         return;
3499                                 bool remapped = greekRemap && map [mark].Level2 == 0xC;
3500                                 markSum += remapped ? (byte) 3 : map [mark].Level2;
3501                         }
3502                         if (markSum > 0)
3503                                 weight += (weight == 0 ? 2 : 0) + markSum;
3504
3505                         // ... but override if the value already exists.
3506                         if (diacritical [i] != 0)
3507                                 weight = diacritical [i];
3508                         map [i] = new CharMapEntry (map [baseChar].Category, map [baseChar].Level1, (byte) weight);
3509                 }
3510
3511                 // Advances fillIndex [hangulCat]; once it wraps to 0, moves on to the next category at 2.
3512                 private void IncrementSequentialIndex (ref byte hangulCat)
3513                 {
3514                         if (++fillIndex [hangulCat] != 0)
3515                                 return;
3516                         // overflown: continue in the following category
3517                         fillIndex [++hangulCat] = 0x2;
3518                 }
3519
3520                 // Sets fillIndex [category] to alphaWeight, then passes c and every code
3521                 // point that latinMap lists for c to AddLetterMap() with updateCount 0.
3522                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3523                 {
3524                         fillIndex [category] = alphaWeight;
3525                         AddLetterMap (c, category, 0);
3526
3527                         ArrayList variants = latinMap [c] as ArrayList;
3528                         if (variants == null)
3529                                 return;
3530                         for (int n = 0; n < variants.Count; n++)
3531                                 AddLetterMap ((char) (int) variants [n], category, 0);
3532                 }
3533
3534                 // Registers `voices` consecutive characters starting at i (Hiragana) and,
3535                 // for each, the character 0x60 above it (Katakana), all in category 0x22.
3536                 private void AddKanaMap (int i, byte voices)
3537                 {
3538                         for (int v = 0; v < voices; v++) {
3539                                 // the first form gets level 2 of 0, the following ones v + 2
3540                                 byte level2 = v == 0 ? (byte) 0 : (byte) (v + 2);
3541                                 AddLetterMapCore ((char) (i + v), 0x22, 0, level2, false); // Hiragana
3542                                 AddLetterMapCore ((char) (i + v + 0x60), 0x22, 0, level2, false); // Katakana
3543                         }
3544                 }
3545
3546                 private void AddLetterMap (char c, byte category, byte updateCount)
3547                 {
3548                         AddLetterMapCore (c, category, updateCount, 0, true); // forwards with level2 = 0 and deferLevel2 = true
3549                 }
3550
3551                 // Maps, in this order, the result of ToSmallForm (c) if it differs from c,
3552                 // the lower-case form of c if still undefined, and c unless ignorable/defined.
3553                 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3554                 {
3555                         // <small> updates index
3556                         char smallBase = ToSmallForm (c);
3557                         if (smallBase != c)
3558                                 AddCharMapGroup (smallBase, category, updateCount, level2, deferLevel2);
3559
3560                         char lower = Char.ToLower (c, CultureInfo.InvariantCulture);
3561                         if (lower != c && !map [(int) lower].Defined)
3562                                 AddLetterMapCore (lower, category, 0, level2, deferLevel2);
3563
3564                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3565                                 return; // c not mapped here, so the index stays as it is
3566                         AddCharMapGroup (c, category, 0, level2, deferLevel2);
3567                         fillIndex [category] += updateCount;
3568                 }
3569
3570                 private bool AddCharMap (char c, byte category, byte increment)
3571                 {
3572                         return AddCharMap (c, category, increment, 0); // same as the 4-argument overload with alt = 0
3573                 }
3574
3575                 // Maps c at fillIndex [category] and adds increment to it; false if c is ignorable or already defined.
3576                 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3577                 {
3578                         if (IsIgnorable ((int) c) || map [(int) c].Defined)
3579                                 return false; // do nothing
3580                         byte weight = fillIndex [category]; // goes to level 2 for category 1, else to level 1; alt takes the other
3581                         map [(int) c] = category == 1 ? new CharMapEntry (category, alt, weight) : new CharMapEntry (category, weight, alt);
3582                         fillIndex [category] += increment;
3583                         return true;
3584                 }
3585
3586                 // AddCharMapGroup() adds a character and its variants to the table
3587                 // in the order below ('+' marks where the weight is increased):
3588                 //      (<small> +)
3589                 //      itself
3590                 //      <fraction>
3591                 //      <full> | <super> | <sub>
3592                 //      <circle> | <wide> (| <narrow>)
3593                 //      +
3594                 //      (vertical +)
3595                 //
3596                 // level2 is fixed (does not increase). sameWeightItems lists the
3597                 // decomposition types that are added without increasing the weight.
3598                 int [] sameWeightItems = new int [] {
3599                         DecompositionFraction,
3600                         DecompositionFull,
3601                         DecompositionSuper,
3602                         DecompositionSub,
3603                         DecompositionCircle,
3604                         DecompositionWide,
3605                         DecompositionNarrow,
3606                         };
3607                 private void AddCharMapGroup (char c, byte category, byte updateCount)
3608                 {
3609                         AddCharMapGroup (c, category, updateCount, 0, true); // deferLevel2 = true: level 2 is read from diacritical []
3610                 }
3611
3612                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3613                 {
3614                         AddCharMapGroup (c, category, updateCount, level2, false); // deferLevel2 = false: level2 is used as given
3615                 }
3616
3617                 // Maps c together with the variants recorded for it in nfkdMap: <small>
3618                 // first, then c and the sameWeightItems variants, then the index moves by
3619                 // updateCount and <vertical> follows. With deferLevel2 the level 2 weight
3620                 // comes from diacritical [] instead of the level2 argument.
3621                 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3622                 {
3623                         int cp = (int) c;
3624                         if (map [cp].Defined)
3625                                 return;
3626                         if (deferLevel2)
3627                                 level2 = diacritical [cp];
3628
3629                         char small = char.MinValue, vertical = char.MinValue; // MinValue: no such variant
3630                         Hashtable variants = (Hashtable) nfkdMap [cp];
3631                         if (variants != null) {
3632                                 object boxed = variants [(byte) DecompositionSmall];
3633                                 if (boxed != null)
3634                                         small = (char) (int) boxed;
3635                                 boxed = variants [(byte) DecompositionVertical];
3636                                 if (boxed != null)
3637                                         vertical = (char) (int) boxed;
3638                         }
3639
3640                         // <small> updates index
3641                         if (small != char.MinValue) {
3642                                 if (deferLevel2 && level2 == 0)
3643                                         level2 = diacritical [small];
3644                                 AddCharMap (small, category, updateCount, level2);
3645                         }
3646                         AddCharMap (c, category, 0, level2); // itself
3647
3648                         // Added without moving the index. A deferred level 2 is looked up per
3649                         // variant; the last value looked up stays in effect for <vertical>.
3650                         for (int n = 0; variants != null && n < sameWeightItems.Length; n++) {
3651                                 object found = variants [(byte) sameWeightItems [n]];
3652                                 if (found == null)
3653                                         continue;
3654                                 int variant = (int) found;
3655                                 if (deferLevel2)
3656                                         level2 = diacritical [variant];
3657                                 AddCharMap ((char) variant, category, 0, level2);
3658                         }
3659
3660                         fillIndex [category] += updateCount; // update index here.
3661                         if (vertical != char.MinValue) {
3662                                 if (deferLevel2 && level2 == 0)
3663                                         level2 = diacritical [vertical];
3664                                 AddCharMap (vertical, category, updateCount, level2);
3665                         }
3666                 }
3667
3668                 // Maps c at the current sequential position of category, then advances that position.
3669                 private void AddCharMapCJK (char c, ref byte category)
3670                 {
3671                         AddCharMap (c, category, 0, 0);
3672                         // Step forward, and once more when landing on 9E F9, which Windows skips (no idea why).
3673                         do {
3674                                 IncrementSequentialIndex (ref category);
3675                         } while (category == 0x9E && fillIndex [category] == 0xF9);
3676                 }
3677
3678                 // Maps c via AddCharMapCJK(), then the characters that nfkdMap records
3679                 // for c under keys 0 through 0x12, apart from a few excluded ones.
3680                 private void AddCharMapGroupCJK (char c, ref byte category)
3681                 {
3682                         AddCharMapCJK (c, ref category);
3683
3684                         // LAMESPEC: on Windows some of the CJK characters in 3200-32B0
3685                         // are incorrectly mapped: Chinese and Japanese Kanji are mixed
3686                         // when ordering them. Those are added by hand here.
3687                         switch (c) {
3688                         case '\u5B78':
3689                                 AddCharMapCJK ('\u32AB', ref category);
3690                                 AddCharMapCJK ('\u323B', ref category);
3691                                 break;
3692                         case '\u52DE':
3693                                 AddCharMapCJK ('\u3298', ref category);
3694                                 AddCharMapCJK ('\u3238', ref category);
3695                                 break;
3696                         case '\u5BEB':
3697                                 AddCharMapCJK ('\u32A2', ref category);
3698                                 break;
3699                         case '\u91AB':
3700                                 // Especially this mapping order totally does not make sense to me.
3701                                 AddCharMapCJK ('\u32A9', ref category);
3702                                 break;
3703                         }
3704
3705                         Hashtable variants = (Hashtable) nfkdMap [(int) c];
3706                         if (variants == null)
3707                                 return;
3708                         for (byte key = 0; key <= 0x12; key++) {
3709                                 object boxed = variants [key];
3710                                 if (boxed == null)
3711                                         continue;
3712                                 int cp = (int) boxed;
3713
3714                                 // Special: they are ignored in this area.
3715                                 // FIXME: check if it is sane
3716                                 bool skip = 0xF900 <= cp && cp <= 0xFAD9;
3717                                 // LAMESPEC (see above): these are left out here as well.
3718                                 skip = skip || cp == 0x32A2 || cp == 0x3298 || cp == 0x3238
3719                                         || cp == 0x32A9 || cp == 0x323B || cp == 0x32AB;
3720                                 if (!skip)
3721                                         AddCharMapCJK ((char) cp, ref category);
3722                         }
3723                 }
3724
3725                 // For now it is only for 0x7 category.
3726                 //
3727                 // Maps c along with the still undefined characters whose one-character
3728                 // decomposition is c, in the order below (+ adds updateCount to the index):
3729                 //      <small> (+, once, if there was any)
3730                 //      <sub> | <super> | <wide> | <narrow>
3731                 //      itself +
3732                 //      any other decomposition type, + after each
3733                 // level2 is fixed (does not increase).
3734                 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3735                 {
3736                         if (map [(int) c].Defined)
3737                                 return;
3738
3739                         // Collect the candidates in one pass over the code points rather
3740                         // than rescanning all of them for each group below. AddCharMap()
3741                         // re-checks Defined/IsIgnorable itself, so the result is the same.
3742                         ArrayList variants = new ArrayList ();
3743                         for (int c2 = 0; c2 < char.MaxValue; c2++) {
3744                                 if (!map [c2].Defined &&
3745                                         decompLength [c2] == 1 &&
3746                                         (int) (decompValues [decompIndex [c2]]) == (int) c)
3747                                         variants.Add (c2);
3748                         }
3749
3750                         // Process in advance (lower primary weight)
3751                         bool updateWeight = false;
3752                         foreach (int c2 in variants) {
3753                                 if (decompType [c2] != DecompositionSmall)
3754                                         continue;
3755                                 updateWeight = true;
3756                                 AddCharMap ((char) c2, category, 0, level2);
3757                         }
3758                         if (updateWeight)
3759                                 fillIndex [category] = (byte) (fillIndex [category] + updateCount);
3760
3761                         // Identical weight
3762                         foreach (int c2 in variants) {
3763                                 switch (decompType [c2]) {
3764                                 case DecompositionSub:
3765                                 case DecompositionSuper:
3766                                 case DecompositionWide:
3767                                 case DecompositionNarrow:
3768                                         AddCharMap ((char) c2, category, 0, level2);
3769                                         break;
3770                                 }
3771                         }
3772
3773                         AddCharMap (c, category, updateCount, level2); // itself
3774
3775                         // Everything not handled above follows, each moving the index on.
3776                         foreach (int c2 in variants) {
3777                                 switch (decompType [c2]) {
3778                                 case DecompositionWide:
3779                                 case DecompositionNarrow:
3780                                 case DecompositionSmall:
3781                                 case DecompositionSub:
3782                                 case DecompositionSuper:
3783                                         continue;
3784                                 default:
3785                                         AddCharMap ((char) c2, category, updateCount, level2);
3786                                         break;
3787                                 }
3788                         }
3789                 }
3790
// Registers c, plus every code point whose last stored decomposition
// value equals c, under the given category and level 2 weight. None of
// the individual registrations advances the fill index; it is advanced
// once, by updateCount, after all of them.
private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2)
{
        // The character itself.
        AddCharMap (c, category, 0, level2);

        // nfkdMap cannot hold several code points that share one NFKD
        // target, so every code point is scanned instead. Note that the
        // scan stops before char.MaxValue itself.
        int target = (int) c;
        for (int cp = 0; cp < char.MaxValue; cp++) {
                int length = decompLength [cp];
                if (length != 0) {
                        int last = decompIndex [cp] + length - 1;
                        if ((int) (decompValues [last]) == target)
                                AddCharMap ((char) cp, category, 0, level2);
                }
        }

        fillIndex [category] += updateCount;
}
3808
// Maps c through ToDecomposed using the DecompositionSmall type and the
// first (not the last) decomposition value.
char ToSmallForm (char c)
{
        const bool useLastValue = false;
        return ToDecomposed (c, DecompositionSmall, useLastValue);
}
3813
// Returns a decomposition value of c when its decomposition type is d,
// otherwise c itself. With tail == false the first stored value is used,
// with tail == true the last one.
char ToDecomposed (char c, byte d, bool tail)
{
        int cp = (int) c;
        if (decompType [cp] == d) {
                int offset = tail ? decompLength [cp] - 1 : 0;
                return (char) decompValues [decompIndex [cp] + offset];
        }
        return c;
}
3823
// Tells whether any entry of jisJapanese has the given code point.
bool ExistsJIS (int cp)
{
        bool found = false;
        foreach (JISCharacter entry in jisJapanese) {
                if (entry.CP == cp) {
                        found = true;
                        break;
                }
        }
        return found;
}
3831
3832                 #endregion
3833
3834                 #region Level 3 properties (Case/Width)
3835
// Level 3 weight as stored in sort keys: a raw weight of zero stays zero,
// any other raw weight is shifted up by 2.
private byte ComputeLevel3Weight (char c)
{
        byte raw = ComputeLevel3WeightRaw (c);
        if (raw == 0)
                return raw;
        return (byte) (raw + 2);
}
3841
// Computes the raw level 3 (case/width) weight of c; ComputeLevel3Weight
// adds 2 to every non-zero result to obtain the sort key value.
//
// The checks run in a fixed order and the first matching one returns, so
// their order is significant. Only the last section ("misc") combines
// several flags into one value.
private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
{
        // CJK compat
        if ('\u3192' <= c && c <= '\u319F')
                return 0;

        // These have a <narrow> NFKD counterpart, and on Windows the
        // narrow characters are regarded as "normal", so the characters
        // listed here are regarded as "wide". grep "<narrow>" to pick
        // them up (ignoring Kana, Hangul etc.)
        switch (c) {
        case '\u3002':
        case '\u300C':
        case '\u300D':
        case '\u3001':
        case '\u30FB':
        case '\u2502':
        case '\u2190':
        case '\u2191':
        case '\u2192':
        case '\u2193':
        case '\u25A0':
        case '\u25CB':
                return 1;
        }

        // Korean
        if ('\u11A8' <= c && c <= '\u11F9')
                return 2;
        if ('\uFFA0' <= c && c <= '\uFFDC')
                return 4;
        if ('\u3130' <= c && c <= '\u3164')
                return 5;
        if ('\u3165' <= c && c <= '\u318E')
                return 4;

        // Georgian Capital letters
        if ('\u10A0' <= c && c <= '\u10C5')
                return 0x10;

        // numbers
        if ('\u2776' <= c && c <= '\u277F')
                return 4;
        if ('\u2780' <= c && c <= '\u2789')
                return 8;
        // U+2776..U+2789 have already returned above, so only
        // U+278A..U+2793 can reach this test.
        if ('\u278A' <= c && c <= '\u2793')
                return 0xC;
        if ('\u2160' <= c && c <= '\u216F')
                return 0x10;
        if ('\u2181' <= c && c <= '\u2182')
                return 0x10;

        // Letterlike symbols U+2135..U+2138 (the original note files
        // them under "Arabic")
        if ('\u2135' <= c && c <= '\u2138')
                return 4;

        // I believe that Windows has a bug on setting level 3
        // weight here. NFKD results in different values.
        if ('\uFE80' < c && c < '\uFF00') {
                // 2(Isolated)/8(Final)/0x18(Medial)
                switch (decompType [(int) c]) {
                case DecompositionIsolated:
                        return 0; // 2;
                case DecompositionFinal:
                        return 8;
                case DecompositionMedial:
                        return 0x18;
                case DecompositionInitial:
                        return 0x10;
                }
                // any other decomposition type falls through to the
                // remaining checks
        }

        // No known reason why these symbols have a level 3 weight.
        if (c == '\u2104' || c == '\u212B')
                return 0x18;
        // U+212B has just been handled, so the effective upper bound
        // of this range is U+212A.
        if ('\u211E' <= c && c <= '\u212A')
                return 0x10;

        // The reason why these have weights is unknown as well.
        switch (c) {
        case '\u01BC':
                return 0x10;
        case '\u06A9':
                return 0x20;
        case '\u06AA':
                return 0x28;
        // Gurmukhi
        case '\u0A39':
        case '\u0A59':
        case '\u0A5A':
        case '\u0A5B':
        case '\u0A5E':
                return 0x10;
        }

        // Base value for the combined section. (A former U+212B entry
        // here was unreachable because U+212B returns 0x18 above.)
        byte ret = 0;
        switch (c) {
        case '\u03C2':
                ret = 8;
                break;
        case '\uFE42':
                ret = 0xA;
                break;
        }

        // misc: for these decomposition types the type constant itself
        // is OR-ed into the weight.
        switch (decompType [(int) c]) {
        case DecompositionWide: // <wide>
        case DecompositionSub: // <sub>
        case DecompositionSuper: // <super>
                ret |= decompType [(int) c];
                break;
        }
        if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
                ret |= 8;
        if (isUppercase [(int) c]) // DerivedCoreProperties
                ret |= 0x10;

        return ret;
}
3959
3960                 #endregion
3961
3962                 #region IsIgnorable
3963 /*
3964                 static bool IsIgnorable (int i)
3965                 {
3966                         if (unicodeAge [i] >= 3.1)
3967                                 return true;
3968                         switch (char.GetUnicodeCategory ((char) i)) {
3969                         case UnicodeCategory.OtherNotAssigned:
3970                         case UnicodeCategory.Format:
3971                                 return true;
3972                         }
3973                         return false;
3974                 }
3975 */
3976
3977                 // FIXME: In the future use DerivedAge.txt to examine character
3978                 // versions and set those ones that have higher version than
3979                 // 1.0 as ignorable.
// Decides whether code point i is completely ignorable. Individually
// listed code points are decided first, then a set of hard-coded ranges,
// and finally the Unicode category reported by the runtime.
static bool IsIgnorable (int i)
{
        switch (i) {
        // Exceptions to the rules below: they must NOT be ignored, even
        // though most of them sit inside one of the ignorable ranges.
        case 0x4d8: case 0x4d9:
        case 0x4e8: case 0x4e9:
        case 0x70F:
        case 0x3036: case 0x303f:
        case 0x337b: case 0xfb1e:
                return false;

        case 0:
        // Presumably added between Unicode 1.0 (LCMapString) and
        // Unicode 3.1 (UnicodeCategory), i.e. they used to be something
        // like OtherNotAssigned as of Unicode 1.1.
        case 0x2df: case 0x387:
        case 0x3d7: case 0x3d8: case 0x3d9:
        case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
        case 0x400: case 0x40d: case 0x450: case 0x45d:
        case 0x587: case 0x58a: case 0x5c4: case 0x640:
        case 0x653: case 0x654: case 0x655: case 0x66d:
        case 0xb56:
        case 0x1e9b: case 0x202f: case 0x20ad:
        case 0x20ae: case 0x20af:
        case 0x20e2: case 0x20e3:
        case 0x2139: case 0x213a: case 0x2183:
        case 0x2425: case 0x2426: case 0x2619:
        case 0x2670: case 0x2671: case 0x3007:
        case 0x3190: case 0x3191:
        case 0xfffc: case 0xfffd:
                return true;
        }

        // Whole scripts / blocks.
        if (i >= 0x0D82 && i <= 0x0DF4          // Sinhala
                || i >= 0x0F00 && i <= 0x0FD1   // Tibetan
                || i >= 0x1000 && i <= 0x1059   // Myanmar
                // Ethiopic, Cherokee, Canadian Syllabics, Ogham, Runic,
                // Tagalog, Hanunoo, Philippine, Buhid, Tagbanwa, Khmer
                // and Mongolian
                || i >= 0x1200 && i <= 0x1DFF
                || i >= 0x1F00 && i <= 0x1FFF   // Greek extension
                || i >= 0x2800 && i <= 0x28FF   // Braille
                || i >= 0xA000 && i <= 0xA48C   // Yi
                || i >= 0xA490 && i <= 0xA4C6)  // Yi
                return true;

        // CJK related ranges.
        if (i >= 0x2E80 && i <= 0x2EF3          // CJK radicals
                || i >= 0x2F00 && i <= 0x2FD5   // Kangxi radicals
                || i >= 0x2FF0 && i <= 0x2FFB   // ideographic description
                || i >= 0x31A0 && i <= 0x31B7   // Bopomofo letter and final
                // ideographic telegraph symbols
                || i >= 0x32C0 && i <= 0x32CB
                || i >= 0x3358 && i <= 0x3370
                || i >= 0x33E0 && i <= 0x33FF)
                return true;

        // Assorted symbols and presentation forms.
        if (i >= 0x25F0 && i <= 0x25F7          // white square with quadrant
                || i >= 0xFB13 && i <= 0xFB17   // Armenian small ligatures
                || i >= 0xFB1D && i <= 0xFE2F   // Hebrew, Arabic, variation selectors
                || i >= 0xFEF5 && i <= 0xFEFC)  // Arabic ligatures
                return true;

        // FIXME: why are they excluded?
        if (i >= 0x01F6 && i <= 0x01F9
                || i >= 0x0218 && i <= 0x0233
                || i >= 0x02A9 && i <= 0x02AD
                || i >= 0x02EA && i <= 0x02EE
                || i >= 0x0349 && i <= 0x036F
                || i >= 0x0488 && i <= 0x048F
                || i >= 0x04D0 && i <= 0x04FF
                || i >= 0x0500 && i <= 0x050F   // actually it matters only for 2.0
                || i >= 0x06D6 && i <= 0x06ED
                || i >= 0x06FA && i <= 0x06FE
                || i >= 0x2048 && i <= 0x204D
                || i >= 0x20e4 && i <= 0x20ea
                || i >= 0x213C && i <= 0x214B
                || i >= 0x21EB && i <= 0x21FF
                || i >= 0x22F2 && i <= 0x22FF
                || i >= 0x237B && i <= 0x239A
                || i >= 0x239B && i <= 0x23CF
                || i >= 0x24EB && i <= 0x24FF
                || i >= 0x2596 && i <= 0x259F
                || i >= 0x25F8 && i <= 0x25FF
                || i >= 0x2672 && i <= 0x2689
                || i >= 0x2768 && i <= 0x2775
                || i >= 0x27d0 && i <= 0x27ff
                || i >= 0x2900 && i <= 0x2aff
                || i >= 0x3033 && i <= 0x303F
                || i >= 0x31F0 && i <= 0x31FF
                || i >= 0x3250 && i <= 0x325F
                || i >= 0x32B1 && i <= 0x32BF
                || i >= 0x3371 && i <= 0x337B
                || i >= 0xFA30 && i <= 0xFA6A)
                return true;

        // Everything else: only Format and OtherNotAssigned characters
        // are ignored by nature; every other category (including
        // PrivateUse and Surrogate) is kept.
        UnicodeCategory category = Char.GetUnicodeCategory ((char) i);
        return category == UnicodeCategory.Format
                || category == UnicodeCategory.OtherNotAssigned;
}
4104
4105                 // To check IsIgnorable sanity, try the driver below under MS.NET.
4106
4107                 /*
4108                 public static void Main ()
4109                 {
4110                         for (int i = 0; i <= char.MaxValue; i++)
4111                                 Dump (i, IsIgnorable (i));
4112                 }
4113
4114                 static void Dump (int i, bool ignore)
4115                 {
4116                         switch (Char.GetUnicodeCategory ((char) i)) {
4117                         case UnicodeCategory.PrivateUse:
4118                         case UnicodeCategory.Surrogate:
4119                                 return; // check nothing
4120                         }
4121
4122                         string s1 = "";
4123                         string s2 = new string ((char) i, 10);
4124                         int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
4125                         if ((ret == 0) == ignore)
4126                                 return;
4127                         Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
4128                 }
4129                 */
4130                 #endregion // IsIgnorable
4131
4132                 #region IsIgnorableSymbol
// Decides whether code point i is ignored when symbols are ignored.
// Anything IsIgnorable already ignores is ignored here too; after that
// come explicit per-code-point lists, a few ranges, and finally rules
// based on the Unicode category and the Char.Is* predicates.
// (The category labels on the lists follow the original author's notes.)
static bool IsIgnorableSymbol (int i)
{
        if (IsIgnorable (i))
                return true;

        switch (i) {
        // *Letter
        case 0x00b5: case 0x01C0: case 0x01C1:
        case 0x01C2: case 0x01C3: case 0x01F6:
        case 0x01F7: case 0x01F8: case 0x01F9:
        case 0x02D0: case 0x02EE: case 0x037A:
        case 0x03D7: case 0x03F3:
        case 0x0400: case 0x040d:
        case 0x0450: case 0x045d:
        case 0x048C: case 0x048D:
        case 0x048E: case 0x048F:
        case 0x0587: case 0x0640: case 0x06E5:
        case 0x06E6: case 0x06FA: case 0x06FB:
        case 0x06FC: case 0x093D: case 0x0950:
        case 0x1E9B: case 0x2139: case 0x3006:
        case 0x3033: case 0x3034: case 0x3035:
        case 0xFE7E: case 0xFE7F:
        // OtherNumber
        case 0x16EE: case 0x16EF: case 0x16F0:
        // LetterNumber
        case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
        case 0x3007: // IDEOGRAPHIC NUMBER ZERO
        case 0x3038: // HANGZHOU NUMERAL TEN
        case 0x3039: // HANGZHOU NUMERAL TWENTY
        case 0x303a: // HANGZHOU NUMERAL THIRTY
        // OtherSymbol
        case 0x2117:
        case 0x327F:
                return true;

        // ModifierSymbol
        case 0x02B9: case 0x02BA: case 0x02C2:
        case 0x02C3: case 0x02C4: case 0x02C5:
        case 0x02C8: case 0x02CC: case 0x02CD:
        case 0x02CE: case 0x02CF: case 0x02D2:
        case 0x02D3: case 0x02D4: case 0x02D5:
        case 0x02D6: case 0x02D7: case 0x02DE:
        case 0x02E5: case 0x02E6: case 0x02E7:
        case 0x02E8: case 0x02E9:
        case 0x309B: case 0x309C:
        // OtherPunctuation
        case 0x055A: // Armenian apostrophe
        case 0x05C0: // Hebrew punctuation
        case 0x0E4F: // Thai FONGMAN
        case 0x0E5A: // Thai ANGKHANKHU
        case 0x0E5B: // Thai KHOMUT
        // CurrencySymbol
        case 0x09F2: // Bengali Rupee Mark
        case 0x09F3: // Bengali Rupee Sign
        // MathSymbol
        case 0x221e: // INF.
        // OtherSymbol
        case 0x0482:
        case 0x09FA:
        case 0x0B70:
                return false;
        }

        // *Letter ranges that are ignored as a whole.
        bool arabicLigaturesB = i >= 0xFE70 && i < 0xFE7C;
        bool cyrillicKomi = i >= 0x0501 && i <= 0x0510;
        bool cjkCompat = i >= 0xFA30 && i < 0xFA70;
        if (arabicLigaturesB || cyrillicKomi || cjkCompat)
                return true;

        char ch = (char) i;
        UnicodeCategory category = Char.GetUnicodeCategory (ch);

        if (category == UnicodeCategory.Surrogate)
                return false; // inconsistent

        if (category == UnicodeCategory.SpacingCombiningMark
                || category == UnicodeCategory.EnclosingMark
                || category == UnicodeCategory.NonSpacingMark
                || category == UnicodeCategory.PrivateUse)
                // Only the Arabic marks U+064B..U+0652 are ignored.
                return i >= 0x064B && i <= 0x0652;

        if (category == UnicodeCategory.Format
                || category == UnicodeCategory.OtherNotAssigned)
                return true;

        // Every remaining category. Letters and digits are never
        // ignored here.
        if (Char.IsLetterOrDigit (ch))
                return false;

        // OtherSymbols that are kept although they are symbols.
        bool keptSymbolRange =
                // latin in a circle
                i >= 0x249A && i <= 0x24E9
                || i >= 0x2100 && i <= 0x2132
                // Japanese
                || i >= 0x3196 && i <= 0x31A0
                // Korean
                || i >= 0x3200 && i <= 0x321C
                // Chinese/Japanese
                || i >= 0x322A && i <= 0x3243
                // CJK
                || i >= 0x3260 && i <= 0x32B0
                || i >= 0x32D0 && i <= 0x3357
                || i >= 0x337B && i <= 0x33DD;
        if (keptSymbolRange)
                return false;

        // This "Digit"/"Number" rule is a mystery; it filters some
        // symbols out.
        if (Char.IsNumber (ch))
                return false;

        // FIXME: should check more
        return Char.IsControl (ch)
                || Char.IsSeparator (ch)
                || Char.IsPunctuation (ch)
                || Char.IsSymbol (ch);
}
4259
4260                 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
4261 /*
4262                 public static void Main ()
4263                 {
4264                         CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
4265                         for (int i = 0; i <= char.MaxValue; i++) {
4266                                 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4267                                 if (uc == UnicodeCategory.Surrogate)
4268                                         continue;
4269
4270                                 bool ret = IsIgnorableSymbol (i);
4271
4272                                 string s1 = "TEST ";
4273                                 string s2 = "TEST " + (char) i;
4274
4275                                 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
4276
4277                                 if (ret != (result == 0))
4278                                         Console.WriteLine ("{0} : {1:x}[{2}]({3})",
4279                                                 ret ? "should not ignore" :
4280                                                         "should ignore",
4281                                                 i,(char) i, uc);
4282                         }
4283                 }
4284 */
4285                 #endregion
4286
4287                 #region NonSpacing
// Decides whether code point i is ignored when non-spacing marks are
// ignored. Anything IsIgnorable ignores is ignored here too; explicit
// code points and ranges override the default rule, which ignores exactly
// the characters the runtime classifies as NonSpacingMark.
static bool IsIgnorableNonSpacing (int i)
{
        if (IsIgnorable (i))
                return true;

        // Individually listed code points take precedence over the
        // ranges below.
        switch (i) {
        case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
        case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
        case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
        case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
        case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
        case 0x0CCD: case 0x0E4E:
                return false;
        case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
        case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
        case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
                return true;
        }

        // Ranges that are always ignored.
        bool alwaysIgnored =
                i >= 0x02b9 && i <= 0x02c5
                || i >= 0x02cc && i <= 0x02d7
                || i >= 0x02e4 && i <= 0x02ef
                || i >= 0x20DD && i <= 0x20E0;
        if (alwaysIgnored)
                return true;

        // Ranges that are never ignored.
        bool neverIgnored =
                i >= 0x064B && i <= 0x0652
                || i >= 0x0941 && i <= 0x0948
                || i >= 0x0AC1 && i <= 0x0ACD
                || i >= 0x0C3E && i <= 0x0C4F
                || i >= 0x0E31 && i <= 0x0E3F;
        if (neverIgnored)
                return false;

        UnicodeCategory category = Char.GetUnicodeCategory ((char) i);
        return category == UnicodeCategory.NonSpacingMark;
}
4325
4326                 // We can reuse IsIgnorableSymbol testcode
4327                 // for IsIgnorableNonSpacing.
4328                 #endregion
4329         }
4330
// One character map entry: a category plus level 1 and level 2 weights.
// Defined is false for a default-initialized entry and true for any
// entry built through the constructor.
struct CharMapEntry
{
        public byte Category;
        public byte Level1;
        public byte Level2; // always a single byte
        public bool Defined;

        // Creates a defined entry with the given category and weights.
        public CharMapEntry (byte category, byte level1, byte level2)
        {
                this.Defined = true;
                this.Category = category;
                this.Level1 = level1;
                this.Level2 = level2;
        }
}
4346
// Immutable pair of a code point (CP) and the JIS value stored with it.
class JISCharacter
{
        public readonly int CP;
        public readonly int JIS;

        // cp is stored as CP, cpJIS as JIS.
        public JISCharacter (int cp, int cpJIS)
        {
                this.JIS = cpJIS;
                this.CP = cp;
        }
}
4358
// Orders JISCharacter objects by their JIS value. Use the shared Instance.
class JISComparer : IComparer
{
        public static readonly JISComparer Instance =
                new JISComparer ();

        // Returns a negative value, zero or a positive value when o1's
        // JIS value is less than, equal to or greater than o2's.
        // Both arguments must be JISCharacter instances; the casts throw
        // InvalidCastException otherwise.
        public int Compare (object o1, object o2)
        {
                JISCharacter j1 = (JISCharacter) o1;
                JISCharacter j2 = (JISCharacter) o2;
                // Compare explicitly instead of subtracting: a difference
                // can overflow and flip its sign for distant values.
                if (j1.JIS < j2.JIS)
                        return -1;
                return j1.JIS > j2.JIS ? 1 : 0;
        }
}
4371
// Immutable pair of a code point (CP) and the name stored with it.
class NonJISCharacter
{
        public readonly int CP;
        public readonly string Name;

        // cp is stored as CP, name as Name.
        public NonJISCharacter (int cp, string name)
        {
                this.Name = name;
                this.CP = cp;
        }
}
4383
// Orders NonJISCharacter objects by ordinal comparison of their names.
// Use the shared Instance.
class NonJISComparer : IComparer
{
        public static readonly NonJISComparer Instance =
                new NonJISComparer ();

        // Both arguments must be NonJISCharacter instances.
        public int Compare (object o1, object o2)
        {
                string left = ((NonJISCharacter) o1).Name;
                string right = ((NonJISCharacter) o2).Name;
                return string.CompareOrdinal (left, right);
        }
}
4396
// Orders DictionaryEntry items by their decimal Value and, when the
// values are equal, by their int Key. Only reachable through Instance.
class DecimalDictionaryValueComparer : IComparer
{
        public static readonly DecimalDictionaryValueComparer Instance
                = new DecimalDictionaryValueComparer ();

        private DecimalDictionaryValueComparer ()
        {
        }

        // Both arguments must be boxed DictionaryEntry values whose Value
        // is a boxed decimal and whose Key is a boxed int; the unboxing
        // casts throw otherwise.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // FIXME: in case of 0, compare decomposition categories
                int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
                if (ret != 0)
                        return ret;
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                // Tie-break on the key. Compare explicitly instead of
                // subtracting: a difference can overflow and flip its sign.
                if (i1 < i2)
                        return -1;
                return i1 > i2 ? 1 : 0;
        }
}
4419
// IComparer for DictionaryEntry items whose Value is a string and
// whose Key is a boxed int. Entries are ordered by Value first;
// entries with equal values are ordered by Key.
class StringDictionaryValueComparer : IComparer
{
        // The only instance; the constructor is private.
        public static readonly StringDictionaryValueComparer Instance
                = new StringDictionaryValueComparer ();

        private StringDictionaryValueComparer ()
        {
        }

        // o1, o2: boxed DictionaryEntry values (string Value, int Key).
        // Returns a negative number, zero, or a positive number when o1
        // sorts before, equal to, or after o2.
        // Throws InvalidCastException when an argument, its Key or its
        // Value is not of the expected type.
        public int Compare (object o1, object o2)
        {
                DictionaryEntry e1 = (DictionaryEntry) o1;
                DictionaryEntry e2 = (DictionaryEntry) o2;
                // String.Compare (string, string) is a culture-sensitive
                // comparison that uses the current culture.
                // NOTE(review): confirm that is intended here rather than
                // an ordinal comparison.
                int ret = String.Compare ((string) e1.Value, (string) e2.Value);
                if (ret != 0)
                        return ret;
                int i1 = (int) e1.Key;
                int i2 = (int) e2.Key;
                // Do not use "i1 - i2" here: the subtraction wraps around
                // for distant keys (e.g. int.MinValue - 1) and would then
                // report the wrong sign.
                return i1.CompareTo (i2);
        }
}
4441
// IComparer over boxed chars. The order is taken from the sort keys
// that CollationElementTable reports for each char: keys are compared
// pairwise, position by position, and within one position by
// Primary, then Secondary, then Thirtiary, then Quarternary.
// When every shared position ties, the char with fewer sort keys
// sorts first.
class UCAComparer : IComparer
{
        // The only instance; the constructor is private.
        public static readonly UCAComparer Instance
                = new UCAComparer ();

        private UCAComparer ()
        {
        }

        // o1, o2: boxed char values (anything else fails on the unbox).
        // Returns the first non-zero weight difference, or the
        // difference of the two sort key counts when there is none.
        public int Compare (object o1, object o2)
        {
                char left = (char) o1;
                char right = (char) o2;

                int leftCount = CollationElementTable.GetSortKeyCount (left);
                int rightCount = CollationElementTable.GetSortKeyCount (right);

                // Only positions that both chars have can be compared.
                int shared = leftCount;
                if (rightCount < shared)
                        shared = rightCount;

                for (int pos = 0; pos < shared; pos++) {
                        SortKeyValue a = CollationElementTable.GetSortKey (left, pos);
                        SortKeyValue b = CollationElementTable.GetSortKey (right, pos);

                        // Fall through to the next level only on a tie.
                        int diff = a.Primary - b.Primary;
                        if (diff == 0)
                                diff = a.Secondary - b.Secondary;
                        if (diff == 0)
                                diff = a.Thirtiary - b.Thirtiary;
                        if (diff == 0)
                                diff = a.Quarternary - b.Quarternary;
                        if (diff != 0)
                                return diff;
                }
                return leftCount - rightCount;
        }
}
4479
// Holds the tailoring rules registered for one LCID and flattens them
// into a single char array. Three kinds of rule can be added; each one
// serializes itself as a record that starts with a kind marker:
//
//      01 : sort key map     (source string -> 4 sort key values)
//      02 : diacritical map  (target value -> replacement value)
//      03 : replacement map  (source string -> replacement string)
class Tailoring
{
        int lcid;
        int alias;
        bool frenchSort;
        // Registered rules (ITailoringMap), in the order they were added.
        ArrayList maps = new ArrayList ();

        // Creates a tailoring for lcid whose Alias is 0.
        public Tailoring (int lcid)
                : this (lcid, 0)
        {
        }

        // Creates a tailoring for lcid with the given alias value.
        public Tailoring (int lcid, int alias)
        {
                this.alias = alias;
                this.lcid = lcid;
        }

        // The lcid passed to the constructor.
        public int LCID {
                get { return lcid; }
        }

        // The alias passed to the constructor (0 when none was given).
        public int Alias {
                get { return alias; }
        }

        // Caller-controlled flag; false until set.
        public bool FrenchSort {
                get { return frenchSort; }
                set { frenchSort = value; }
        }

        // Appends a diacritical map rule (kind 02).
        public void AddDiacriticalMap (byte target, byte replace)
        {
                ITailoringMap rule = new DiacriticalMap (target, replace);
                maps.Add (rule);
        }

        // Appends a sort key map rule (kind 01). Only the first four
        // entries of sortkey are serialized.
        public void AddSortKeyMap (string source, byte [] sortkey)
        {
                ITailoringMap rule = new SortKeyMap (source, sortkey);
                maps.Add (rule);
        }

        // Appends a replacement map rule (kind 03).
        public void AddReplacementMap (string source, string replace)
        {
                ITailoringMap rule = new ReplacementMap (source, replace);
                maps.Add (rule);
        }

        // Returns the records of all registered rules, concatenated in
        // the order the rules were added. The result is empty when no
        // rule has been added.
        public char [] ItemToCharArray ()
        {
                // First pass: serialize every rule and sum the lengths.
                char [][] records = new char [maps.Count][];
                int total = 0;
                int n = 0;
                foreach (ITailoringMap rule in maps) {
                        char [] record = rule.ToCharArray ();
                        records [n++] = record;
                        total += record.Length;
                }

                // Second pass: lay the records out back to back.
                char [] flat = new char [total];
                int offset = 0;
                for (int i = 0; i < records.Length; i++) {
                        records [i].CopyTo (flat, offset);
                        offset += records [i].Length;
                }
                return flat;
        }

        // A rule that can serialize itself as one record.
        interface ITailoringMap
        {
                char [] ToCharArray ();
        }

        // Record layout: { 02, Target, Replace }.
        class DiacriticalMap : ITailoringMap
        {
                public readonly byte Target;
                public readonly byte Replace;

                public DiacriticalMap (byte target, byte replace)
                {
                        Replace = replace;
                        Target = target;
                }

                public char [] ToCharArray ()
                {
                        return new char [] {
                                (char) 2, // kind:DiacriticalMap
                                (char) Target,
                                (char) Replace
                        };
                }
        }

        // Record layout: { 01, Source..., 0, SortKey [0..3], 0 }.
        class SortKeyMap : ITailoringMap
        {
                public readonly string Source;
                public readonly byte [] SortKey;

                public SortKeyMap (string source, byte [] sortkey)
                {
                        SortKey = sortkey;
                        Source = source;
                }

                public char [] ToCharArray ()
                {
                        // 1 (kind) + source + 1 (null) + 4 (key) + 1
                        // (final slot, left at its default '\0').
                        char [] record = new char [Source.Length + 7];
                        record [0] = (char) 1; // kind:SortKeyMap
                        Source.CopyTo (0, record, 1, Source.Length);
                        // record [Source.Length + 1] is never written,
                        // so it null-terminates the source.
                        int keyStart = Source.Length + 2;
                        for (int k = 0; k < 4; k++)
                                record [keyStart + k] = (char) SortKey [k];
                        return record;
                }
        }

        // Record layout: { 03, Source..., 0, Replace..., 0 }.
        class ReplacementMap : ITailoringMap
        {
                public readonly string Source;
                public readonly string Replace;

                public ReplacementMap (string source, string replace)
                {
                        Replace = replace;
                        Source = source;
                }

                public char [] ToCharArray ()
                {
                        // 1 (kind) + source + 1 (null) + replace + 1 (null)
                        char [] record = new char [Source.Length + Replace.Length + 3];
                        record [0] = (char) 3; // kind:ReplaceMap
                        Source.CopyTo (0, record, 1, Source.Length);
                        // The slot after the source is never written, so
                        // it null-terminates the source.
                        Replace.CopyTo (0, record, Source.Length + 2, Replace.Length);
                        // The last slot is never written either, so it
                        // null-terminates the replacement.
                        return record;
                }
        }
}
4611 }