util/unicode/main.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
   4 ** All rights reserved.
   5 ** Contact: Nokia Corporation (qt-info@nokia.com)
   6 **
   7 ** This file is part of the utils of the Qt Toolkit.
   8 **
   9 ** $QT_BEGIN_LICENSE:LGPL$
  10 ** No Commercial Usage
  11 ** This file contains pre-release code and may not be distributed.
  12 ** You may use this file in accordance with the terms and conditions
  13 ** contained in the Technology Preview License Agreement accompanying
  14 ** this package.
  15 **
  16 ** GNU Lesser General Public License Usage
  17 ** Alternatively, this file may be used under the terms of the GNU Lesser
  18 ** General Public License version 2.1 as published by the Free Software
  19 ** Foundation and appearing in the file LICENSE.LGPL included in the
  20 ** packaging of this file.  Please review the following information to
  21 ** ensure the GNU Lesser General Public License version 2.1 requirements
  22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  23 **
  24 ** In addition, as a special exception, Nokia gives you certain additional
  25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
  26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  27 **
  28 ** If you have questions regarding the use of this file, please contact
  29 ** Nokia at qt-info@nokia.com.
  30 **
  31 **
  32 **
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41 #include <qlist.h>
  42 #include <qhash.h>
  43 #include <qfile.h>
  44 #include <qstring.h>
  45 #include <qchar.h>
  46 #include <private/qunicodetables_p.h>
  47 #include <qvector.h>
  48 #include <qdebug.h>
  49
  50
  51 static struct AgeMap {
  52     const char *age;
  53     const QChar::UnicodeVersion version;
  54 } ageMap [] = {
  55     { "1.1", QChar::Unicode_1_1 },
  56     { "2.0", QChar::Unicode_2_0 },
  57     { "2.1", QChar::Unicode_2_1_2 },
  58     { "3.0", QChar::Unicode_3_0 },
  59     { "3.1", QChar::Unicode_3_1 },
  60     { "3.2", QChar::Unicode_3_2 },
  61     { "4.0", QChar::Unicode_4_0 },
  62     { "4.1", QChar::Unicode_4_1 },
  63     { "5.0", QChar::Unicode_5_0 },
  64     { 0, QChar::Unicode_Unassigned }
  65 };
  66 #define CURRENT_UNICODE_VERSION "QChar::Unicode_5_0"
  67
  68 static const char *grapheme_break_string =
  69     "    enum GraphemeBreak {\n"
  70     "        GraphemeBreakOther, \n"
  71     "        GraphemeBreakCR,\n"
  72     "        GraphemeBreakLF,\n"
  73     "        GraphemeBreakControl,\n"
  74     "        GraphemeBreakExtend,\n"
  75     "        GraphemeBreakL,\n"
  76     "        GraphemeBreakV,\n"
  77     "        GraphemeBreakT,\n"
  78     "        GraphemeBreakLV,\n"
  79     "        GraphemeBreakLVT\n"
  80     "    };\n\n";
  81
  82 enum GraphemeBreak {
  83     GraphemeBreakOther,
  84     GraphemeBreakCR,
  85     GraphemeBreakLF,
  86     GraphemeBreakControl,
  87     GraphemeBreakExtend,
  88     GraphemeBreakL,
  89     GraphemeBreakV,
  90     GraphemeBreakT,
  91     GraphemeBreakLV,
  92     GraphemeBreakLVT
  93 };
  94
  95 QHash<QByteArray, GraphemeBreak> grapheme_break_map;
  96
  97 static void initGraphemeBreak()
  98 {
  99     struct GraphemeBreakList {
 100         GraphemeBreak brk;
 101         const char *name;
 102     } breaks[] = {
 103         { GraphemeBreakOther, "Other" },
 104         { GraphemeBreakCR, "CR" },
 105         { GraphemeBreakLF, "LF" },
 106         { GraphemeBreakControl, "Control" },
 107         { GraphemeBreakExtend, "Extend" },
 108         { GraphemeBreakL, "L" },
 109         { GraphemeBreakV, "V" },
 110         { GraphemeBreakT, "T" },
 111         { GraphemeBreakLV, "LV" },
 112         { GraphemeBreakLVT, "LVT" },
 113         { GraphemeBreakOther, 0 }
 114     };
 115     GraphemeBreakList *d = breaks;
 116     while (d->name) {
 117         grapheme_break_map.insert(d->name, d->brk);
 118         ++d;
 119     }
 120 }
 121
 122 const char *word_break_string =
 123     "    enum WordBreak {\n"
 124     "        WordBreakOther,\n"
 125     "        WordBreakFormat,\n"
 126     "        WordBreakKatakana,\n"
 127     "        WordBreakALetter,\n"
 128     "        WordBreakMidLetter,\n"
 129     "        WordBreakMidNum,\n"
 130     "        WordBreakNumeric,\n"
 131     "        WordBreakExtendNumLet\n"
 132     "    };\n\n";
 133
 134 enum WordBreak {
 135     WordBreakOther,
 136     WordBreakFormat,
 137     WordBreakKatakana,
 138     WordBreakALetter,
 139     WordBreakMidLetter,
 140     WordBreakMidNum,
 141     WordBreakNumeric,
 142     WordBreakExtendNumLet
 143 };
 144
 145
 146 QHash<QByteArray, WordBreak> word_break_map;
 147
 148 static void initWordBreak()
 149 {
 150     struct WordBreakList {
 151         WordBreak brk;
 152         const char *name;
 153     } breaks[] = {
 154         { WordBreakFormat, "Format" },
 155         { WordBreakFormat, "Extend" }, // these are copied in from GraphemeBreakProperty.txt
 156         { WordBreakKatakana, "Katakana" },
 157         { WordBreakALetter, "ALetter" },
 158         { WordBreakMidLetter, "MidLetter" },
 159         { WordBreakMidNum, "MidNum" },
 160         { WordBreakNumeric, "Numeric" },
 161         { WordBreakExtendNumLet, "ExtendNumLet" },
 162         { WordBreakFormat,  0 }
 163     };
 164     WordBreakList *d = breaks;
 165     while (d->name) {
 166         word_break_map.insert(d->name, d->brk);
 167         ++d;
 168     }
 169 }
 170
 171
 172 static const char *sentence_break_string =
 173     "    enum SentenceBreak {\n"
 174     "        SentenceBreakOther,\n"
 175     "        SentenceBreakSep,\n"
 176     "        SentenceBreakFormat,\n"
 177     "        SentenceBreakSp,\n"
 178     "        SentenceBreakLower,\n"
 179     "        SentenceBreakUpper,\n"
 180     "        SentenceBreakOLetter,\n"
 181     "        SentenceBreakNumeric,\n"
 182     "        SentenceBreakATerm,\n"
 183     "        SentenceBreakSTerm,\n"
 184     "        SentenceBreakClose\n"
 185     "    };\n\n";
 186
 187 enum SentenceBreak {
 188     SentenceBreakOther,
 189     SentenceBreakSep,
 190     SentenceBreakFormat,
 191     SentenceBreakSp,
 192     SentenceBreakLower,
 193     SentenceBreakUpper,
 194     SentenceBreakOLetter,
 195     SentenceBreakNumeric,
 196     SentenceBreakATerm,
 197     SentenceBreakSTerm,
 198     SentenceBreakClose
 199 };
 200
 201
 202 QHash<QByteArray, SentenceBreak> sentence_break_map;
 203
 204 static void initSentenceBreak()
 205 {
 206     struct SentenceBreakList {
 207         SentenceBreak brk;
 208         const char *name;
 209     } breaks[] = {
 210         { SentenceBreakOther, "Other" },
 211         { SentenceBreakSep, "Sep" },
 212         { SentenceBreakFormat, "Format" },
 213         { SentenceBreakSp, "Sp" },
 214         { SentenceBreakLower, "Lower" },
 215         { SentenceBreakUpper, "Upper" },
 216         { SentenceBreakOLetter, "OLetter" },
 217         { SentenceBreakNumeric, "Numeric" },
 218         { SentenceBreakATerm, "ATerm" },
 219         { SentenceBreakSTerm, "STerm" },
 220         { SentenceBreakClose, "Close" },
 221         { SentenceBreakOther,  0 }
 222     };
 223     SentenceBreakList *d = breaks;
 224     while (d->name) {
 225         sentence_break_map.insert(d->name, d->brk);
 226         ++d;
 227     }
 228 }
 229
 230
 231 // Keep this one in sync with the code in createPropertyInfo
 232 const char *property_string =
 233     "    struct Properties {\n"
 234     "        ushort category : 8;\n"
 235     "        ushort line_break_class : 8;\n"
 236     "        ushort direction : 8;\n"
 237     "        ushort combiningClass :8;\n"
 238     "        ushort joining : 2;\n"
 239     "        signed short digitValue : 6; /* 5 needed */\n"
 240     "        ushort unicodeVersion : 4;\n"
 241     "        ushort lowerCaseSpecial : 1;\n"
 242     "        ushort upperCaseSpecial : 1;\n"
 243     "        ushort titleCaseSpecial : 1;\n"
 244     "        ushort caseFoldSpecial : 1; /* currently unused */\n"
 245     "        signed short mirrorDiff : 16;\n"
 246     "        signed short lowerCaseDiff : 16;\n"
 247     "        signed short upperCaseDiff : 16;\n"
 248     "        signed short titleCaseDiff : 16;\n"
 249     "        signed short caseFoldDiff : 16;\n"
 250     "        ushort graphemeBreak : 8;\n"
 251     "        ushort wordBreak : 8;\n"
 252     "        ushort sentenceBreak : 8;\n"
 253     "    };\n"
 254     "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
 255     "    Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
 256
 257 const char *lineBreakClass =
 258     "    // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
 259     "    // we don't use the XX, AI and CB properties and map them to AL instead.\n"
 260     "    // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
 261     "    enum LineBreakClass {\n"
 262     "        LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n"
 263     "        LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n"
 264     "        LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n"
 265     "        LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n"
 266     "        LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n"
 267     "        LineBreak_JT, LineBreak_SA, LineBreak_SG,\n"
 268     "        LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
 269     "    };\n\n";
 270
 271 const char *methods =
 272     "    Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
 273     "    inline int lineBreakClass(const QChar &ch) {\n"
 274     "        return QUnicodeTables::lineBreakClass(ch.unicode());\n"
 275     "    }\n"
 276     "\n"
 277     "    Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
 278     "    Q_CORE_EXPORT_INLINE int QT_FASTCALL script(const QChar &ch) {\n"
 279     "        return script(ch.unicode());\n"
 280     "    }\n\n";
 281
 282
 283 struct PropertyFlags {
 284     bool operator ==(const PropertyFlags &o) {
 285         return (combiningClass == o.combiningClass
 286                 && category == o.category
 287                 && direction == o.direction
 288                 && joining == o.joining
 289                 && age == o.age
 290                 && digitValue == o.digitValue
 291                 && line_break_class == o.line_break_class
 292                 && mirrorDiff == o.mirrorDiff
 293                 && lowerCaseDiff == o.lowerCaseDiff
 294                 && upperCaseDiff == o.upperCaseDiff
 295                 && titleCaseDiff == o.titleCaseDiff
 296                 && caseFoldDiff == o.caseFoldDiff
 297                 && lowerCaseSpecial == o.lowerCaseSpecial
 298                 && upperCaseSpecial == o.upperCaseSpecial
 299                 && titleCaseSpecial == o.titleCaseSpecial
 300                 && caseFoldSpecial == o.caseFoldSpecial
 301                 && graphemeBreak == o.graphemeBreak
 302                 && wordBreak == o.wordBreak
 303                 && sentenceBreak == o.sentenceBreak
 304             );
 305     }
 306     // from UnicodeData.txt
 307     uchar combiningClass : 8;
 308     QChar::Category category : 5;
 309     QChar::Direction direction : 5;
 310     // from ArabicShaping.txt
 311     QChar::Joining joining : 2;
 312     // from DerivedAge.txt
 313     QChar::UnicodeVersion age : 4;
 314     int digitValue;
 315     uint line_break_class : 5;
 316
 317     int mirrorDiff : 16;
 318
 319     int lowerCaseDiff;
 320     int upperCaseDiff;
 321     int titleCaseDiff;
 322     int caseFoldDiff;
 323     bool lowerCaseSpecial;
 324     bool upperCaseSpecial;
 325     bool titleCaseSpecial;
 326     bool caseFoldSpecial;
 327     GraphemeBreak graphemeBreak;
 328     WordBreak wordBreak;
 329     SentenceBreak sentenceBreak;
 330 };
 331
 332 QList<int> specialCaseMap;
 333 int specialCaseMaxLen = 0;
 334
 335 static int appendToSpecialCaseMap(const QList<int> &map)
 336 {
 337     QList<int> utf16map;
 338     for (int i = 0; i < map.size(); ++i) {
 339         int val = map.at(i);
 340         if (val > 0xffff) {
 341             utf16map << QChar::highSurrogate(val);
 342             utf16map << QChar::lowSurrogate(val);
 343         } else {
 344             utf16map << val;
 345         }
 346     }
 347     specialCaseMaxLen = qMax(specialCaseMaxLen, utf16map.size());
 348     utf16map << 0;
 349
 350     for (int i = 0; i < specialCaseMap.size() - utf16map.size() - 1; ++i) {
 351         int j;
 352         for (j = 0; j < utf16map.size(); ++j) {
 353             if (specialCaseMap.at(i+j) != utf16map.at(j))
 354                 break;
 355         }
 356         if (j == utf16map.size())
 357             return i;
 358     }
 359
 360     int pos = specialCaseMap.size();
 361     specialCaseMap << utf16map;
 362     return pos;
 363 }
 364
 365 struct UnicodeData {
 366     UnicodeData(int codepoint = 0) {
 367         p.category = QChar::NoCategory;
 368         p.combiningClass = 0;
 369
 370         p.direction = QChar::DirL;
 371         // DirR for:  U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF
 372         if ((codepoint >= 0x590 && codepoint <= 0x5ff)
 373             || (codepoint >= 0x7c0 && codepoint <= 0x8ff)
 374             || (codepoint >= 0xfb1d && codepoint <= 0xfb4f)
 375             || (codepoint >= 0x10800 && codepoint <= 0x10fff))
 376             p.direction = QChar::DirR;
 377         // DirAL for: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE
 378         if ((codepoint >= 0x600 && codepoint <= 0x7bf)
 379             || (codepoint >= 0xfb50 && codepoint <= 0xfdcf)
 380             || (codepoint >= 0xfdf0 && codepoint <= 0xfdff)
 381             || (codepoint >= 0xfe70 && codepoint <= 0xfefe))
 382             p.direction = QChar::DirAL;
 383
 384         mirroredChar = 0;
 385         decompositionType = QChar::NoDecomposition;
 386         p.joining = QChar::OtherJoining;
 387         p.age = QChar::Unicode_Unassigned;
 388         p.mirrorDiff = 0;
 389         p.digitValue = -1;
 390         p.line_break_class = QUnicodeTables::LineBreak_AL;
 391         p.lowerCaseDiff = 0;
 392         p.upperCaseDiff = 0;
 393         p.titleCaseDiff = 0;
 394         p.caseFoldDiff = 0;
 395         p.lowerCaseSpecial = 0;
 396         p.upperCaseSpecial = 0;
 397         p.titleCaseSpecial = 0;
 398         p.caseFoldSpecial = 0;
 399         p.graphemeBreak = GraphemeBreakOther;
 400         p.wordBreak = WordBreakOther;
 401         p.sentenceBreak = SentenceBreakOther;
 402         propertyIndex = -1;
 403         excludedComposition = false;
 404     }
 405     PropertyFlags p;
 406
 407     // from UnicodeData.txt
 408     QChar::Decomposition decompositionType;
 409     QList<int> decomposition;
 410
 411     QList<int> specialFolding;
 412
 413     // from BidiMirroring.txt
 414     int mirroredChar;
 415
 416     // CompositionExclusions.txt
 417     bool excludedComposition;
 418
 419     // computed position of unicode property set
 420     int propertyIndex;
 421 };
 422
 423 enum UniDataFields {
 424     UD_Value,
 425     UD_Name,
 426     UD_Category,
 427     UD_CombiningClass,
 428     UD_BidiCategory,
 429     UD_Decomposition,
 430     UD_DecimalDigitValue,
 431     UD_DigitValue,
 432     UD_NumericValue,
 433     UD_Mirrored,
 434     UD_OldName,
 435     UD_Comment,
 436     UD_UpperCase,
 437     UD_LowerCase,
 438     UD_TitleCase
 439 };
 440
 441 QHash<QByteArray, QChar::Category> categoryMap;
 442
 443 static void initCategoryMap()
 444 {
 445     struct Cat {
 446         QChar::Category cat;
 447         const char *name;
 448     } categories [] = {
 449         { QChar::Mark_NonSpacing,          "Mn" },
 450         { QChar::Mark_SpacingCombining,    "Mc" },
 451         { QChar::Mark_Enclosing,           "Me" },
 452
 453         { QChar::Number_DecimalDigit,      "Nd" },
 454         { QChar::Number_Letter,            "Nl" },
 455         { QChar::Number_Other,             "No" },
 456
 457         { QChar::Separator_Space,          "Zs" },
 458         { QChar::Separator_Line,           "Zl" },
 459         { QChar::Separator_Paragraph,      "Zp" },
 460
 461         { QChar::Other_Control,            "Cc" },
 462         { QChar::Other_Format,             "Cf" },
 463         { QChar::Other_Surrogate,          "Cs" },
 464         { QChar::Other_PrivateUse,         "Co" },
 465         { QChar::Other_NotAssigned,        "Cn" },
 466
 467         { QChar::Letter_Uppercase,         "Lu" },
 468         { QChar::Letter_Lowercase,         "Ll" },
 469         { QChar::Letter_Titlecase,         "Lt" },
 470         { QChar::Letter_Modifier,          "Lm" },
 471         { QChar::Letter_Other,             "Lo" },
 472
 473         { QChar::Punctuation_Connector,    "Pc" },
 474         { QChar::Punctuation_Dash,         "Pd" },
 475         { QChar::Punctuation_Open,         "Ps" },
 476         { QChar::Punctuation_Close,        "Pe" },
 477         { QChar::Punctuation_InitialQuote, "Pi" },
 478         { QChar::Punctuation_FinalQuote,   "Pf" },
 479         { QChar::Punctuation_Other,        "Po" },
 480
 481         { QChar::Symbol_Math,              "Sm" },
 482         { QChar::Symbol_Currency,          "Sc" },
 483         { QChar::Symbol_Modifier,          "Sk" },
 484         { QChar::Symbol_Other,             "So" },
 485         { QChar::NoCategory, 0 }
 486     };
 487     Cat *c = categories;
 488     while (c->cat != QChar::NoCategory) {
 489         categoryMap.insert(c->name, c->cat);
 490         ++c;
 491     }
 492 }
 493
 494 QHash<QByteArray, QChar::Direction> directionMap;
 495
 496 static void initDirectionMap()
 497 {
 498     struct Dir {
 499         QChar::Direction dir;
 500         const char *name;
 501     } directions[] = {
 502         { QChar::DirL, "L" },
 503         { QChar::DirR, "R" },
 504         { QChar::DirEN, "EN" },
 505         { QChar::DirES, "ES" },
 506         { QChar::DirET, "ET" },
 507         { QChar::DirAN, "AN" },
 508         { QChar::DirCS, "CS" },
 509         { QChar::DirB, "B" },
 510         { QChar::DirS, "S" },
 511         { QChar::DirWS, "WS" },
 512         { QChar::DirON, "ON" },
 513         { QChar::DirLRE, "LRE" },
 514         { QChar::DirLRO, "LRO" },
 515         { QChar::DirAL, "AL" },
 516         { QChar::DirRLE, "RLE" },
 517         { QChar::DirRLO, "RLO" },
 518         { QChar::DirPDF, "PDF" },
 519         { QChar::DirNSM, "NSM" },
 520         { QChar::DirBN, "BN" },
 521         { QChar::DirL, 0 }
 522     };
 523     Dir *d = directions;
 524     while (d->name) {
 525         directionMap.insert(d->name, d->dir);
 526         ++d;
 527     }
 528 }
 529
 530
 531 QHash<QByteArray, QChar::Decomposition> decompositionMap;
 532
 533 static void initDecompositionMap()
 534 {
 535     struct Dec {
 536         QChar::Decomposition dec;
 537         const char *name;
 538     } decompositions[] = {
 539         { QChar::Canonical, "<canonical>" },
 540         { QChar::Font, "<font>" },
 541         { QChar::NoBreak, "<noBreak>" },
 542         { QChar::Initial, "<initial>" },
 543         { QChar::Medial, "<medial>" },
 544         { QChar::Final, "<final>" },
 545         { QChar::Isolated, "<isolated>" },
 546         { QChar::Circle, "<circle>" },
 547         { QChar::Super, "<super>" },
 548         { QChar::Sub, "<sub>" },
 549         { QChar::Vertical, "<vertical>" },
 550         { QChar::Wide, "<wide>" },
 551         { QChar::Narrow, "<narrow>" },
 552         { QChar::Small, "<small>" },
 553         { QChar::Square, "<square>" },
 554         { QChar::Compat, "<compat>" },
 555         { QChar::Fraction, "<fraction>" },
 556         { QChar::NoDecomposition,  0 }
 557     };
 558     Dec *d = decompositions;
 559     while (d->name) {
 560         decompositionMap.insert(d->name, d->dec);
 561         ++d;
 562     }
 563 }
 564
 565
 566 QHash<int, UnicodeData> unicodeData;
 567 QList<PropertyFlags> uniqueProperties;
 568
 569
 570 QHash<int, int> decompositionLength;
 571 int highestComposedCharacter = 0;
 572 int numLigatures = 0;
 573 int highestLigature = 0;
 574
 575 struct Ligature {ushort u1; ushort u2; ushort ligature;};
 576 // we need them sorted after the first component for fast lookup
 577 bool operator < (const Ligature &l1, const Ligature &l2) {
 578     return l1.u1 < l2.u1;
 579 }
 580
 581 QHash<ushort, QList<Ligature> > ligatureHashes;
 582
 583 QHash<int, int> combiningClassUsage;
 584
 585 int maxLowerCaseDiff = 0;
 586 int maxUpperCaseDiff = 0;
 587 int maxTitleCaseDiff = 0;
 588
 589 static void readUnicodeData()
 590 {
 591     QFile f("data/UnicodeData.txt");
 592     if (!f.exists())
 593         qFatal("Couldn't find UnicodeData.txt");
 594
 595     f.open(QFile::ReadOnly);
 596
 597     while (!f.atEnd()) {
 598         QByteArray line;
 599         line.resize(1024);
 600         int len = f.readLine(line.data(), 1024);
 601         line.truncate(len-1);
 602
 603         int comment = line.indexOf('#');
 604         if (comment >= 0)
 605             line = line.left(comment);
 606         if (line.isEmpty())
 607             continue;
 608
 609         QList<QByteArray> properties = line.split(';');
 610         bool ok;
 611         int codepoint = properties[UD_Value].toInt(&ok, 16);
 612         int lastCodepoint = codepoint;
 613
 614         QByteArray name = properties[UD_Name];
 615         if (name.startsWith('<') && name.contains("First")) {
 616             QByteArray nextLine;
 617             nextLine.resize(1024);
 618             f.readLine(nextLine.data(), 1024);
 619             QList<QByteArray> properties = nextLine.split(';');
 620             lastCodepoint = properties[UD_Value].toInt(&ok, 16);
 621         }
 622
 623         UnicodeData data(codepoint);
 624         data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory);
 625         data.p.combiningClass = properties[UD_CombiningClass].toInt();
 626
 627         if (!combiningClassUsage.contains(data.p.combiningClass))
 628             combiningClassUsage[data.p.combiningClass] = 1;
 629         else
 630             ++combiningClassUsage[data.p.combiningClass];
 631
 632         data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
 633
 634         if (!properties[UD_UpperCase].isEmpty()) {
 635             int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
 636             Q_ASSERT(ok);
 637             data.p.upperCaseDiff = upperCase - codepoint;
 638             maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(data.p.upperCaseDiff));
 639             if (codepoint > 0xffff) {
 640                 // if the condition below doesn't hold anymore we need to modify our case folding code
 641                 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
 642                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
 643             }
 644         }
 645         if (!properties[UD_LowerCase].isEmpty()) {
 646             int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
 647             Q_ASSERT (ok);
 648             data.p.lowerCaseDiff = lowerCase - codepoint;
 649             maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(data.p.lowerCaseDiff));
 650             if (codepoint > 0xffff) {
 651                 // if the condition below doesn't hold anymore we need to modify our case folding code
 652                 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
 653                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
 654             }
 655         }
 656         // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
 657         if (properties[UD_TitleCase].isEmpty())
 658             properties[UD_TitleCase] = properties[UD_UpperCase];
 659         if (!properties[UD_TitleCase].isEmpty()) {
 660             int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
 661             Q_ASSERT (ok);
 662             data.p.titleCaseDiff = titleCase - codepoint;
 663             maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(data.p.titleCaseDiff));
 664             if (codepoint > 0xffff) {
 665                 // if the condition below doesn't hold anymore we need to modify our case folding code
 666                 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
 667                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
 668             }
 669         }
 670
 671         if (!properties[UD_DigitValue].isEmpty())
 672             data.p.digitValue = properties[UD_DigitValue].toInt();
 673
 674         // decompositition
 675         QByteArray decomposition = properties[UD_Decomposition];
 676         if (!decomposition.isEmpty()) {
 677             highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
 678             QList<QByteArray> d = decomposition.split(' ');
 679             if (d[0].contains('<')) {
 680                 data.decompositionType = decompositionMap.value(d[0], QChar::Canonical);
 681                 d.takeFirst();
 682             } else {
 683                 data.decompositionType = QChar::Canonical;
 684             }
 685             for (int i = 0; i < d.size(); ++i)
 686                 data.decomposition.append(d[i].toInt(&ok, 16));
 687             if (!decompositionLength.contains(data.decomposition.size()))
 688                 decompositionLength[data.decomposition.size()] = 1;
 689             else
 690                 ++decompositionLength[data.decomposition.size()];
 691         }
 692
 693         for (int i = codepoint; i <= lastCodepoint; ++i)
 694             unicodeData.insert(i, data);
 695     }
 696
 697 }
 698
 699 static int maxMirroredDiff = 0;
 700
 701 static void readBidiMirroring()
 702 {
 703     QFile f("data/BidiMirroring.txt");
 704     if (!f.exists())
 705         qFatal("Couldn't find BidiMirroring.txt");
 706
 707     f.open(QFile::ReadOnly);
 708
 709     while (!f.atEnd()) {
 710         QByteArray line;
 711         line.resize(1024);
 712         int len = f.readLine(line.data(), 1024);
 713         line.resize(len-1);
 714
 715         int comment = line.indexOf('#');
 716         if (comment >= 0)
 717             line = line.left(comment);
 718
 719         if (line.isEmpty())
 720             continue;
 721         line = line.replace(" ", "");
 722
 723         QList<QByteArray> pair = line.split(';');
 724         Q_ASSERT(pair.size() == 2);
 725
 726         bool ok;
 727         int codepoint = pair[0].toInt(&ok, 16);
 728         int mirror = pair[1].toInt(&ok, 16);
 729
 730         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
 731         d.mirroredChar = mirror;
 732         if (qAbs(codepoint-d.mirroredChar) > maxMirroredDiff)
 733             maxMirroredDiff = qAbs(codepoint - d.mirroredChar);
 734
 735         d.p.mirrorDiff = d.mirroredChar - codepoint;
 736         unicodeData.insert(codepoint, d);
 737     }
 738 }
 739
 740 static void readArabicShaping()
 741 {
 742     QFile f("data/ArabicShaping.txt");
 743     if (!f.exists())
 744         qFatal("Couldn't find ArabicShaping.txt");
 745
 746     f.open(QFile::ReadOnly);
 747
 748     while (!f.atEnd()) {
 749         QByteArray line;
 750         line.resize(1024);
 751         int len = f.readLine(line.data(), 1024);
 752         line.resize(len-1);
 753
 754         int comment = line.indexOf('#');
 755         if (comment >= 0)
 756             line = line.left(comment);
 757         line = line.trimmed();
 758
 759         if (line.isEmpty())
 760             continue;
 761
 762         QList<QByteArray> shaping = line.split(';');
 763         Q_ASSERT(shaping.size() == 4);
 764
 765         bool ok;
 766         int codepoint = shaping[0].toInt(&ok, 16);
 767         QChar::Joining j = QChar::OtherJoining;
 768         QByteArray shape = shaping[2].trimmed();
 769         if (shape == "R")
 770             j = QChar::Right;
 771         else if (shape == "D")
 772             j = QChar::Dual;
 773         else if (shape == "C")
 774             j = QChar::Center;
 775
 776         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
 777         d.p.joining = j;
 778         unicodeData.insert(codepoint, d);
 779     }
 780 }
 781
 782 static void readDerivedAge()
 783 {
 784     QFile f("data/DerivedAge.txt");
 785     if (!f.exists())
 786         qFatal("Couldn't find DerivedAge.txt");
 787
 788     f.open(QFile::ReadOnly);
 789
 790     while (!f.atEnd()) {
 791         QByteArray line;
 792         line.resize(1024);
 793         int len = f.readLine(line.data(), 1024);
 794         line.resize(len-1);
 795
 796         int comment = line.indexOf('#');
 797         if (comment >= 0)
 798             line = line.left(comment);
 799         line.replace(" ", "");
 800
 801         if (line.isEmpty())
 802             continue;
 803
 804         QList<QByteArray> l = line.split(';');
 805         Q_ASSERT(l.size() == 2);
 806
 807         QByteArray codes = l[0];
 808         codes.replace("..", ".");
 809         QList<QByteArray> cl = codes.split('.');
 810
 811         bool ok;
 812         int from = cl[0].toInt(&ok, 16);
 813         int to = from;
 814         if (cl.size() == 2)
 815             to = cl[1].toInt(&ok, 16);
 816
 817         QChar::UnicodeVersion age = QChar::Unicode_Unassigned;
 818         QByteArray ba = l[1];
 819         AgeMap *map = ageMap;
 820         while (map->age) {
 821             if (ba == map->age) {
 822                 age = map->version;
 823                 break;
 824             }
 825             ++map;
 826         }
 827         //qDebug() << hex << from << ".." << to << ba << age;
 828         Q_ASSERT(age != QChar::Unicode_Unassigned);
 829
 830         for (int codepoint = from; codepoint <= to; ++codepoint) {
 831             UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
 832             d.p.age = age;
 833             unicodeData.insert(codepoint, d);
 834         }
 835     }
 836 }
 837
 838
 839 static void readCompositionExclusion()
 840 {
 841     QFile f("data/CompositionExclusions.txt");
 842     if (!f.exists())
 843         qFatal("Couldn't find CompositionExclusions.txt");
 844
 845     f.open(QFile::ReadOnly);
 846
 847     while (!f.atEnd()) {
 848         QByteArray line;
 849         line.resize(1024);
 850         int len = f.readLine(line.data(), 1024);
 851         line.resize(len-1);
 852
 853         int comment = line.indexOf('#');
 854         if (comment >= 0)
 855             line = line.left(comment);
 856         line.replace(" ", "");
 857
 858         if (line.isEmpty())
 859             continue;
 860
 861         Q_ASSERT(!line.contains(".."));
 862
 863         bool ok;
 864         int codepoint = line.toInt(&ok, 16);
 865
 866         UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
 867         d.excludedComposition = true;
 868         unicodeData.insert(codepoint, d);
 869     }
 870
 871     for (int i = 0; i < 0x110000; ++i) {
 872         UnicodeData data = unicodeData.value(i, UnicodeData(i));
 873         if (!data.excludedComposition
 874             && data.decompositionType == QChar::Canonical
 875             && data.decomposition.size() > 1) {
 876             Q_ASSERT(data.decomposition.size() == 2);
 877
 878             uint part1 = data.decomposition.at(0);
 879             uint part2 = data.decomposition.at(1);
 880             UnicodeData first = unicodeData.value(part1, UnicodeData(part1));
 881             if (first.p.combiningClass != 0)
 882                 continue;
 883
 884             ++numLigatures;
 885             highestLigature = qMax(highestLigature, (int)part1);
 886             Ligature l = {(ushort)part1, (ushort)part2, i};
 887             ligatureHashes[part2].append(l);
 888         }
 889     }
 890 }
 891
 892 struct NormalizationCorrection {
 893     uint codepoint;
 894     uint mapped;
 895     uint version;
 896 };
 897
 898 static QByteArray createNormalizationCorrections()
 899 {
 900     QFile f("data/NormalizationCorrections.txt");
 901     if (!f.exists())
 902         qFatal("Couldn't find NormalizationCorrections.txt");
 903
 904     f.open(QFile::ReadOnly);
 905
 906     QByteArray out;
 907
 908     out += "struct NormalizationCorrection {\n"
 909            "    uint ucs4;\n"
 910            "    uint old_mapping;\n"
 911            "    int version;\n"
 912            "};\n\n"
 913
 914            "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
 915
 916     int numCorrections = 0;
 917     while (!f.atEnd()) {
 918         QByteArray line;
 919         line.resize(1024);
 920         int len = f.readLine(line.data(), 1024);
 921         line.resize(len-1);
 922
 923         int comment = line.indexOf('#');
 924         if (comment >= 0)
 925             line = line.left(comment);
 926         line.replace(" ", "");
 927
 928         if (line.isEmpty())
 929             continue;
 930
 931         Q_ASSERT(!line.contains(".."));
 932
 933         QList<QByteArray> fields = line.split(';');
 934         Q_ASSERT(fields.size() == 4);
 935
 936         NormalizationCorrection c;
 937         bool ok;
 938         c.codepoint = fields.at(0).toInt(&ok, 16);
 939         c.mapped = fields.at(1).toInt(&ok, 16);
 940         if (fields.at(3) == "3.2.0")
 941             c.version = QChar::Unicode_3_2;
 942         else if (fields.at(3) == "4.0.0")
 943             c.version = QChar::Unicode_4_0;
 944         else
 945             qFatal("unknown unicode version in NormalizationCorrection.txt");
 946
 947         out += "    { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
 948              + ", " + QString::number(c.version) + " },\n";
 949         ++numCorrections;
 950     }
 951
 952     out += "};\n\n"
 953
 954            "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n\n";
 955
 956
 957     return out;
 958 }
 959
 960
 961 static void computeUniqueProperties()
 962 {
 963     qDebug("computeUniqueProperties:");
 964     for (int uc = 0; uc < 0x110000; ++uc) {
 965         UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
 966
 967         int index = uniqueProperties.indexOf(d.p);
 968         if (index == -1) {
 969             index = uniqueProperties.size();
 970             uniqueProperties.append(d.p);
 971         }
 972         d.propertyIndex = index;
 973         unicodeData.insert(uc, d);
 974     }
 975     qDebug("    %d unicode properties found", uniqueProperties.size());
 976 }
 977
 978
 979 static void readLineBreak()
 980 {
 981     QFile f("data/LineBreak.txt");
 982     if (!f.exists())
 983         qFatal("Couldn't find LineBreak.txt");
 984
 985     f.open(QFile::ReadOnly);
 986
 987     while (!f.atEnd()) {
 988         QByteArray line;
 989         line.resize(1024);
 990         int len = f.readLine(line.data(), 1024);
 991         line.resize(len-1);
 992
 993         int comment = line.indexOf('#');
 994         if (comment >= 0)
 995             line = line.left(comment);
 996         line.replace(" ", "");
 997
 998         if (line.isEmpty())
 999             continue;
1000
1001         QList<QByteArray> l = line.split(';');
1002         Q_ASSERT(l.size() == 2);
1003
1004         QByteArray codes = l[0];
1005         codes.replace("..", ".");
1006         QList<QByteArray> cl = codes.split('.');
1007
1008         bool ok;
1009         int from = cl[0].toInt(&ok, 16);
1010         int to = from;
1011         if (cl.size() == 2)
1012             to = cl[1].toInt(&ok, 16);
1013
1014         // ### Classes XX and AI are left out and mapped to AL for now
1015         QUnicodeTables::LineBreakClass lb = QUnicodeTables::LineBreak_AL;
1016         QByteArray ba = l[1];
1017
1018         if (ba == "AI") lb = QUnicodeTables::LineBreak_AL;
1019         else if (ba == "XX") lb = QUnicodeTables::LineBreak_AL;
1020         else if (ba == "NL") lb = QUnicodeTables::LineBreak_AL;
1021         else if (ba == "OP") lb = QUnicodeTables::LineBreak_OP;
1022         else if (ba == "CL") lb = QUnicodeTables::LineBreak_CL;
1023         else if (ba == "QU") lb = QUnicodeTables::LineBreak_QU;
1024         else if (ba == "GL") lb = QUnicodeTables::LineBreak_GL;
1025         else if (ba == "NS") lb = QUnicodeTables::LineBreak_NS;
1026         else if (ba == "EX") lb = QUnicodeTables::LineBreak_EX;
1027         else if (ba == "SY") lb = QUnicodeTables::LineBreak_SY;
1028         else if (ba == "IS") lb = QUnicodeTables::LineBreak_IS;
1029         else if (ba == "PR") lb = QUnicodeTables::LineBreak_PR;
1030         else if (ba == "PO") lb = QUnicodeTables::LineBreak_PO;
1031         else if (ba == "NU") lb = QUnicodeTables::LineBreak_NU;
1032         else if (ba == "AL") lb = QUnicodeTables::LineBreak_AL;
1033         else if (ba == "ID") lb = QUnicodeTables::LineBreak_ID;
1034         else if (ba == "IN") lb = QUnicodeTables::LineBreak_IN;
1035         else if (ba == "HY") lb = QUnicodeTables::LineBreak_HY;
1036         else if (ba == "BA") lb = QUnicodeTables::LineBreak_BA;
1037         else if (ba == "BB") lb = QUnicodeTables::LineBreak_BB;
1038         else if (ba == "B2") lb = QUnicodeTables::LineBreak_B2;
1039         else if (ba == "ZW") lb = QUnicodeTables::LineBreak_ZW;
1040         else if (ba == "CM") lb = QUnicodeTables::LineBreak_CM;
1041         else if (ba == "SA") lb = QUnicodeTables::LineBreak_SA;
1042         else if (ba == "BK") lb = QUnicodeTables::LineBreak_BK;
1043         else if (ba == "CR") lb = QUnicodeTables::LineBreak_CR;
1044         else if (ba == "LF") lb = QUnicodeTables::LineBreak_LF;
1045         else if (ba == "SG") lb = QUnicodeTables::LineBreak_SG;
1046         else if (ba == "CB") lb = QUnicodeTables::LineBreak_AL;
1047         else if (ba == "SP") lb = QUnicodeTables::LineBreak_SP;
1048         else if (ba == "WJ") lb = QUnicodeTables::LineBreak_WJ;
1049         else if (ba == "H2") lb = QUnicodeTables::LineBreak_H2;
1050         else if (ba == "H3") lb = QUnicodeTables::LineBreak_H3;
1051         else if (ba == "JL") lb = QUnicodeTables::LineBreak_JL;
1052         else if (ba == "JV") lb = QUnicodeTables::LineBreak_JV;
1053         else if (ba == "JT") lb = QUnicodeTables::LineBreak_JT;
1054         else {
1055             qDebug() << "unhandled line break class:" << ba;
1056         }
1057
1058         for (int codepoint = from; codepoint <= to; ++codepoint) {
1059             UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1060             d.p.line_break_class = lb;
1061             unicodeData.insert(codepoint, d);
1062         }
1063     }
1064 }
1065
1066
1067 static void readSpecialCasing()
1068 {
1069 //     qDebug() << "Reading SpecialCasing.txt";
1070     QFile f("data/SpecialCasing.txt");
1071     if (!f.exists())
1072         qFatal("Couldn't find SpecialCasing.txt");
1073
1074     f.open(QFile::ReadOnly);
1075
1076     while (!f.atEnd()) {
1077         QByteArray line;
1078         line.resize(1024);
1079         int len = f.readLine(line.data(), 1024);
1080         line.resize(len-1);
1081
1082         int comment = line.indexOf('#');
1083         if (comment >= 0)
1084             line = line.left(comment);
1085
1086         if (line.isEmpty())
1087             continue;
1088
1089         QList<QByteArray> l = line.split(';');
1090
1091         QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
1092         if (!condition.isEmpty())
1093             // #####
1094             continue;
1095
1096         bool ok;
1097         int codepoint = l[0].trimmed().toInt(&ok, 16);
1098         Q_ASSERT(ok);
1099         Q_ASSERT(codepoint <= 0xffff);
1100
1101 //         qDebug() << "codepoint" << hex << codepoint;
1102 //         qDebug() << line;
1103
1104         QList<QByteArray> lower = l[1].trimmed().split(' ');
1105         QList<int> lowerMap;
1106         for (int i = 0; i < lower.size(); ++i) {
1107             bool ok;
1108             lowerMap.append(lower.at(i).toInt(&ok, 16));
1109             Q_ASSERT(ok);
1110         }
1111
1112         QList<QByteArray> title = l[2].trimmed().split(' ');
1113         QList<int> titleMap;
1114         for (int i = 0; i < title.size(); ++i) {
1115             bool ok;
1116             titleMap.append(title.at(i).toInt(&ok, 16));
1117             if (!ok)
1118                 qDebug() << line << title.at(i);
1119             Q_ASSERT(ok);
1120         }
1121
1122         QList<QByteArray> upper = l[3].trimmed().split(' ');
1123         QList<int> upperMap;
1124         for (int i = 0; i < upper.size(); ++i) {
1125             bool ok;
1126             upperMap.append(upper.at(i).toInt(&ok, 16));
1127             Q_ASSERT(ok);
1128         }
1129
1130
1131         UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1132
1133         Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
1134         Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
1135         Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
1136
1137         if (lowerMap.size() > 1) {
1138             ud.p.lowerCaseSpecial = true;
1139             ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
1140         }
1141         if (titleMap.size() > 1) {
1142             ud.p.titleCaseSpecial = true;
1143             ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
1144         }
1145         if (upperMap.size() > 1) {
1146             ud.p.upperCaseSpecial = true;
1147             ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);;
1148         }
1149
1150         unicodeData.insert(codepoint, ud);
1151     }
1152 }
1153
1154 int maxCaseFoldDiff = 0;
1155
1156 static void readCaseFolding()
1157 {
1158     qDebug() << "Reading CaseFolding.txt";
1159     QFile f("data/CaseFolding.txt");
1160     if (!f.exists())
1161         qFatal("Couldn't find CaseFolding.txt");
1162
1163     f.open(QFile::ReadOnly);
1164
1165     while (!f.atEnd()) {
1166         QByteArray line;
1167         line.resize(1024);
1168         int len = f.readLine(line.data(), 1024);
1169         line.resize(len-1);
1170
1171         int comment = line.indexOf('#');
1172         if (comment >= 0)
1173             line = line.left(comment);
1174
1175         if (line.isEmpty())
1176             continue;
1177
1178         QList<QByteArray> l = line.split(';');
1179
1180         bool ok;
1181         uint codepoint = l[0].trimmed().toInt(&ok, 16);
1182         Q_ASSERT(ok);
1183
1184
1185         l[1] = l[1].trimmed();
1186         if (l[1] == "F" || l[1] == "T")
1187             continue;
1188
1189 //         qDebug() << "codepoint" << hex << codepoint;
1190 //         qDebug() << line;
1191         QList<QByteArray> fold = l[2].trimmed().split(' ');
1192         QList<int> foldMap;
1193         for (int i = 0; i < fold.size(); ++i) {
1194             bool ok;
1195             foldMap.append(fold.at(i).toInt(&ok, 16));
1196             Q_ASSERT(ok);
1197         }
1198
1199         UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1200         if (foldMap.size() == 1) {
1201             ud.p.caseFoldDiff = foldMap.at(0) - codepoint;
1202             maxCaseFoldDiff = qMax(maxCaseFoldDiff, ud.p.caseFoldDiff);
1203             if (codepoint > 0xffff) {
1204                 // if the condition below doesn't hold anymore we need to modify our case folding code
1205                 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
1206                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(foldMap.at(0)));
1207             }
1208             if (foldMap.at(0) != codepoint + ud.p.lowerCaseDiff)
1209                 qDebug() << hex << codepoint;
1210         } else {
1211             Q_ASSERT(false); // we currently don't support full case foldings
1212 //             qDebug() << "special" << hex << foldMap;
1213             ud.p.caseFoldSpecial = true;
1214             ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1215         }
1216         unicodeData.insert(codepoint, ud);
1217     }
1218 }
1219
1220 static void readGraphemeBreak()
1221 {
1222     qDebug() << "Reading GraphemeBreakProperty.txt";
1223     QFile f("data/GraphemeBreakProperty.txt");
1224     if (!f.exists())
1225         qFatal("Couldn't find GraphemeBreakProperty.txt");
1226
1227     f.open(QFile::ReadOnly);
1228
1229     while (!f.atEnd()) {
1230         QByteArray line;
1231         line.resize(1024);
1232         int len = f.readLine(line.data(), 1024);
1233         line.resize(len-1);
1234
1235         int comment = line.indexOf('#');
1236         if (comment >= 0)
1237             line = line.left(comment);
1238
1239         if (line.isEmpty())
1240             continue;
1241
1242         QList<QByteArray> l = line.split(';');
1243
1244         QByteArray codes = l[0].trimmed();
1245         codes.replace("..", ".");
1246         QList<QByteArray> cl = codes.split('.');
1247
1248         bool ok;
1249         int from = cl[0].toInt(&ok, 16);
1250         Q_ASSERT(ok);
1251         int to = from;
1252         if (cl.size() == 2) {
1253             to = cl[1].toInt(&ok, 16);
1254             Q_ASSERT(ok);
1255         }
1256
1257         GraphemeBreak brk = grapheme_break_map.value(l[1].trimmed(), GraphemeBreakOther);
1258
1259         for (int codepoint = from; codepoint <= to; ++codepoint) {
1260             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1261             ud.p.graphemeBreak = brk;
1262             unicodeData.insert(codepoint, ud);
1263         }
1264     }
1265 }
1266
1267 static void readWordBreak()
1268 {
1269     qDebug() << "Reading WordBreakProperty.txt";
1270     QFile f("data/WordBreakProperty.txt");
1271     if (!f.exists())
1272         qFatal("Couldn't find WordBreakProperty.txt");
1273
1274     f.open(QFile::ReadOnly);
1275
1276     while (!f.atEnd()) {
1277         QByteArray line;
1278         line.resize(1024);
1279         int len = f.readLine(line.data(), 1024);
1280         line.resize(len-1);
1281
1282         int comment = line.indexOf('#');
1283         if (comment >= 0)
1284             line = line.left(comment);
1285
1286         if (line.isEmpty())
1287             continue;
1288
1289         QList<QByteArray> l = line.split(';');
1290
1291         QByteArray codes = l[0].trimmed();
1292         codes.replace("..", ".");
1293         QList<QByteArray> cl = codes.split('.');
1294
1295         bool ok;
1296         int from = cl[0].toInt(&ok, 16);
1297         Q_ASSERT(ok);
1298         int to = from;
1299         if (cl.size() == 2) {
1300             to = cl[1].toInt(&ok, 16);
1301             Q_ASSERT(ok);
1302         }
1303
1304         WordBreak brk = word_break_map.value(l[1].trimmed(), WordBreakOther);
1305         Q_ASSERT(brk != WordBreakOther);
1306
1307         for (int codepoint = from; codepoint <= to; ++codepoint) {
1308             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1309             ud.p.wordBreak = brk;
1310             unicodeData.insert(codepoint, ud);
1311         }
1312     }
1313 }
1314
1315 static void readSentenceBreak()
1316 {
1317     qDebug() << "Reading SentenceBreakProperty.txt";
1318     QFile f("data/SentenceBreakProperty.txt");
1319     if (!f.exists())
1320         qFatal("Couldn't find SentenceBreakProperty.txt");
1321
1322     f.open(QFile::ReadOnly);
1323
1324     while (!f.atEnd()) {
1325         QByteArray line;
1326         line.resize(1024);
1327         int len = f.readLine(line.data(), 1024);
1328         line.resize(len-1);
1329
1330         int comment = line.indexOf('#');
1331         if (comment >= 0)
1332             line = line.left(comment);
1333
1334         if (line.isEmpty())
1335             continue;
1336
1337         QList<QByteArray> l = line.split(';');
1338
1339         QByteArray codes = l[0].trimmed();
1340         codes.replace("..", ".");
1341         QList<QByteArray> cl = codes.split('.');
1342
1343         bool ok;
1344         int from = cl[0].toInt(&ok, 16);
1345         Q_ASSERT(ok);
1346         int to = from;
1347         if (cl.size() == 2) {
1348             to = cl[1].toInt(&ok, 16);
1349             Q_ASSERT(ok);
1350         }
1351
1352         SentenceBreak brk = sentence_break_map.value(l[1].trimmed(), SentenceBreakOther);
1353         Q_ASSERT(brk != SentenceBreakOther);
1354
1355         for (int codepoint = from; codepoint <= to; ++codepoint) {
1356             UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1357             ud.p.sentenceBreak = brk;
1358             unicodeData.insert(codepoint, ud);
1359         }
1360     }
1361 }
1362
1363 #if 0
// This piece of code does full case folding and comparison. We currently
// don't use it, since it causes lots of issues with things such as
// case-insensitive search and replace.
1367 static inline void foldCase(uint ch, ushort *out)
1368 {
1369     const QUnicodeTables::Properties *p = qGetProp(ch);
1370     if (!p->caseFoldSpecial) {
1371         *(out++) = ch + p->caseFoldDiff;
1372     } else {
1373         const ushort *folded = specialCaseMap + p->caseFoldDiff;
1374         while (*folded)
1375             *out++ = *folded++;
1376     }
1377     *out = 0;
1378 }
1379
1380 static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
1381 {
1382     if (a == b)
1383         return 0;
1384     if (a == 0)
1385         return 1;
1386     if (b == 0)
1387         return -1;
1388
1389     while (a != ae && b != be) {
1390         const QUnicodeTables::Properties *pa = qGetProp(*a);
1391         const QUnicodeTables::Properties *pb = qGetProp(*b);
1392         if (pa->caseFoldSpecial | pb->caseFoldSpecial)
1393             goto special;
1394             int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
1395         if ((diff))
1396             return diff;
1397         ++a;
1398         ++b;
1399         }
1400     }
1401     if (a == ae) {
1402         if (b == be)
1403             return 0;
1404         return -1;
1405     }
1406     return 1;
1407 special:
1408     ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1409     ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1410     abuf[0] = bbuf[0] = 0;
1411     ushort *ap = abuf;
1412     ushort *bp = bbuf;
1413     while (1) {
1414         if (!*ap) {
1415             if (a == ae) {
1416                 if (!*bp && b == be)
1417                     return 0;
1418                 return -1;
1419             }
1420             foldCase(*(a++), abuf);
1421             ap = abuf;
1422         }
1423         if (!*bp) {
1424             if (b == be)
1425                 return 1;
1426             foldCase(*(b++), bbuf);
1427             bp = bbuf;
1428         }
1429         if (*ap != *bp)
1430             return (int)*ap - (int)*bp;
1431         ++ap;
1432         ++bp;
1433     }
1434 }
1435
1436
1437 static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
1438 {
1439     if (a == 0)
1440         return 1;
1441     if (b == 0)
1442         return -1;
1443
1444     while (a != ae && *b) {
1445         const QUnicodeTables::Properties *pa = qGetProp(*a);
1446         const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
1447         if (pa->caseFoldSpecial | pb->caseFoldSpecial)
1448             goto special;
1449         int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
1450         if ((diff))
1451             return diff;
1452         ++a;
1453         ++b;
1454     }
1455     if (a == ae) {
1456         if (!*b)
1457             return 0;
1458         return -1;
1459     }
1460     return 1;
1461
1462 special:
1463     ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1464     ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1465     abuf[0] = bbuf[0] = 0;
1466     ushort *ap = abuf;
1467     ushort *bp = bbuf;
1468     while (1) {
1469         if (!*ap) {
1470             if (a == ae) {
1471                 if (!*bp && !*b)
1472                     return 0;
1473                 return -1;
1474             }
1475             foldCase(*(a++), abuf);
1476             ap = abuf;
1477         }
1478         if (!*bp) {
1479             if (!*b)
1480                 return 1;
1481             foldCase(*(b++), bbuf);
1482             bp = bbuf;
1483         }
1484         if (*ap != *bp)
1485             return (int)*ap - (int)*bp;
1486         ++ap;
1487         ++bp;
1488     }
1489 }
1490 #endif
1491
1492 #if 0
1493 static QList<QByteArray> blockNames;
1494 struct BlockInfo
1495 {
1496     int blockIndex;
1497     int firstCodePoint;
1498     int lastCodePoint;
1499 };
1500 static QList<BlockInfo> blockInfoList;
1501
1502 static void readBlocks()
1503 {
1504     QFile f("data/Blocks.txt");
1505     if (!f.exists())
1506         qFatal("Couldn't find Blocks.txt");
1507
1508     f.open(QFile::ReadOnly);
1509
1510     while (!f.atEnd()) {
1511         QByteArray line = f.readLine();
1512         line.resize(line.size() - 1);
1513
1514         int comment = line.indexOf("#");
1515         if (comment >= 0)
1516             line = line.left(comment);
1517
1518         line.replace(" ", "");
1519
1520         if (line.isEmpty())
1521             continue;
1522
1523         int semicolon = line.indexOf(';');
1524         Q_ASSERT(semicolon >= 0);
1525         QByteArray codePoints = line.left(semicolon);
1526         QByteArray blockName = line.mid(semicolon + 1);
1527
1528         int blockIndex = blockNames.indexOf(blockName);
1529         if (blockIndex < 0) {
1530             blockNames.append(blockName);
1531             blockIndex = blockNames.indexOf(blockName);
1532             Q_ASSERT(blockIndex >= 0);
1533         }
1534
1535         int dotdot = codePoints.indexOf("..");
1536         Q_ASSERT(dotdot >= 0);
1537         bool unused;
1538         int first = codePoints.left(dotdot).toInt(&unused, 16);
1539         int last = codePoints.mid(dotdot + 2).toInt(&unused, 16);
1540
1541         BlockInfo blockInfo = { blockIndex, first, last };
1542         blockInfoList.append(blockInfo);
1543     }
1544 }
1545 #endif
1546
1547 static QList<QByteArray> scriptNames;
1548 static QHash<int, int> scriptAssignment;
1549 static QHash<int, int> scriptHash;
1550
1551 struct ExtraBlock {
1552     int block;
1553     QVector<int> vector;
1554 };
1555
1556 static QList<ExtraBlock> extraBlockList;
1557
1558
1559 static void readScripts()
1560 {
1561     scriptNames.append("Common");
1562
1563     static const char *files[] = {
1564         "data/ScriptsInitial.txt",
1565         "data/Scripts.txt",
1566         "data/ScriptsCorrections.txt"
1567     };
1568     enum { fileCount = sizeof(files) / sizeof(const char *) };
1569
1570     for (int i = 0; i < fileCount; ++i) {
1571         QFile f(files[i]);
1572         if (!f.exists())
1573             qFatal("Couldn't find %s", files[i]);
1574
1575
1576         f.open(QFile::ReadOnly);
1577
1578         while (!f.atEnd()) {
1579             QByteArray line = f.readLine();
1580             line.resize(line.size() - 1);
1581
1582             int comment = line.indexOf("#");
1583             if (comment >= 0)
1584                 line = line.left(comment);
1585
1586             line.replace(" ", "");
1587             line.replace("_", "");
1588
1589             if (line.isEmpty())
1590                 continue;
1591
1592             int semicolon = line.indexOf(';');
1593             Q_ASSERT(semicolon >= 0);
1594             QByteArray codePoints = line.left(semicolon);
1595             QByteArray scriptName = line.mid(semicolon + 1);
1596
1597             int scriptIndex = scriptNames.indexOf(scriptName);
1598             if (scriptIndex < 0) {
1599                 scriptNames.append(scriptName);
1600                 scriptIndex = scriptNames.indexOf(scriptName);
1601                 Q_ASSERT(scriptIndex >= 0);
1602             }
1603
1604             int dotdot = codePoints.indexOf("..");
1605             bool unused;
1606             int first = -1, last = -1;
1607             if (dotdot >= 0) {
1608                 first = codePoints.left(dotdot).toInt(&unused, 16);
1609                 last = codePoints.mid(dotdot + 2).toInt(&unused, 16);
1610             } else {
1611                 first = codePoints.toInt(&unused, 16);
1612             }
1613
1614             if (last != -1) {
1615                 for (int i = first; i <= last; ++i)
1616                     scriptAssignment[i] = scriptIndex;
1617             } else {
1618                 scriptAssignment[first] = scriptIndex;
1619             }
1620         }
1621     }
1622 }
1623
1624
1625 static int scriptSentinel = 0;
1626
1627 QByteArray createScriptEnumDeclaration()
1628 {
1629     static const char *specialScripts[] = {
1630         "Common",
1631         "Arabic",
1632         "Armenian",
1633         "Bengali",
1634         "Cyrillic",
1635         "Devanagari",
1636         "Georgian",
1637         "Greek",
1638         "Gujarati",
1639         "Gurmukhi",
1640         "Hangul",
1641         "Hebrew",
1642         "Kannada",
1643         "Khmer",
1644         "Lao",
1645         "Malayalam",
1646         "Myanmar",
1647         "Ogham",
1648         "Oriya",
1649         "Runic",
1650         "Sinhala",
1651         "Syriac",
1652         "Tamil",
1653         "Telugu",
1654         "Thaana",
1655         "Thai",
1656         "Tibetan",
1657         "Inherited"
1658     };
1659     const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *);
1660
1661     // generate script enum
1662     QByteArray declaration;
1663
1664     declaration += "    // See http://www.unicode.org/reports/tr24/tr24-5.html\n\n";
1665     declaration += "    enum Script {\n        Common";
1666
1667     int uniqueScripts = 1; // Common
1668
1669     // output the ones with special processing first
1670     for (int i = 1; i < scriptNames.size(); ++i) {
1671         QByteArray scriptName = scriptNames.at(i);
1672         // does the script require special processing?
1673         bool special = false;
1674         for (int s = 0; !special && s < specialScriptsCount; ++s) {
1675             if (scriptName == specialScripts[s])
1676                 special = true;
1677         }
1678         if (!special) {
1679             scriptHash[i] =  0; // alias for 'Common'
1680             continue;
1681         } else {
1682             ++uniqueScripts;
1683             scriptHash[i] = i;
1684         }
1685
1686         declaration += ",\n        ";
1687         declaration += scriptName;
1688     }
1689     declaration += ",\n        ScriptCount = Inherited";
1690
1691     // output the ones that are an alias for 'Common'
1692     for (int i = 1; i < scriptNames.size(); ++i) {
1693         if (scriptHash.value(i) != 0)
1694             continue;
1695         QByteArray scriptName = scriptNames.at(i);
1696         scriptName += " = Common";
1697         declaration += ",\n        ";
1698         declaration += scriptName;
1699     }
1700
1701     declaration += "\n    };\n";
1702
1703     scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32
1704     declaration += "    enum { ScriptSentinel = ";
1705     declaration += QByteArray::number(scriptSentinel);
1706     declaration += " };\n\n";
1707     return declaration;
1708 }
1709
1710 QByteArray createScriptTableDeclaration()
1711 {
1712     Q_ASSERT(scriptSentinel > 0);
1713
1714     QByteArray declaration;
1715
1716     const int unicodeBlockCount = 512; // number of unicode blocks
1717     const int unicodeBlockSize = 128; // size of each block
1718     declaration = "enum { UnicodeBlockCount = ";
1719     declaration += QByteArray::number(unicodeBlockCount);
1720     declaration += " }; // number of unicode blocks\n";
1721     declaration += "enum { UnicodeBlockSize = ";
1722     declaration += QByteArray::number(unicodeBlockSize);
1723     declaration += " }; // size of each block\n\n";
1724
1725     // script table
1726     declaration += "namespace QUnicodeTables {\n\nstatic const unsigned char uc_scripts[] = {\n";
1727     for (int i = 0; i < unicodeBlockCount; ++i) {
1728         int block = (((i << 7) & 0xff00) | ((i & 1) * 0x80));
1729         int blockAssignment[unicodeBlockSize];
1730         for (int x = 0; x < unicodeBlockSize; ++x) {
1731             int codePoint = (i << 7) | x;
1732             blockAssignment[x] = scriptAssignment.value(codePoint, 0);
1733         }
1734         bool allTheSame = true;
1735         const int originalScript = blockAssignment[0];
1736         const int script = scriptHash.value(originalScript);
1737         for (int x = 1; allTheSame && x < unicodeBlockSize; ++x) {
1738             const int s = scriptHash.value(blockAssignment[x]);
1739             if (s != script)
1740                 allTheSame = false;
1741         }
1742
1743         if (allTheSame) {
1744             declaration += "    ";
1745             declaration += scriptNames.value(originalScript);
1746             declaration += ", /* U+";
1747             declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1748             declaration += '-';
1749             declaration +=
1750                 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1751             declaration += " */\n";
1752         } else {
1753             const int value = extraBlockList.size() + scriptSentinel;
1754             const int offset =
1755                 ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
1756
1757             declaration += "    ";
1758             declaration += QByteArray::number(value);
1759             declaration += ", /* U+";
1760             declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1761             declaration += '-';
1762             declaration +=
1763                 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1764             declaration += " at offset ";
1765             declaration += QByteArray::number(offset);
1766             declaration += " */\n";
1767
1768             ExtraBlock extraBlock;
1769             extraBlock.block = block;
1770             extraBlock.vector.resize(unicodeBlockSize);
1771             for (int x = 0; x < unicodeBlockSize; ++x)
1772                 extraBlock.vector[x] = blockAssignment[x];
1773
1774             extraBlockList.append(extraBlock);
1775         }
1776     }
1777
1778     for (int i = 0; i < extraBlockList.size(); ++i) {
1779         const int value = i + scriptSentinel;
1780         const int offset =
1781             ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
1782         const ExtraBlock &extraBlock = extraBlockList.at(i);
1783         const int block = extraBlock.block;
1784
1785         declaration += "\n\n    /* U+";
1786         declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1787         declaration += '-';
1788         declaration +=
1789             QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1790         declaration += " at offset ";
1791         declaration += QByteArray::number(offset);
1792         declaration += " */\n    ";
1793
1794         for (int x = 0; x < extraBlock.vector.size(); ++x) {
1795             const int o = extraBlock.vector.at(x);
1796
1797             declaration += scriptNames.value(o);
1798             if (x < extraBlock.vector.size() - 1 || i < extraBlockList.size() - 1)
1799                 declaration += ',';
1800             if ((x & 7) == 7 && x < extraBlock.vector.size() - 1)
1801                 declaration += "\n    ";
1802             else
1803                 declaration += ' ';
1804         }
1805     }
1806     declaration += "\n};\n\n} // namespace QUnicodeTables\n\n";
1807
1808     qDebug("createScriptTableDeclaration: table size is %d bytes",
1809            unicodeBlockCount + (extraBlockList.size() * unicodeBlockSize));
1810
1811     return declaration;
1812 }
1813
#if 0
// Debugging aid (compiled out): prints the parsed data of every code point in
// the inclusive range [from, to], followed by a blank separator line.
static void dump(int from, int to)
{
    for (int uc = from; uc <= to; ++uc) {
        UnicodeData data = unicodeData.value(uc, UnicodeData(uc));
        qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
               uc, data.p.category, data.p.combiningClass, data.p.direction,
               data.otherCase, data.mirroredChar, data.p.joining, data.p.age);

        // only code points that decompose get the second line
        if (data.decompositionType == QChar::NoDecomposition)
            continue;

        qDebug("    decomposition: type=%d, length=%d, first=%x",
               data.decompositionType, data.decomposition.size(), data.decomposition[0]);
    }
    qDebug(" ");
}
#endif
1829
1830 struct PropertyBlock {
1831     PropertyBlock() { index = -1; }
1832     int index;
1833     QList<int> properties;
1834     bool operator ==(const PropertyBlock &other) { return properties == other.properties; }
1835 };
1836
1837 static QByteArray createPropertyInfo()
1838 {
1839     qDebug("createPropertyInfo:");
1840
1841     const int BMP_BLOCKSIZE=32;
1842     const int BMP_SHIFT = 5;
1843     const int BMP_END = 0x11000;
1844     const int SMP_END = 0x110000;
1845     const int SMP_BLOCKSIZE = 256;
1846     const int SMP_SHIFT = 8;
1847
1848     QList<PropertyBlock> blocks;
1849     QList<int> blockMap;
1850
1851     int used = 0;
1852
1853     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
1854         PropertyBlock b;
1855         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
1856             int uc = block*BMP_BLOCKSIZE + i;
1857             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
1858             b.properties.append(d.propertyIndex);
1859         }
1860         int index = blocks.indexOf(b);
1861         if (index == -1) {
1862             index = blocks.size();
1863             b.index = used;
1864             used += BMP_BLOCKSIZE;
1865             blocks.append(b);
1866         }
1867         blockMap.append(blocks.at(index).index);
1868     }
1869
1870     int bmp_blocks = blocks.size();
1871     Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
1872
1873     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
1874         PropertyBlock b;
1875         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
1876             int uc = block*SMP_BLOCKSIZE + i;
1877             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
1878             b.properties.append(d.propertyIndex);
1879         }
1880         int index = blocks.indexOf(b);
1881         if (index == -1) {
1882             index = blocks.size();
1883             b.index = used;
1884             used += SMP_BLOCKSIZE;
1885             blocks.append(b);
1886         }
1887         blockMap.append(blocks.at(index).index);
1888     }
1889
1890     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
1891     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
1892     int bmp_mem = bmp_block_data + bmp_trie;
1893     qDebug("    %d unique blocks in BMP.",blocks.size());
1894     qDebug("        block data uses: %d bytes", bmp_block_data);
1895     qDebug("        trie data uses : %d bytes", bmp_trie);
1896
1897     int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
1898     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
1899     int smp_mem = smp_block_data + smp_trie;
1900     qDebug("    %d unique blocks in SMP.",blocks.size()-bmp_blocks);
1901     qDebug("        block data uses: %d bytes", smp_block_data);
1902     qDebug("        trie data uses : %d bytes", smp_trie);
1903
1904     qDebug("\n        properties use : %d bytes", uniqueProperties.size()*20);
1905     qDebug("    memory usage: %d bytes", bmp_mem+smp_mem + uniqueProperties.size()*20);
1906
1907     QByteArray out;
1908     out += "static const unsigned short uc_property_trie[] = {\n";
1909
1910     // first write the map
1911     out += "    // 0x" + QByteArray::number(BMP_END, 16);
1912     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
1913         if (!(i % 8)) {
1914             if (out.endsWith(' '))
1915                 out.chop(1);
1916             if (!((i*BMP_BLOCKSIZE) % 0x1000))
1917                 out += "\n";
1918             out += "\n    ";
1919         }
1920         out += QByteArray::number(blockMap.at(i) + blockMap.size());
1921         out += ", ";
1922     }
1923     if (out.endsWith(' '))
1924         out.chop(1);
1925     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";;
1926     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
1927         if (!(i % 8)) {
1928             if (out.endsWith(' '))
1929                 out.chop(1);
1930             if (!(i % (0x10000/SMP_BLOCKSIZE)))
1931                 out += "\n";
1932             out += "\n    ";
1933         }
1934         out += QByteArray::number(blockMap.at(i) + blockMap.size());
1935         out += ", ";
1936     }
1937     if (out.endsWith(' '))
1938         out.chop(1);
1939     out += "\n";
1940     // write the data
1941     for (int i = 0; i < blocks.size(); ++i) {
1942         if (out.endsWith(' '))
1943             out.chop(1);
1944         out += "\n";
1945         const PropertyBlock &b = blocks.at(i);
1946         for (int j = 0; j < b.properties.size(); ++j) {
1947             if (!(j % 8)) {
1948                 if (out.endsWith(' '))
1949                     out.chop(1);
1950                 out += "\n    ";
1951             }
1952             out += QByteArray::number(b.properties.at(j));
1953             out += ", ";
1954         }
1955     }
1956
1957     // we reserve one bit more than in the assert below for the sign
1958     Q_ASSERT(maxMirroredDiff < (1<<12));
1959     Q_ASSERT(maxLowerCaseDiff < (1<<14));
1960     Q_ASSERT(maxUpperCaseDiff < (1<<14));
1961     Q_ASSERT(maxTitleCaseDiff < (1<<14));
1962     Q_ASSERT(maxCaseFoldDiff < (1<<14));
1963
1964     if (out.endsWith(' '))
1965         out.chop(1);
1966     out += "\n};\n\n"
1967
1968            "#define GET_PROP_INDEX(ucs4) \\\n"
1969            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
1970            "        ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
1971            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
1972            "        : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
1973            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
1974            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
1975            "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
1976            "(uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
1977            "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n"
1978
1979
1980            "static const QUnicodeTables::Properties uc_properties [] = {\n";
1981
1982     // keep in sync with the property declaration
1983     for (int i = 0; i < uniqueProperties.size(); ++i) {
1984         PropertyFlags p = uniqueProperties.at(i);
1985         out += "    { ";
1986 //     "        ushort category : 8;\n"
1987         out += QByteArray::number( p.category );
1988         out += ", ";
1989 //     "        ushort line_break_class : 8;\n"
1990         out += QByteArray::number( p.line_break_class );
1991         out += ", ";
1992 //     "        ushort direction : 8;\n"
1993         out += QByteArray::number( p.direction );
1994         out += ", ";
1995 //     "        ushort combiningClass :8;\n"
1996         out += QByteArray::number( p.combiningClass );
1997         out += ", ";
1998 //     "        ushort joining : 2;\n"
1999         out += QByteArray::number( p.joining );
2000         out += ", ";
2001 //     "        signed short digitValue : 6;\n /* 5 needed */"
2002         out += QByteArray::number( p.digitValue );
2003         out += ", ";
2004 //     "        ushort unicodeVersion : 4;\n"
2005         out += QByteArray::number( p.age );
2006         out += ", ";
2007 //     "        ushort lowerCaseSpecial : 1;\n"
2008 //     "        ushort upperCaseSpecial : 1;\n"
2009 //     "        ushort titleCaseSpecial : 1;\n"
2010 //     "        ushort caseFoldSpecial : 1;\n"
2011         out += QByteArray::number( p.lowerCaseSpecial );
2012         out += ", ";
2013         out += QByteArray::number( p.upperCaseSpecial );
2014         out += ", ";
2015         out += QByteArray::number( p.titleCaseSpecial );
2016         out += ", ";
2017         out += QByteArray::number( p.caseFoldSpecial );
2018         out += ", ";
2019 //     "        signed short mirrorDiff : 16;\n"
2020 //     "        signed short lowerCaseDiff : 16;\n"
2021 //     "        signed short upperCaseDiff : 16;\n"
2022 //     "        signed short titleCaseDiff : 16;\n"
2023 //     "        signed short caseFoldDiff : 16;\n"
2024         out += QByteArray::number( p.mirrorDiff );
2025         out += ", ";
2026         out += QByteArray::number( p.lowerCaseDiff );
2027         out += ", ";
2028         out += QByteArray::number( p.upperCaseDiff );
2029         out += ", ";
2030         out += QByteArray::number( p.titleCaseDiff );
2031         out += ", ";
2032         out += QByteArray::number( p.caseFoldDiff );
2033         out += ", ";
2034         out += QByteArray::number( p.graphemeBreak );
2035         out += ", ";
2036         out += QByteArray::number( p.wordBreak );
2037         out += ", ";
2038         out += QByteArray::number( p.sentenceBreak );
2039         out += "},\n";
2040     }
2041     out += "};\n\n";
2042
2043     out += "static inline const QUnicodeTables::Properties *qGetProp(uint ucs4)\n"
2044            "{\n"
2045            "    int index = GET_PROP_INDEX(ucs4);\n"
2046            "    return uc_properties + index;\n"
2047            "}\n"
2048            "\n"
2049            "static inline const QUnicodeTables::Properties *qGetProp(ushort ucs2)\n"
2050            "{\n"
2051            "    int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2052            "    return uc_properties + index;\n"
2053            "}\n"
2054            "\n"
2055            "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(uint ucs4)\n"
2056            "{\n"
2057            "    int index = GET_PROP_INDEX(ucs4);\n"
2058            "    return uc_properties + index;\n"
2059            "}\n"
2060            "\n"
2061            "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(ushort ucs2)\n"
2062            "{\n"
2063            "    int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2064            "    return uc_properties + index;\n"
2065            "}\n\n";
2066
2067     out += "#define CURRENT_VERSION "CURRENT_UNICODE_VERSION"\n\n";
2068
2069     out += "static const ushort specialCaseMap [] = {";
2070     for (int i = 0; i < specialCaseMap.size(); ++i) {
2071         if (!(i % 16))
2072             out += "\n   ";
2073         out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16);
2074         if (i < specialCaseMap.size() - 1)
2075             out += ",";
2076     }
2077     out += "\n};\n";
2078     out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n";
2079
2080     qDebug() << "Special case map uses " << specialCaseMap.size()*2 << "bytes";
2081
2082     return out;
2083 }
2084
2085
2086 struct DecompositionBlock {
2087     DecompositionBlock() { index = -1; }
2088     int index;
2089     QList<int> decompositionPositions;
2090     bool operator ==(const DecompositionBlock &other)
2091         { return decompositionPositions == other.decompositionPositions; }
2092 };
2093
2094 static QByteArray createCompositionInfo()
2095 {
2096     qDebug("createCompositionInfo:");
2097
2098     const int BMP_BLOCKSIZE=16;
2099     const int BMP_SHIFT = 4;
2100     const int BMP_END = 0x3400; // start of Han
2101     const int SMP_END = 0x30000;
2102     const int SMP_BLOCKSIZE = 256;
2103     const int SMP_SHIFT = 8;
2104
2105     if(SMP_END <= highestComposedCharacter)
2106         qFatal("end of table smaller than highest composed character at %x", highestComposedCharacter);
2107
2108     QList<DecompositionBlock> blocks;
2109     QList<int> blockMap;
2110     QList<unsigned short> decompositions;
2111
2112     int used = 0;
2113     int tableIndex = 0;
2114
2115     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2116         DecompositionBlock b;
2117         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2118             int uc = block*BMP_BLOCKSIZE + i;
2119             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2120             if (!d.decomposition.isEmpty()) {
2121                 int utf16Chars = 0;
2122                 for (int j = 0; j < d.decomposition.size(); ++j)
2123                     utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1;
2124                 decompositions.append(d.decompositionType + (utf16Chars<<8));
2125                 for (int j = 0; j < d.decomposition.size(); ++j) {
2126                     int code = d.decomposition.at(j);
2127                     if (code > 0x10000) {
2128                         // save as surrogate pair
2129                         code -= 0x10000;
2130                         ushort high = code/0x400 + 0xd800;
2131                         ushort low = code%0x400 + 0xdc00;
2132                         decompositions.append(high);
2133                         decompositions.append(low);
2134                     } else {
2135                         decompositions.append(code);
2136                     }
2137                 }
2138                 b.decompositionPositions.append(tableIndex);
2139                 tableIndex += utf16Chars + 1;
2140             } else {
2141                 b.decompositionPositions.append(0xffff);
2142             }
2143         }
2144         int index = blocks.indexOf(b);
2145         if (index == -1) {
2146             index = blocks.size();
2147             b.index = used;
2148             used += BMP_BLOCKSIZE;
2149             blocks.append(b);
2150         }
2151         blockMap.append(blocks.at(index).index);
2152     }
2153
2154     int bmp_blocks = blocks.size();
2155     Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
2156
2157     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2158         DecompositionBlock b;
2159         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2160             int uc = block*SMP_BLOCKSIZE + i;
2161             UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2162             if (!d.decomposition.isEmpty()) {
2163                 int utf16Chars = 0;
2164                 for (int j = 0; j < d.decomposition.size(); ++j)
2165                     utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1;
2166                 decompositions.append(d.decompositionType + (utf16Chars<<8));
2167                 for (int j = 0; j < d.decomposition.size(); ++j) {
2168                     int code = d.decomposition.at(j);
2169                     if (code > 0x10000) {
2170                         // save as surrogate pair
2171                         code -= 0x10000;
2172                         ushort high = code/0x400 + 0xd800;
2173                         ushort low = code%0x400 + 0xdc00;
2174                         decompositions.append(high);
2175                         decompositions.append(low);
2176                     } else {
2177                         decompositions.append(code);
2178                     }
2179                 }
2180                 b.decompositionPositions.append(tableIndex);
2181                 tableIndex += utf16Chars + 1;
2182             } else {
2183                 b.decompositionPositions.append(0xffff);
2184             }
2185         }
2186         int index = blocks.indexOf(b);
2187         if (index == -1) {
2188             index = blocks.size();
2189             b.index = used;
2190             used += SMP_BLOCKSIZE;
2191             blocks.append(b);
2192         }
2193         blockMap.append(blocks.at(index).index);
2194     }
2195
2196     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2197     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2198     int bmp_mem = bmp_block_data + bmp_trie;
2199     qDebug("    %d unique blocks in BMP.",blocks.size());
2200     qDebug("        block data uses: %d bytes", bmp_block_data);
2201     qDebug("        trie data uses : %d bytes", bmp_trie);
2202     qDebug("        memory usage: %d bytes", bmp_mem);
2203
2204     int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
2205     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
2206     int smp_mem = smp_block_data + smp_trie;
2207     qDebug("    %d unique blocks in SMP.",blocks.size()-bmp_blocks);
2208     qDebug("        block data uses: %d bytes", smp_block_data);
2209     qDebug("        trie data uses : %d bytes", smp_trie);
2210
2211     qDebug("\n        decomposition table use : %d bytes", decompositions.size()*2);
2212     qDebug("    memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2);
2213
2214     QByteArray out;
2215
2216     out += "static const unsigned short uc_decomposition_trie[] = {\n";
2217
2218     // first write the map
2219     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2220     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2221         if (!(i % 8)) {
2222             if (out.endsWith(' '))
2223                 out.chop(1);
2224             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2225                 out += "\n";
2226             out += "\n    ";
2227         }
2228         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2229         out += ", ";
2230     }
2231     if (out.endsWith(' '))
2232         out.chop(1);
2233     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";;
2234     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2235         if (!(i % 8)) {
2236             if (out.endsWith(' '))
2237                 out.chop(1);
2238             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2239                 out += "\n";
2240             out += "\n    ";
2241         }
2242         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2243         out += ", ";
2244     }
2245     if (out.endsWith(' '))
2246         out.chop(1);
2247     out += "\n";
2248     // write the data
2249     for (int i = 0; i < blocks.size(); ++i) {
2250         if (out.endsWith(' '))
2251             out.chop(1);
2252         out += "\n";
2253         const DecompositionBlock &b = blocks.at(i);
2254         for (int j = 0; j < b.decompositionPositions.size(); ++j) {
2255             if (!(j % 8)) {
2256                 if (out.endsWith(' '))
2257                     out.chop(1);
2258                 out += "\n    ";
2259             }
2260             out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
2261             out += ", ";
2262         }
2263     }
2264
2265     if (out.endsWith(' '))
2266         out.chop(1);
2267     out += "\n};\n\n"
2268
2269            "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
2270            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2271            "        ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2272            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2273            "        : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2274            "           ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2275            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2276            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
2277            "           : 0xffff))\n\n"
2278
2279            "static const unsigned short uc_decomposition_map[] = {\n";
2280
2281     for (int i = 0; i < decompositions.size(); ++i) {
2282         if (!(i % 8)) {
2283             if (out.endsWith(' '))
2284                 out.chop(1);
2285             out += "\n    ";
2286         }
2287         out += "0x" + QByteArray::number(decompositions.at(i), 16);
2288         out += ", ";
2289     }
2290
2291     if (out.endsWith(' '))
2292         out.chop(1);
2293     out += "\n};\n\n";
2294
2295     return out;
2296 }
2297
2298 static QByteArray createLigatureInfo()
2299 {
2300     qDebug("createLigatureInfo: numLigatures=%d", numLigatures);
2301
2302     QList<DecompositionBlock> blocks;
2303     QList<int> blockMap;
2304     QList<unsigned short> ligatures;
2305
2306     const int BMP_BLOCKSIZE = 32;
2307     const int BMP_SHIFT = 5;
2308     const int BMP_END = 0x3100;
2309     Q_ASSERT(highestLigature < BMP_END);
2310
2311     int used = 0;
2312     int tableIndex = 0;
2313
2314     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2315         DecompositionBlock b;
2316         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2317             int uc = block*BMP_BLOCKSIZE + i;
2318             QList<Ligature> l = ligatureHashes.value(uc);
2319             if (!l.isEmpty()) {
2320                 b.decompositionPositions.append(tableIndex);
2321                 qSort(l);
2322
2323                 ligatures.append(l.size());
2324                 for (int i = 0; i < l.size(); ++i) {
2325                     Q_ASSERT(l.at(i).u2 == uc);
2326                     ligatures.append(l.at(i).u1);
2327                     ligatures.append(l.at(i).ligature);
2328                 }
2329                 tableIndex += 2*l.size() + 1;
2330             } else {
2331                 b.decompositionPositions.append(0xffff);
2332             }
2333         }
2334         int index = blocks.indexOf(b);
2335         if (index == -1) {
2336             index = blocks.size();
2337             b.index = used;
2338             used += BMP_BLOCKSIZE;
2339             blocks.append(b);
2340         }
2341         blockMap.append(blocks.at(index).index);
2342     }
2343
2344     int bmp_blocks = blocks.size();
2345     Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
2346
2347     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2348     int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2349     int bmp_mem = bmp_block_data + bmp_trie;
2350     qDebug("    %d unique blocks in BMP.",blocks.size());
2351     qDebug("        block data uses: %d bytes", bmp_block_data);
2352     qDebug("        trie data uses : %d bytes", bmp_trie);
2353     qDebug("        ligature data uses : %d bytes", ligatures.size()*2);
2354     qDebug("        memory usage: %d bytes", bmp_mem + ligatures.size() * 2);
2355
2356     QByteArray out;
2357
2358
2359     out += "static const unsigned short uc_ligature_trie[] = {\n";
2360
2361     // first write the map
2362     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2363     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2364         if (!(i % 8)) {
2365             if (out.endsWith(' '))
2366                 out.chop(1);
2367             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2368                 out += "\n";
2369             out += "\n    ";
2370         }
2371         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2372         out += ", ";
2373     }
2374     if (out.endsWith(' '))
2375         out.chop(1);
2376     out += "\n";
2377     // write the data
2378     for (int i = 0; i < blocks.size(); ++i) {
2379         if (out.endsWith(' '))
2380             out.chop(1);
2381         out += "\n";
2382         const DecompositionBlock &b = blocks.at(i);
2383         for (int j = 0; j < b.decompositionPositions.size(); ++j) {
2384             if (!(j % 8)) {
2385                 if (out.endsWith(' '))
2386                     out.chop(1);
2387                 out += "\n    ";
2388             }
2389             out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
2390             out += ", ";
2391         }
2392     }
2393     if (out.endsWith(' '))
2394         out.chop(1);
2395     out += "\n};\n\n"
2396
2397            "#define GET_LIGATURE_INDEX(u2) "
2398            "(u2 < 0x" + QByteArray::number(BMP_END, 16) + " ? "
2399            "uc_ligature_trie[uc_ligature_trie[u2>>" + QByteArray::number(BMP_SHIFT) +
2400            "] + (u2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")] : 0xffff);\n\n"
2401
2402            "static const unsigned short uc_ligature_map [] = {\n";
2403
2404     for (int i = 0; i < ligatures.size(); ++i) {
2405         if (!(i % 8)) {
2406             if (out.endsWith(' '))
2407                 out.chop(1);
2408             out += "\n    ";
2409         }
2410         out += "0x" + QByteArray::number(ligatures.at(i), 16);
2411         out += ", ";
2412     }
2413
2414     if (out.endsWith(' '))
2415         out.chop(1);
2416     out += "\n};\n\n";
2417
2418     return out;
2419 }
2420
2421 QByteArray createCasingInfo()
2422 {
2423     QByteArray out;
2424
2425     out += "struct CasingInfo {\n"
2426            "    uint codePoint : 16;\n"
2427            "    uint flags : 8;\n"
2428            "    uint offset : 8;\n"
2429            "};\n\n";
2430
2431     return out;
2432 }
2433
2434 int main(int, char **)
2435 {
2436     initCategoryMap();
2437     initDirectionMap();
2438     initDecompositionMap();
2439     initGraphemeBreak();
2440     initWordBreak();
2441     initSentenceBreak();
2442
2443     readUnicodeData();
2444     readBidiMirroring();
2445     readArabicShaping();
2446     readDerivedAge();
2447     readCompositionExclusion();
2448     readLineBreak();
2449     readSpecialCasing();
2450     readCaseFolding();
2451     // readBlocks();
2452     readScripts();
2453     readGraphemeBreak();
2454     readWordBreak();
2455     readSentenceBreak();
2456
2457     computeUniqueProperties();
2458     QByteArray properties = createPropertyInfo();
2459     QByteArray compositions = createCompositionInfo();
2460     QByteArray ligatures = createLigatureInfo();
2461     QByteArray normalizationCorrections = createNormalizationCorrections();
2462     QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
2463     QByteArray scriptTableDeclaration = createScriptTableDeclaration();
2464
2465     QFile f("../../src/corelib/tools/qunicodetables.cpp");
2466     f.open(QFile::WriteOnly|QFile::Truncate);
2467
2468     QByteArray header =
2469         "/****************************************************************************\n"
2470         "**\n"
2471         "** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).\n"
2472         "** All rights reserved.\n"
2473         "** Contact: Nokia Corporation (qt-info@nokia.com)\n"
2474         "**\n"
2475         "** This file is part of the QtCore module of the Qt Toolkit.\n"
2476         "**\n"
2477         "** $QT_BEGIN_LICENSE:LGPL$\n"
2478         "** No Commercial Usage\n"
2479         "** This file contains pre-release code and may not be distributed.\n"
2480         "** You may use this file in accordance with the terms and conditions\n"
2481         "** contained in the Technology Preview License Agreement accompanying\n"
2482         "** this package.\n"
2483         "**\n"
2484         "** GNU Lesser General Public License Usage\n"
2485         "** Alternatively, this file may be used under the terms of the GNU Lesser\n"
2486         "** General Public License version 2.1 as published by the Free Software\n"
2487         "** Foundation and appearing in the file LICENSE.LGPL included in the\n"
2488         "** packaging of this file.  Please review the following information to\n"
2489         "** ensure the GNU Lesser General Public License version 2.1 requirements\n"
2490         "** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
2491         "**\n"
2492         "** In addition, as a special exception, Nokia gives you certain additional\n"
2493         "** rights.  These rights are described in the Nokia Qt LGPL Exception\n"
2494         "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
2495         "**\n"
2496         "** If you have questions regarding the use of this file, please contact\n"
2497         "** Nokia at qt-info@nokia.com.\n"
2498         "**\n"
2499         "**\n"
2500         "**\n"
2501         "**\n"
2502         "**\n"
2503         "**\n"
2504         "**\n"
2505         "**\n"
2506         "** $QT_END_LICENSE$\n"
2507         "**\n"
2508         "****************************************************************************/\n\n"
2509
2510         "/* This file is autogenerated from the Unicode 5.0 database. Do not edit */\n\n";
2511
2512     QByteArray warning =
2513         "//\n"
2514         "//  W A R N I N G\n"
2515         "//  -------------\n"
2516         "//\n"
2517         "// This file is not part of the Qt API.  It exists for the convenience\n"
2518         "// of internal files.  This header file may change from version to version\n"
2519         "// without notice, or even be removed.\n"
2520         "//\n"
2521         "// We mean it.\n"
2522         "//\n\n";
2523
2524     f.write(header);
2525     f.write("QT_BEGIN_NAMESPACE\n\n");
2526     f.write(properties);
2527     f.write(compositions);
2528     f.write(ligatures);
2529     f.write(normalizationCorrections);
2530     f.write(scriptTableDeclaration);
2531     f.write("\nQT_END_NAMESPACE\n");
2532     f.close();
2533
2534     f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
2535     f.open(QFile::WriteOnly | QFile::Truncate);
2536     f.write(header);
2537     f.write(warning);
2538     f.write("#ifndef QUNICODETABLES_P_H\n"
2539             "#define QUNICODETABLES_P_H\n\n"
2540             "#include <QtCore/qchar.h>\n\n"
2541             "QT_BEGIN_NAMESPACE\n\n");
2542     f.write("namespace QUnicodeTables {\n");
2543     f.write(property_string);
2544     f.write("\n");
2545     f.write(scriptEnumDeclaration);
2546     f.write("\n");
2547     f.write(lineBreakClass);
2548     f.write("\n");
2549     f.write(methods);
2550     f.write("\n");
2551     f.write(grapheme_break_string);
2552     f.write("\n");
2553     f.write(word_break_string);
2554     f.write("\n");
2555     f.write(sentence_break_string);
2556     f.write("\n}\n\n"
2557             "QT_END_NAMESPACE\n\n"
2558             "#endif\n");
2559     f.close();
2560
2561     qDebug() << "maxMirroredDiff  = " << hex << maxMirroredDiff;
2562     qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
2563     qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
2564     qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
2565     qDebug() << "maxCaseFoldDiff  = " << hex << maxCaseFoldDiff;
2566 #if 0
2567 //     dump(0, 0x7f);
2568 //     dump(0x620, 0x640);
2569 //     dump(0x10000, 0x10020);
2570 //     dump(0x10800, 0x10820);
2571
2572     qDebug("decompositionLength used:");
2573     int totalcompositions = 0;
2574     int sum = 0;
2575     for (int i = 1; i < 20; ++i) {
2576         qDebug("    length %d used %d times", i, decompositionLength.value(i, 0));
2577         totalcompositions += i*decompositionLength.value(i, 0);
2578         sum += decompositionLength.value(i, 0);
2579     }
2580     qDebug("    len decomposition map %d, average length %f, num composed chars %d",
2581            totalcompositions, (float)totalcompositions/(float)sum,  sum);
2582     qDebug("highest composed character %x", highestComposedCharacter);
2583     qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
2584
2585     qBubbleSort(ligatures);
2586     for (int i = 0; i < ligatures.size(); ++i)
2587         qDebug("%s", ligatures.at(i).data());
2588
2589 //     qDebug("combiningClass usage:");
2590 //     int numClasses = 0;
2591 //     for (int i = 0; i < 255; ++i) {
2592 //         int num = combiningClassUsage.value(i, 0);
2593 //         if (num) {
2594 //             ++numClasses;
2595 //             qDebug("    combiningClass %d used %d times", i, num);
2596 //         }
2597 //     }
2598 //     qDebug("total of %d combining classes used", numClasses);
2599
2600 #endif
2601 }
2602