(MSVC 2002/2003) Use 64-bit versions of ftell and fseek
[qt-netbsd.git] / util / unicode / main.cpp
blob5982d81ef3c8b171984ee11ec0fb6c8632ed39a5
1 /****************************************************************************
2 **
3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4 ** All rights reserved.
5 ** Contact: Nokia Corporation (qt-info@nokia.com)
6 **
7 ** This file is part of the utils of the Qt Toolkit.
8 **
9 ** $QT_BEGIN_LICENSE:LGPL$
10 ** No Commercial Usage
11 ** This file contains pre-release code and may not be distributed.
12 ** You may use this file in accordance with the terms and conditions
13 ** contained in the Technology Preview License Agreement accompanying
14 ** this package.
16 ** GNU Lesser General Public License Usage
17 ** Alternatively, this file may be used under the terms of the GNU Lesser
18 ** General Public License version 2.1 as published by the Free Software
19 ** Foundation and appearing in the file LICENSE.LGPL included in the
20 ** packaging of this file. Please review the following information to
21 ** ensure the GNU Lesser General Public License version 2.1 requirements
22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
24 ** In addition, as a special exception, Nokia gives you certain additional
25 ** rights. These rights are described in the Nokia Qt LGPL Exception
26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
28 ** If you have questions regarding the use of this file, please contact
29 ** Nokia at qt-info@nokia.com.
38 ** $QT_END_LICENSE$
40 ****************************************************************************/
41 #include <qlist.h>
42 #include <qhash.h>
43 #include <qfile.h>
44 #include <qstring.h>
45 #include <qchar.h>
46 #include <private/qunicodetables_p.h>
47 #include <qvector.h>
48 #include <qdebug.h>
51 static struct AgeMap {
52 const char *age;
53 const QChar::UnicodeVersion version;
54 } ageMap [] = {
55 { "1.1", QChar::Unicode_1_1 },
56 { "2.0", QChar::Unicode_2_0 },
57 { "2.1", QChar::Unicode_2_1_2 },
58 { "3.0", QChar::Unicode_3_0 },
59 { "3.1", QChar::Unicode_3_1 },
60 { "3.2", QChar::Unicode_3_2 },
61 { "4.0", QChar::Unicode_4_0 },
62 { "4.1", QChar::Unicode_4_1 },
63 { "5.0", QChar::Unicode_5_0 },
64 { 0, QChar::Unicode_Unassigned }
66 #define CURRENT_UNICODE_VERSION "QChar::Unicode_5_0"
// Textual declaration of the GraphemeBreak enum. It lists the same
// enumerators, in the same order, as the enum defined right below; keep the
// two in sync. (Presumably written into the generated header -- the use site
// is not part of this section.)
static const char *grapheme_break_string =
    " enum GraphemeBreak {\n"
    " GraphemeBreakOther, \n"
    " GraphemeBreakCR,\n"
    " GraphemeBreakLF,\n"
    " GraphemeBreakControl,\n"
    " GraphemeBreakExtend,\n"
    " GraphemeBreakL,\n"
    " GraphemeBreakV,\n"
    " GraphemeBreakT,\n"
    " GraphemeBreakLV,\n"
    " GraphemeBreakLVT\n"
    " };\n\n";

// Grapheme break classes used by this tool; numbering starts at 0.
enum GraphemeBreak {
    GraphemeBreakOther = 0,
    GraphemeBreakCR,
    GraphemeBreakLF,
    GraphemeBreakControl,
    GraphemeBreakExtend,
    GraphemeBreakL,
    GraphemeBreakV,
    GraphemeBreakT,
    GraphemeBreakLV,
    GraphemeBreakLVT
};
95 QHash<QByteArray, GraphemeBreak> grapheme_break_map;
97 static void initGraphemeBreak()
99 struct GraphemeBreakList {
100 GraphemeBreak brk;
101 const char *name;
102 } breaks[] = {
103 { GraphemeBreakOther, "Other" },
104 { GraphemeBreakCR, "CR" },
105 { GraphemeBreakLF, "LF" },
106 { GraphemeBreakControl, "Control" },
107 { GraphemeBreakExtend, "Extend" },
108 { GraphemeBreakL, "L" },
109 { GraphemeBreakV, "V" },
110 { GraphemeBreakT, "T" },
111 { GraphemeBreakLV, "LV" },
112 { GraphemeBreakLVT, "LVT" },
113 { GraphemeBreakOther, 0 }
115 GraphemeBreakList *d = breaks;
116 while (d->name) {
117 grapheme_break_map.insert(d->name, d->brk);
118 ++d;
// Textual declaration of the WordBreak enum. It lists the same enumerators,
// in the same order, as the enum defined right below; keep the two in sync.
const char *word_break_string =
    " enum WordBreak {\n"
    " WordBreakOther,\n"
    " WordBreakFormat,\n"
    " WordBreakKatakana,\n"
    " WordBreakALetter,\n"
    " WordBreakMidLetter,\n"
    " WordBreakMidNum,\n"
    " WordBreakNumeric,\n"
    " WordBreakExtendNumLet\n"
    " };\n\n";

// Word break classes used by this tool; numbering starts at 0.
enum WordBreak {
    WordBreakOther = 0,
    WordBreakFormat,
    WordBreakKatakana,
    WordBreakALetter,
    WordBreakMidLetter,
    WordBreakMidNum,
    WordBreakNumeric,
    WordBreakExtendNumLet
};
146 QHash<QByteArray, WordBreak> word_break_map;
148 static void initWordBreak()
150 struct WordBreakList {
151 WordBreak brk;
152 const char *name;
153 } breaks[] = {
154 { WordBreakFormat, "Format" },
155 { WordBreakFormat, "Extend" }, // these are copied in from GraphemeBreakProperty.txt
156 { WordBreakKatakana, "Katakana" },
157 { WordBreakALetter, "ALetter" },
158 { WordBreakMidLetter, "MidLetter" },
159 { WordBreakMidNum, "MidNum" },
160 { WordBreakNumeric, "Numeric" },
161 { WordBreakExtendNumLet, "ExtendNumLet" },
162 { WordBreakFormat, 0 }
164 WordBreakList *d = breaks;
165 while (d->name) {
166 word_break_map.insert(d->name, d->brk);
167 ++d;
// Textual declaration of the SentenceBreak enum. It lists the same
// enumerators, in the same order, as the enum defined right below; keep the
// two in sync.
static const char *sentence_break_string =
    " enum SentenceBreak {\n"
    " SentenceBreakOther,\n"
    " SentenceBreakSep,\n"
    " SentenceBreakFormat,\n"
    " SentenceBreakSp,\n"
    " SentenceBreakLower,\n"
    " SentenceBreakUpper,\n"
    " SentenceBreakOLetter,\n"
    " SentenceBreakNumeric,\n"
    " SentenceBreakATerm,\n"
    " SentenceBreakSTerm,\n"
    " SentenceBreakClose\n"
    " };\n\n";

// Sentence break classes used by this tool; numbering starts at 0.
enum SentenceBreak {
    SentenceBreakOther = 0,
    SentenceBreakSep,
    SentenceBreakFormat,
    SentenceBreakSp,
    SentenceBreakLower,
    SentenceBreakUpper,
    SentenceBreakOLetter,
    SentenceBreakNumeric,
    SentenceBreakATerm,
    SentenceBreakSTerm,
    SentenceBreakClose
};
202 QHash<QByteArray, SentenceBreak> sentence_break_map;
204 static void initSentenceBreak()
206 struct SentenceBreakList {
207 SentenceBreak brk;
208 const char *name;
209 } breaks[] = {
210 { SentenceBreakOther, "Other" },
211 { SentenceBreakSep, "Sep" },
212 { SentenceBreakFormat, "Format" },
213 { SentenceBreakSp, "Sp" },
214 { SentenceBreakLower, "Lower" },
215 { SentenceBreakUpper, "Upper" },
216 { SentenceBreakOLetter, "OLetter" },
217 { SentenceBreakNumeric, "Numeric" },
218 { SentenceBreakATerm, "ATerm" },
219 { SentenceBreakSTerm, "STerm" },
220 { SentenceBreakClose, "Close" },
221 { SentenceBreakOther, 0 }
223 SentenceBreakList *d = breaks;
224 while (d->name) {
225 sentence_break_map.insert(d->name, d->brk);
226 ++d;
// Textual declaration of the Properties bit-field struct and of the two
// properties() lookup functions.
// Keep this one in sync with the code in createPropertyInfo
const char *property_string =
    " struct Properties {\n"
    " ushort category : 8;\n"
    " ushort line_break_class : 8;\n"
    " ushort direction : 8;\n"
    " ushort combiningClass :8;\n"
    " ushort joining : 2;\n"
    " signed short digitValue : 6; /* 5 needed */\n"
    " ushort unicodeVersion : 4;\n"
    " ushort lowerCaseSpecial : 1;\n"
    " ushort upperCaseSpecial : 1;\n"
    " ushort titleCaseSpecial : 1;\n"
    " ushort caseFoldSpecial : 1; /* currently unused */\n"
    " signed short mirrorDiff : 16;\n"
    " signed short lowerCaseDiff : 16;\n"
    " signed short upperCaseDiff : 16;\n"
    " signed short titleCaseDiff : 16;\n"
    " signed short caseFoldDiff : 16;\n"
    " ushort graphemeBreak : 8;\n"
    " ushort wordBreak : 8;\n"
    " ushort sentenceBreak : 8;\n"
    " };\n"
    " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
    " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
// Textual declaration of the LineBreakClass enum, including the explanatory
// comment lines that precede it in the text.
const char *lineBreakClass =
    " // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
    " // we don't use the XX, AI and CB properties and map them to AL instead.\n"
    " // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
    " enum LineBreakClass {\n"
    " LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n"
    " LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n"
    " LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n"
    " LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n"
    " LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n"
    " LineBreak_JT, LineBreak_SA, LineBreak_SG,\n"
    " LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
    " };\n\n";
// Textual declarations of the lineBreakClass() and script() accessors,
// each with a QChar convenience overload.
const char *methods =
    " Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
    " inline int lineBreakClass(const QChar &ch) {\n"
    " return QUnicodeTables::lineBreakClass(ch.unicode());\n"
    " }\n"
    "\n"
    " Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
    " Q_CORE_EXPORT_INLINE int QT_FASTCALL script(const QChar &ch) {\n"
    " return script(ch.unicode());\n"
    " }\n\n";
283 struct PropertyFlags {
284 bool operator ==(const PropertyFlags &o) {
285 return (combiningClass == o.combiningClass
286 && category == o.category
287 && direction == o.direction
288 && joining == o.joining
289 && age == o.age
290 && digitValue == o.digitValue
291 && line_break_class == o.line_break_class
292 && mirrorDiff == o.mirrorDiff
293 && lowerCaseDiff == o.lowerCaseDiff
294 && upperCaseDiff == o.upperCaseDiff
295 && titleCaseDiff == o.titleCaseDiff
296 && caseFoldDiff == o.caseFoldDiff
297 && lowerCaseSpecial == o.lowerCaseSpecial
298 && upperCaseSpecial == o.upperCaseSpecial
299 && titleCaseSpecial == o.titleCaseSpecial
300 && caseFoldSpecial == o.caseFoldSpecial
301 && graphemeBreak == o.graphemeBreak
302 && wordBreak == o.wordBreak
303 && sentenceBreak == o.sentenceBreak
306 // from UnicodeData.txt
307 uchar combiningClass : 8;
308 QChar::Category category : 5;
309 QChar::Direction direction : 5;
310 // from ArabicShaping.txt
311 QChar::Joining joining : 2;
312 // from DerivedAge.txt
313 QChar::UnicodeVersion age : 4;
314 int digitValue;
315 uint line_break_class : 5;
317 int mirrorDiff : 16;
319 int lowerCaseDiff;
320 int upperCaseDiff;
321 int titleCaseDiff;
322 int caseFoldDiff;
323 bool lowerCaseSpecial;
324 bool upperCaseSpecial;
325 bool titleCaseSpecial;
326 bool caseFoldSpecial;
327 GraphemeBreak graphemeBreak;
328 WordBreak wordBreak;
329 SentenceBreak sentenceBreak;
332 QList<int> specialCaseMap;
333 int specialCaseMaxLen = 0;
335 static int appendToSpecialCaseMap(const QList<int> &map)
337 QList<int> utf16map;
338 for (int i = 0; i < map.size(); ++i) {
339 int val = map.at(i);
340 if (val > 0xffff) {
341 utf16map << QChar::highSurrogate(val);
342 utf16map << QChar::lowSurrogate(val);
343 } else {
344 utf16map << val;
347 specialCaseMaxLen = qMax(specialCaseMaxLen, utf16map.size());
348 utf16map << 0;
350 for (int i = 0; i < specialCaseMap.size() - utf16map.size() - 1; ++i) {
351 int j;
352 for (j = 0; j < utf16map.size(); ++j) {
353 if (specialCaseMap.at(i+j) != utf16map.at(j))
354 break;
356 if (j == utf16map.size())
357 return i;
360 int pos = specialCaseMap.size();
361 specialCaseMap << utf16map;
362 return pos;
365 struct UnicodeData {
366 UnicodeData(int codepoint = 0) {
367 p.category = QChar::NoCategory;
368 p.combiningClass = 0;
370 p.direction = QChar::DirL;
371 // DirR for: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF
372 if ((codepoint >= 0x590 && codepoint <= 0x5ff)
373 || (codepoint >= 0x7c0 && codepoint <= 0x8ff)
374 || (codepoint >= 0xfb1d && codepoint <= 0xfb4f)
375 || (codepoint >= 0x10800 && codepoint <= 0x10fff))
376 p.direction = QChar::DirR;
377 // DirAL for: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE
378 if ((codepoint >= 0x600 && codepoint <= 0x7bf)
379 || (codepoint >= 0xfb50 && codepoint <= 0xfdcf)
380 || (codepoint >= 0xfdf0 && codepoint <= 0xfdff)
381 || (codepoint >= 0xfe70 && codepoint <= 0xfefe))
382 p.direction = QChar::DirAL;
384 mirroredChar = 0;
385 decompositionType = QChar::NoDecomposition;
386 p.joining = QChar::OtherJoining;
387 p.age = QChar::Unicode_Unassigned;
388 p.mirrorDiff = 0;
389 p.digitValue = -1;
390 p.line_break_class = QUnicodeTables::LineBreak_AL;
391 p.lowerCaseDiff = 0;
392 p.upperCaseDiff = 0;
393 p.titleCaseDiff = 0;
394 p.caseFoldDiff = 0;
395 p.lowerCaseSpecial = 0;
396 p.upperCaseSpecial = 0;
397 p.titleCaseSpecial = 0;
398 p.caseFoldSpecial = 0;
399 p.graphemeBreak = GraphemeBreakOther;
400 p.wordBreak = WordBreakOther;
401 p.sentenceBreak = SentenceBreakOther;
402 propertyIndex = -1;
403 excludedComposition = false;
405 PropertyFlags p;
407 // from UnicodeData.txt
408 QChar::Decomposition decompositionType;
409 QList<int> decomposition;
411 QList<int> specialFolding;
413 // from BidiMirroring.txt
414 int mirroredChar;
416 // CompositionExclusions.txt
417 bool excludedComposition;
419 // computed position of unicode property set
420 int propertyIndex;
// Indices of the ';'-separated fields of one UnicodeData.txt line.
enum UniDataFields {
    UD_Value = 0,
    UD_Name,
    UD_Category,
    UD_CombiningClass,
    UD_BidiCategory,
    UD_Decomposition,
    UD_DecimalDigitValue,
    UD_DigitValue,
    UD_NumericValue,
    UD_Mirrored,
    UD_OldName,
    UD_Comment,
    UD_UpperCase,
    UD_LowerCase,
    UD_TitleCase
};
441 QHash<QByteArray, QChar::Category> categoryMap;
443 static void initCategoryMap()
445 struct Cat {
446 QChar::Category cat;
447 const char *name;
448 } categories [] = {
449 { QChar::Mark_NonSpacing, "Mn" },
450 { QChar::Mark_SpacingCombining, "Mc" },
451 { QChar::Mark_Enclosing, "Me" },
453 { QChar::Number_DecimalDigit, "Nd" },
454 { QChar::Number_Letter, "Nl" },
455 { QChar::Number_Other, "No" },
457 { QChar::Separator_Space, "Zs" },
458 { QChar::Separator_Line, "Zl" },
459 { QChar::Separator_Paragraph, "Zp" },
461 { QChar::Other_Control, "Cc" },
462 { QChar::Other_Format, "Cf" },
463 { QChar::Other_Surrogate, "Cs" },
464 { QChar::Other_PrivateUse, "Co" },
465 { QChar::Other_NotAssigned, "Cn" },
467 { QChar::Letter_Uppercase, "Lu" },
468 { QChar::Letter_Lowercase, "Ll" },
469 { QChar::Letter_Titlecase, "Lt" },
470 { QChar::Letter_Modifier, "Lm" },
471 { QChar::Letter_Other, "Lo" },
473 { QChar::Punctuation_Connector, "Pc" },
474 { QChar::Punctuation_Dash, "Pd" },
475 { QChar::Punctuation_Open, "Ps" },
476 { QChar::Punctuation_Close, "Pe" },
477 { QChar::Punctuation_InitialQuote, "Pi" },
478 { QChar::Punctuation_FinalQuote, "Pf" },
479 { QChar::Punctuation_Other, "Po" },
481 { QChar::Symbol_Math, "Sm" },
482 { QChar::Symbol_Currency, "Sc" },
483 { QChar::Symbol_Modifier, "Sk" },
484 { QChar::Symbol_Other, "So" },
485 { QChar::NoCategory, 0 }
487 Cat *c = categories;
488 while (c->cat != QChar::NoCategory) {
489 categoryMap.insert(c->name, c->cat);
490 ++c;
494 QHash<QByteArray, QChar::Direction> directionMap;
496 static void initDirectionMap()
498 struct Dir {
499 QChar::Direction dir;
500 const char *name;
501 } directions[] = {
502 { QChar::DirL, "L" },
503 { QChar::DirR, "R" },
504 { QChar::DirEN, "EN" },
505 { QChar::DirES, "ES" },
506 { QChar::DirET, "ET" },
507 { QChar::DirAN, "AN" },
508 { QChar::DirCS, "CS" },
509 { QChar::DirB, "B" },
510 { QChar::DirS, "S" },
511 { QChar::DirWS, "WS" },
512 { QChar::DirON, "ON" },
513 { QChar::DirLRE, "LRE" },
514 { QChar::DirLRO, "LRO" },
515 { QChar::DirAL, "AL" },
516 { QChar::DirRLE, "RLE" },
517 { QChar::DirRLO, "RLO" },
518 { QChar::DirPDF, "PDF" },
519 { QChar::DirNSM, "NSM" },
520 { QChar::DirBN, "BN" },
521 { QChar::DirL, 0 }
523 Dir *d = directions;
524 while (d->name) {
525 directionMap.insert(d->name, d->dir);
526 ++d;
531 QHash<QByteArray, QChar::Decomposition> decompositionMap;
533 static void initDecompositionMap()
535 struct Dec {
536 QChar::Decomposition dec;
537 const char *name;
538 } decompositions[] = {
539 { QChar::Canonical, "<canonical>" },
540 { QChar::Font, "<font>" },
541 { QChar::NoBreak, "<noBreak>" },
542 { QChar::Initial, "<initial>" },
543 { QChar::Medial, "<medial>" },
544 { QChar::Final, "<final>" },
545 { QChar::Isolated, "<isolated>" },
546 { QChar::Circle, "<circle>" },
547 { QChar::Super, "<super>" },
548 { QChar::Sub, "<sub>" },
549 { QChar::Vertical, "<vertical>" },
550 { QChar::Wide, "<wide>" },
551 { QChar::Narrow, "<narrow>" },
552 { QChar::Small, "<small>" },
553 { QChar::Square, "<square>" },
554 { QChar::Compat, "<compat>" },
555 { QChar::Fraction, "<fraction>" },
556 { QChar::NoDecomposition, 0 }
558 Dec *d = decompositions;
559 while (d->name) {
560 decompositionMap.insert(d->name, d->dec);
561 ++d;
566 QHash<int, UnicodeData> unicodeData;
567 QList<PropertyFlags> uniqueProperties;
570 QHash<int, int> decompositionLength;
571 int highestComposedCharacter = 0;
572 int numLigatures = 0;
573 int highestLigature = 0;
575 struct Ligature {ushort u1; ushort u2; ushort ligature;};
576 // we need them sorted after the first component for fast lookup
577 bool operator < (const Ligature &l1, const Ligature &l2) {
578 return l1.u1 < l2.u1;
581 QHash<ushort, QList<Ligature> > ligatureHashes;
583 QHash<int, int> combiningClassUsage;
585 int maxLowerCaseDiff = 0;
586 int maxUpperCaseDiff = 0;
587 int maxTitleCaseDiff = 0;
589 static void readUnicodeData()
591 QFile f("data/UnicodeData.txt");
592 if (!f.exists())
593 qFatal("Couldn't find UnicodeData.txt");
595 f.open(QFile::ReadOnly);
597 while (!f.atEnd()) {
598 QByteArray line;
599 line.resize(1024);
600 int len = f.readLine(line.data(), 1024);
601 line.truncate(len-1);
603 int comment = line.indexOf('#');
604 if (comment >= 0)
605 line = line.left(comment);
606 if (line.isEmpty())
607 continue;
609 QList<QByteArray> properties = line.split(';');
610 bool ok;
611 int codepoint = properties[UD_Value].toInt(&ok, 16);
612 int lastCodepoint = codepoint;
614 QByteArray name = properties[UD_Name];
615 if (name.startsWith('<') && name.contains("First")) {
616 QByteArray nextLine;
617 nextLine.resize(1024);
618 f.readLine(nextLine.data(), 1024);
619 QList<QByteArray> properties = nextLine.split(';');
620 lastCodepoint = properties[UD_Value].toInt(&ok, 16);
623 UnicodeData data(codepoint);
624 data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory);
625 data.p.combiningClass = properties[UD_CombiningClass].toInt();
627 if (!combiningClassUsage.contains(data.p.combiningClass))
628 combiningClassUsage[data.p.combiningClass] = 1;
629 else
630 ++combiningClassUsage[data.p.combiningClass];
632 data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
634 if (!properties[UD_UpperCase].isEmpty()) {
635 int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
636 Q_ASSERT(ok);
637 data.p.upperCaseDiff = upperCase - codepoint;
638 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(data.p.upperCaseDiff));
639 if (codepoint > 0xffff) {
640 // if the condition below doesn't hold anymore we need to modify our case folding code
641 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
642 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
645 if (!properties[UD_LowerCase].isEmpty()) {
646 int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
647 Q_ASSERT (ok);
648 data.p.lowerCaseDiff = lowerCase - codepoint;
649 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(data.p.lowerCaseDiff));
650 if (codepoint > 0xffff) {
651 // if the condition below doesn't hold anymore we need to modify our case folding code
652 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
653 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
656 // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
657 if (properties[UD_TitleCase].isEmpty())
658 properties[UD_TitleCase] = properties[UD_UpperCase];
659 if (!properties[UD_TitleCase].isEmpty()) {
660 int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
661 Q_ASSERT (ok);
662 data.p.titleCaseDiff = titleCase - codepoint;
663 maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(data.p.titleCaseDiff));
664 if (codepoint > 0xffff) {
665 // if the condition below doesn't hold anymore we need to modify our case folding code
666 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
667 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
671 if (!properties[UD_DigitValue].isEmpty())
672 data.p.digitValue = properties[UD_DigitValue].toInt();
674 // decompositition
675 QByteArray decomposition = properties[UD_Decomposition];
676 if (!decomposition.isEmpty()) {
677 highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
678 QList<QByteArray> d = decomposition.split(' ');
679 if (d[0].contains('<')) {
680 data.decompositionType = decompositionMap.value(d[0], QChar::Canonical);
681 d.takeFirst();
682 } else {
683 data.decompositionType = QChar::Canonical;
685 for (int i = 0; i < d.size(); ++i)
686 data.decomposition.append(d[i].toInt(&ok, 16));
687 if (!decompositionLength.contains(data.decomposition.size()))
688 decompositionLength[data.decomposition.size()] = 1;
689 else
690 ++decompositionLength[data.decomposition.size()];
693 for (int i = codepoint; i <= lastCodepoint; ++i)
694 unicodeData.insert(i, data);
699 static int maxMirroredDiff = 0;
701 static void readBidiMirroring()
703 QFile f("data/BidiMirroring.txt");
704 if (!f.exists())
705 qFatal("Couldn't find BidiMirroring.txt");
707 f.open(QFile::ReadOnly);
709 while (!f.atEnd()) {
710 QByteArray line;
711 line.resize(1024);
712 int len = f.readLine(line.data(), 1024);
713 line.resize(len-1);
715 int comment = line.indexOf('#');
716 if (comment >= 0)
717 line = line.left(comment);
719 if (line.isEmpty())
720 continue;
721 line = line.replace(" ", "");
723 QList<QByteArray> pair = line.split(';');
724 Q_ASSERT(pair.size() == 2);
726 bool ok;
727 int codepoint = pair[0].toInt(&ok, 16);
728 int mirror = pair[1].toInt(&ok, 16);
730 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
731 d.mirroredChar = mirror;
732 if (qAbs(codepoint-d.mirroredChar) > maxMirroredDiff)
733 maxMirroredDiff = qAbs(codepoint - d.mirroredChar);
735 d.p.mirrorDiff = d.mirroredChar - codepoint;
736 unicodeData.insert(codepoint, d);
740 static void readArabicShaping()
742 QFile f("data/ArabicShaping.txt");
743 if (!f.exists())
744 qFatal("Couldn't find ArabicShaping.txt");
746 f.open(QFile::ReadOnly);
748 while (!f.atEnd()) {
749 QByteArray line;
750 line.resize(1024);
751 int len = f.readLine(line.data(), 1024);
752 line.resize(len-1);
754 int comment = line.indexOf('#');
755 if (comment >= 0)
756 line = line.left(comment);
757 line = line.trimmed();
759 if (line.isEmpty())
760 continue;
762 QList<QByteArray> shaping = line.split(';');
763 Q_ASSERT(shaping.size() == 4);
765 bool ok;
766 int codepoint = shaping[0].toInt(&ok, 16);
767 QChar::Joining j = QChar::OtherJoining;
768 QByteArray shape = shaping[2].trimmed();
769 if (shape == "R")
770 j = QChar::Right;
771 else if (shape == "D")
772 j = QChar::Dual;
773 else if (shape == "C")
774 j = QChar::Center;
776 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
777 d.p.joining = j;
778 unicodeData.insert(codepoint, d);
782 static void readDerivedAge()
784 QFile f("data/DerivedAge.txt");
785 if (!f.exists())
786 qFatal("Couldn't find DerivedAge.txt");
788 f.open(QFile::ReadOnly);
790 while (!f.atEnd()) {
791 QByteArray line;
792 line.resize(1024);
793 int len = f.readLine(line.data(), 1024);
794 line.resize(len-1);
796 int comment = line.indexOf('#');
797 if (comment >= 0)
798 line = line.left(comment);
799 line.replace(" ", "");
801 if (line.isEmpty())
802 continue;
804 QList<QByteArray> l = line.split(';');
805 Q_ASSERT(l.size() == 2);
807 QByteArray codes = l[0];
808 codes.replace("..", ".");
809 QList<QByteArray> cl = codes.split('.');
811 bool ok;
812 int from = cl[0].toInt(&ok, 16);
813 int to = from;
814 if (cl.size() == 2)
815 to = cl[1].toInt(&ok, 16);
817 QChar::UnicodeVersion age = QChar::Unicode_Unassigned;
818 QByteArray ba = l[1];
819 AgeMap *map = ageMap;
820 while (map->age) {
821 if (ba == map->age) {
822 age = map->version;
823 break;
825 ++map;
827 //qDebug() << hex << from << ".." << to << ba << age;
828 Q_ASSERT(age != QChar::Unicode_Unassigned);
830 for (int codepoint = from; codepoint <= to; ++codepoint) {
831 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
832 d.p.age = age;
833 unicodeData.insert(codepoint, d);
839 static void readCompositionExclusion()
841 QFile f("data/CompositionExclusions.txt");
842 if (!f.exists())
843 qFatal("Couldn't find CompositionExclusions.txt");
845 f.open(QFile::ReadOnly);
847 while (!f.atEnd()) {
848 QByteArray line;
849 line.resize(1024);
850 int len = f.readLine(line.data(), 1024);
851 line.resize(len-1);
853 int comment = line.indexOf('#');
854 if (comment >= 0)
855 line = line.left(comment);
856 line.replace(" ", "");
858 if (line.isEmpty())
859 continue;
861 Q_ASSERT(!line.contains(".."));
863 bool ok;
864 int codepoint = line.toInt(&ok, 16);
866 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
867 d.excludedComposition = true;
868 unicodeData.insert(codepoint, d);
871 for (int i = 0; i < 0x110000; ++i) {
872 UnicodeData data = unicodeData.value(i, UnicodeData(i));
873 if (!data.excludedComposition
874 && data.decompositionType == QChar::Canonical
875 && data.decomposition.size() > 1) {
876 Q_ASSERT(data.decomposition.size() == 2);
878 uint part1 = data.decomposition.at(0);
879 uint part2 = data.decomposition.at(1);
880 UnicodeData first = unicodeData.value(part1, UnicodeData(part1));
881 if (first.p.combiningClass != 0)
882 continue;
884 ++numLigatures;
885 highestLigature = qMax(highestLigature, (int)part1);
886 Ligature l = {(ushort)part1, (ushort)part2, i};
887 ligatureHashes[part2].append(l);
892 struct NormalizationCorrection {
893 uint codepoint;
894 uint mapped;
895 uint version;
898 static QByteArray createNormalizationCorrections()
900 QFile f("data/NormalizationCorrections.txt");
901 if (!f.exists())
902 qFatal("Couldn't find NormalizationCorrections.txt");
904 f.open(QFile::ReadOnly);
906 QByteArray out;
908 out += "struct NormalizationCorrection {\n"
909 " uint ucs4;\n"
910 " uint old_mapping;\n"
911 " int version;\n"
912 "};\n\n"
914 "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
916 int numCorrections = 0;
917 while (!f.atEnd()) {
918 QByteArray line;
919 line.resize(1024);
920 int len = f.readLine(line.data(), 1024);
921 line.resize(len-1);
923 int comment = line.indexOf('#');
924 if (comment >= 0)
925 line = line.left(comment);
926 line.replace(" ", "");
928 if (line.isEmpty())
929 continue;
931 Q_ASSERT(!line.contains(".."));
933 QList<QByteArray> fields = line.split(';');
934 Q_ASSERT(fields.size() == 4);
936 NormalizationCorrection c;
937 bool ok;
938 c.codepoint = fields.at(0).toInt(&ok, 16);
939 c.mapped = fields.at(1).toInt(&ok, 16);
940 if (fields.at(3) == "3.2.0")
941 c.version = QChar::Unicode_3_2;
942 else if (fields.at(3) == "4.0.0")
943 c.version = QChar::Unicode_4_0;
944 else
945 qFatal("unknown unicode version in NormalizationCorrection.txt");
947 out += " { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
948 + ", " + QString::number(c.version) + " },\n";
949 ++numCorrections;
952 out += "};\n\n"
954 "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n\n";
957 return out;
961 static void computeUniqueProperties()
963 qDebug("computeUniqueProperties:");
964 for (int uc = 0; uc < 0x110000; ++uc) {
965 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
967 int index = uniqueProperties.indexOf(d.p);
968 if (index == -1) {
969 index = uniqueProperties.size();
970 uniqueProperties.append(d.p);
972 d.propertyIndex = index;
973 unicodeData.insert(uc, d);
975 qDebug(" %d unicode properties found", uniqueProperties.size());
979 static void readLineBreak()
981 QFile f("data/LineBreak.txt");
982 if (!f.exists())
983 qFatal("Couldn't find LineBreak.txt");
985 f.open(QFile::ReadOnly);
987 while (!f.atEnd()) {
988 QByteArray line;
989 line.resize(1024);
990 int len = f.readLine(line.data(), 1024);
991 line.resize(len-1);
993 int comment = line.indexOf('#');
994 if (comment >= 0)
995 line = line.left(comment);
996 line.replace(" ", "");
998 if (line.isEmpty())
999 continue;
1001 QList<QByteArray> l = line.split(';');
1002 Q_ASSERT(l.size() == 2);
1004 QByteArray codes = l[0];
1005 codes.replace("..", ".");
1006 QList<QByteArray> cl = codes.split('.');
1008 bool ok;
1009 int from = cl[0].toInt(&ok, 16);
1010 int to = from;
1011 if (cl.size() == 2)
1012 to = cl[1].toInt(&ok, 16);
1014 // ### Classes XX and AI are left out and mapped to AL for now
1015 QUnicodeTables::LineBreakClass lb = QUnicodeTables::LineBreak_AL;
1016 QByteArray ba = l[1];
1018 if (ba == "AI") lb = QUnicodeTables::LineBreak_AL;
1019 else if (ba == "XX") lb = QUnicodeTables::LineBreak_AL;
1020 else if (ba == "NL") lb = QUnicodeTables::LineBreak_AL;
1021 else if (ba == "OP") lb = QUnicodeTables::LineBreak_OP;
1022 else if (ba == "CL") lb = QUnicodeTables::LineBreak_CL;
1023 else if (ba == "QU") lb = QUnicodeTables::LineBreak_QU;
1024 else if (ba == "GL") lb = QUnicodeTables::LineBreak_GL;
1025 else if (ba == "NS") lb = QUnicodeTables::LineBreak_NS;
1026 else if (ba == "EX") lb = QUnicodeTables::LineBreak_EX;
1027 else if (ba == "SY") lb = QUnicodeTables::LineBreak_SY;
1028 else if (ba == "IS") lb = QUnicodeTables::LineBreak_IS;
1029 else if (ba == "PR") lb = QUnicodeTables::LineBreak_PR;
1030 else if (ba == "PO") lb = QUnicodeTables::LineBreak_PO;
1031 else if (ba == "NU") lb = QUnicodeTables::LineBreak_NU;
1032 else if (ba == "AL") lb = QUnicodeTables::LineBreak_AL;
1033 else if (ba == "ID") lb = QUnicodeTables::LineBreak_ID;
1034 else if (ba == "IN") lb = QUnicodeTables::LineBreak_IN;
1035 else if (ba == "HY") lb = QUnicodeTables::LineBreak_HY;
1036 else if (ba == "BA") lb = QUnicodeTables::LineBreak_BA;
1037 else if (ba == "BB") lb = QUnicodeTables::LineBreak_BB;
1038 else if (ba == "B2") lb = QUnicodeTables::LineBreak_B2;
1039 else if (ba == "ZW") lb = QUnicodeTables::LineBreak_ZW;
1040 else if (ba == "CM") lb = QUnicodeTables::LineBreak_CM;
1041 else if (ba == "SA") lb = QUnicodeTables::LineBreak_SA;
1042 else if (ba == "BK") lb = QUnicodeTables::LineBreak_BK;
1043 else if (ba == "CR") lb = QUnicodeTables::LineBreak_CR;
1044 else if (ba == "LF") lb = QUnicodeTables::LineBreak_LF;
1045 else if (ba == "SG") lb = QUnicodeTables::LineBreak_SG;
1046 else if (ba == "CB") lb = QUnicodeTables::LineBreak_AL;
1047 else if (ba == "SP") lb = QUnicodeTables::LineBreak_SP;
1048 else if (ba == "WJ") lb = QUnicodeTables::LineBreak_WJ;
1049 else if (ba == "H2") lb = QUnicodeTables::LineBreak_H2;
1050 else if (ba == "H3") lb = QUnicodeTables::LineBreak_H3;
1051 else if (ba == "JL") lb = QUnicodeTables::LineBreak_JL;
1052 else if (ba == "JV") lb = QUnicodeTables::LineBreak_JV;
1053 else if (ba == "JT") lb = QUnicodeTables::LineBreak_JT;
1054 else {
1055 qDebug() << "unhandled line break class:" << ba;
1058 for (int codepoint = from; codepoint <= to; ++codepoint) {
1059 UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
1060 d.p.line_break_class = lb;
1061 unicodeData.insert(codepoint, d);
1067 static void readSpecialCasing()
1069 // qDebug() << "Reading SpecialCasing.txt";
1070 QFile f("data/SpecialCasing.txt");
1071 if (!f.exists())
1072 qFatal("Couldn't find SpecialCasing.txt");
1074 f.open(QFile::ReadOnly);
1076 while (!f.atEnd()) {
1077 QByteArray line;
1078 line.resize(1024);
1079 int len = f.readLine(line.data(), 1024);
1080 line.resize(len-1);
1082 int comment = line.indexOf('#');
1083 if (comment >= 0)
1084 line = line.left(comment);
1086 if (line.isEmpty())
1087 continue;
1089 QList<QByteArray> l = line.split(';');
1091 QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
1092 if (!condition.isEmpty())
1093 // #####
1094 continue;
1096 bool ok;
1097 int codepoint = l[0].trimmed().toInt(&ok, 16);
1098 Q_ASSERT(ok);
1099 Q_ASSERT(codepoint <= 0xffff);
1101 // qDebug() << "codepoint" << hex << codepoint;
1102 // qDebug() << line;
1104 QList<QByteArray> lower = l[1].trimmed().split(' ');
1105 QList<int> lowerMap;
1106 for (int i = 0; i < lower.size(); ++i) {
1107 bool ok;
1108 lowerMap.append(lower.at(i).toInt(&ok, 16));
1109 Q_ASSERT(ok);
1112 QList<QByteArray> title = l[2].trimmed().split(' ');
1113 QList<int> titleMap;
1114 for (int i = 0; i < title.size(); ++i) {
1115 bool ok;
1116 titleMap.append(title.at(i).toInt(&ok, 16));
1117 if (!ok)
1118 qDebug() << line << title.at(i);
1119 Q_ASSERT(ok);
1122 QList<QByteArray> upper = l[3].trimmed().split(' ');
1123 QList<int> upperMap;
1124 for (int i = 0; i < upper.size(); ++i) {
1125 bool ok;
1126 upperMap.append(upper.at(i).toInt(&ok, 16));
1127 Q_ASSERT(ok);
1131 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1133 Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
1134 Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
1135 Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
1137 if (lowerMap.size() > 1) {
1138 ud.p.lowerCaseSpecial = true;
1139 ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
1141 if (titleMap.size() > 1) {
1142 ud.p.titleCaseSpecial = true;
1143 ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
1145 if (upperMap.size() > 1) {
1146 ud.p.upperCaseSpecial = true;
1147 ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);;
1150 unicodeData.insert(codepoint, ud);
1154 int maxCaseFoldDiff = 0;
1156 static void readCaseFolding()
1158 qDebug() << "Reading CaseFolding.txt";
1159 QFile f("data/CaseFolding.txt");
1160 if (!f.exists())
1161 qFatal("Couldn't find CaseFolding.txt");
1163 f.open(QFile::ReadOnly);
1165 while (!f.atEnd()) {
1166 QByteArray line;
1167 line.resize(1024);
1168 int len = f.readLine(line.data(), 1024);
1169 line.resize(len-1);
1171 int comment = line.indexOf('#');
1172 if (comment >= 0)
1173 line = line.left(comment);
1175 if (line.isEmpty())
1176 continue;
1178 QList<QByteArray> l = line.split(';');
1180 bool ok;
1181 uint codepoint = l[0].trimmed().toInt(&ok, 16);
1182 Q_ASSERT(ok);
1185 l[1] = l[1].trimmed();
1186 if (l[1] == "F" || l[1] == "T")
1187 continue;
1189 // qDebug() << "codepoint" << hex << codepoint;
1190 // qDebug() << line;
1191 QList<QByteArray> fold = l[2].trimmed().split(' ');
1192 QList<int> foldMap;
1193 for (int i = 0; i < fold.size(); ++i) {
1194 bool ok;
1195 foldMap.append(fold.at(i).toInt(&ok, 16));
1196 Q_ASSERT(ok);
1199 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1200 if (foldMap.size() == 1) {
1201 ud.p.caseFoldDiff = foldMap.at(0) - codepoint;
1202 maxCaseFoldDiff = qMax(maxCaseFoldDiff, ud.p.caseFoldDiff);
1203 if (codepoint > 0xffff) {
1204 // if the condition below doesn't hold anymore we need to modify our case folding code
1205 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
1206 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(foldMap.at(0)));
1208 if (foldMap.at(0) != codepoint + ud.p.lowerCaseDiff)
1209 qDebug() << hex << codepoint;
1210 } else {
1211 Q_ASSERT(false); // we currently don't support full case foldings
1212 // qDebug() << "special" << hex << foldMap;
1213 ud.p.caseFoldSpecial = true;
1214 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1216 unicodeData.insert(codepoint, ud);
1220 static void readGraphemeBreak()
1222 qDebug() << "Reading GraphemeBreakProperty.txt";
1223 QFile f("data/GraphemeBreakProperty.txt");
1224 if (!f.exists())
1225 qFatal("Couldn't find GraphemeBreakProperty.txt");
1227 f.open(QFile::ReadOnly);
1229 while (!f.atEnd()) {
1230 QByteArray line;
1231 line.resize(1024);
1232 int len = f.readLine(line.data(), 1024);
1233 line.resize(len-1);
1235 int comment = line.indexOf('#');
1236 if (comment >= 0)
1237 line = line.left(comment);
1239 if (line.isEmpty())
1240 continue;
1242 QList<QByteArray> l = line.split(';');
1244 QByteArray codes = l[0].trimmed();
1245 codes.replace("..", ".");
1246 QList<QByteArray> cl = codes.split('.');
1248 bool ok;
1249 int from = cl[0].toInt(&ok, 16);
1250 Q_ASSERT(ok);
1251 int to = from;
1252 if (cl.size() == 2) {
1253 to = cl[1].toInt(&ok, 16);
1254 Q_ASSERT(ok);
1257 GraphemeBreak brk = grapheme_break_map.value(l[1].trimmed(), GraphemeBreakOther);
1259 for (int codepoint = from; codepoint <= to; ++codepoint) {
1260 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1261 ud.p.graphemeBreak = brk;
1262 unicodeData.insert(codepoint, ud);
1267 static void readWordBreak()
1269 qDebug() << "Reading WordBreakProperty.txt";
1270 QFile f("data/WordBreakProperty.txt");
1271 if (!f.exists())
1272 qFatal("Couldn't find WordBreakProperty.txt");
1274 f.open(QFile::ReadOnly);
1276 while (!f.atEnd()) {
1277 QByteArray line;
1278 line.resize(1024);
1279 int len = f.readLine(line.data(), 1024);
1280 line.resize(len-1);
1282 int comment = line.indexOf('#');
1283 if (comment >= 0)
1284 line = line.left(comment);
1286 if (line.isEmpty())
1287 continue;
1289 QList<QByteArray> l = line.split(';');
1291 QByteArray codes = l[0].trimmed();
1292 codes.replace("..", ".");
1293 QList<QByteArray> cl = codes.split('.');
1295 bool ok;
1296 int from = cl[0].toInt(&ok, 16);
1297 Q_ASSERT(ok);
1298 int to = from;
1299 if (cl.size() == 2) {
1300 to = cl[1].toInt(&ok, 16);
1301 Q_ASSERT(ok);
1304 WordBreak brk = word_break_map.value(l[1].trimmed(), WordBreakOther);
1305 Q_ASSERT(brk != WordBreakOther);
1307 for (int codepoint = from; codepoint <= to; ++codepoint) {
1308 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1309 ud.p.wordBreak = brk;
1310 unicodeData.insert(codepoint, ud);
1315 static void readSentenceBreak()
1317 qDebug() << "Reading SentenceBreakProperty.txt";
1318 QFile f("data/SentenceBreakProperty.txt");
1319 if (!f.exists())
1320 qFatal("Couldn't find SentenceBreakProperty.txt");
1322 f.open(QFile::ReadOnly);
1324 while (!f.atEnd()) {
1325 QByteArray line;
1326 line.resize(1024);
1327 int len = f.readLine(line.data(), 1024);
1328 line.resize(len-1);
1330 int comment = line.indexOf('#');
1331 if (comment >= 0)
1332 line = line.left(comment);
1334 if (line.isEmpty())
1335 continue;
1337 QList<QByteArray> l = line.split(';');
1339 QByteArray codes = l[0].trimmed();
1340 codes.replace("..", ".");
1341 QList<QByteArray> cl = codes.split('.');
1343 bool ok;
1344 int from = cl[0].toInt(&ok, 16);
1345 Q_ASSERT(ok);
1346 int to = from;
1347 if (cl.size() == 2) {
1348 to = cl[1].toInt(&ok, 16);
1349 Q_ASSERT(ok);
1352 SentenceBreak brk = sentence_break_map.value(l[1].trimmed(), SentenceBreakOther);
1353 Q_ASSERT(brk != SentenceBreakOther);
1355 for (int codepoint = from; codepoint <= to; ++codepoint) {
1356 UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
1357 ud.p.sentenceBreak = brk;
1358 unicodeData.insert(codepoint, ud);
1363 #if 0
1364 // this piece of code does full case folding and comparison. We currently
1365 // don't use it, since this gives lots of issues with things as case insensitive
1366 // search and replace.
//
// foldCase: writes the case folded form of ch to out as a 0-terminated
// sequence of ushort units. If the property entry of ch is not flagged
// caseFoldSpecial a single unit, ch + caseFoldDiff, is written; otherwise
// caseFoldDiff is used as an index into specialCaseMap and units are copied
// from there up to (not including) the first 0 unit.
// The caller has to provide room for the folded units plus the terminator.
1367 static inline void foldCase(uint ch, ushort *out)
1369 const QUnicodeTables::Properties *p = qGetProp(ch);
1370 if (!p->caseFoldSpecial) {
// simple folding: the result is stored through a ushort pointer
1371 *(out++) = ch + p->caseFoldDiff;
1372 } else {
// special folding: copy the 0-terminated entry from specialCaseMap
1373 const ushort *folded = specialCaseMap + p->caseFoldDiff;
1374 while (*folded)
1375 *out++ = *folded++;
// terminate the output in both cases
1377 *out = 0;
// Case insensitive comparison of the ranges [a, ae) and [b, be).
// Returns 0 if a and b are the same pointer, 1 if a is null and -1 if b is
// null. Otherwise the ranges are compared unit by unit after case folding:
// the result is the difference of the first pair of folded units that differ,
// 0 if both ranges end at the same time, -1 if only the first range ended and
// 1 if only the second one ended.
1380 static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
1382 if (a == b)
1383 return 0;
1384 if (a == 0)
1385 return 1;
1386 if (b == 0)
1387 return -1;
// fast path: used as long as neither unit needs a special (multi unit) folding
1389 while (a != ae && b != be) {
1390 const QUnicodeTables::Properties *pa = qGetProp(*a);
1391 const QUnicodeTables::Properties *pb = qGetProp(*b);
1392 if (pa->caseFoldSpecial | pb->caseFoldSpecial)
1393 goto special;
1394 int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
1395 if ((diff))
1396 return diff;
1397 ++a;
1398 ++b;
// at least one range is exhausted
1401 if (a == ae) {
1402 if (b == be)
1403 return 0;
1404 return -1;
1406 return 1;
// slow path: fold each unit into a 0-terminated buffer with foldCase() and
// compare the buffers; a buffer is refilled from its range once its
// terminator has been reached
1407 special:
1408 ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1409 ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1410 abuf[0] = bbuf[0] = 0;
1411 ushort *ap = abuf;
1412 ushort *bp = bbuf;
1413 while (1) {
1414 if (!*ap) {
1415 if (a == ae) {
// first range is done: equal only if the second one is done as well
1416 if (!*bp && b == be)
1417 return 0;
1418 return -1;
1420 foldCase(*(a++), abuf);
1421 ap = abuf;
1423 if (!*bp) {
// second range is done while the first one still has folded units left
1424 if (b == be)
1425 return 1;
1426 foldCase(*(b++), bbuf);
1427 bp = bbuf;
1429 if (*ap != *bp)
1430 return (int)*ap - (int)*bp;
1431 ++ap;
1432 ++bp;
// Case insensitive comparison of the range [a, ae) with the 0-terminated
// byte string b; every byte of b is widened to ushort before it is looked up.
// Returns 1 if a is null and -1 if b is null. Otherwise the result is the
// difference of the first pair of folded units that differ, 0 if both inputs
// end at the same time, -1 if only the range ended and 1 if only the string
// ended.
1437 static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
1439 if (a == 0)
1440 return 1;
1441 if (b == 0)
1442 return -1;
// fast path: used as long as neither unit needs a special (multi unit) folding
1444 while (a != ae && *b) {
1445 const QUnicodeTables::Properties *pa = qGetProp(*a);
1446 const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
1447 if (pa->caseFoldSpecial | pb->caseFoldSpecial)
1448 goto special;
1449 int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
1450 if ((diff))
1451 return diff;
1452 ++a;
1453 ++b;
// at least one input is exhausted
1455 if (a == ae) {
1456 if (!*b)
1457 return 0;
1458 return -1;
1460 return 1;
// slow path: fold each unit into a 0-terminated buffer with foldCase() and
// compare the buffers; a buffer is refilled from its input once its
// terminator has been reached
1462 special:
1463 ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
1464 ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
1465 abuf[0] = bbuf[0] = 0;
1466 ushort *ap = abuf;
1467 ushort *bp = bbuf;
1468 while (1) {
1469 if (!*ap) {
1470 if (a == ae) {
// the range is done: equal only if the string is done as well
1471 if (!*bp && !*b)
1472 return 0;
1473 return -1;
1475 foldCase(*(a++), abuf);
1476 ap = abuf;
1478 if (!*bp) {
// the string is done while the range still has folded units left
1479 if (!*b)
1480 return 1;
1481 foldCase(*(b++), bbuf);
1482 bp = bbuf;
1484 if (*ap != *bp)
1485 return (int)*ap - (int)*bp;
1486 ++ap;
1487 ++bp;
1490 #endif
1492 #if 0
// Names of the blocks found in Blocks.txt, in order of first appearance
// (filled by readBlocks()).
1493 static QList<QByteArray> blockNames;
// One code point range of Blocks.txt.
1494 struct BlockInfo
// index of the block's name in blockNames
1496 int blockIndex;
// first and last code point of the range as parsed from the "first..last" field
1497 int firstCodePoint;
1498 int lastCodePoint;
// All ranges in file order (filled by readBlocks()).
1500 static QList<BlockInfo> blockInfoList;
1502 static void readBlocks()
1504 QFile f("data/Blocks.txt");
1505 if (!f.exists())
1506 qFatal("Couldn't find Blocks.txt");
1508 f.open(QFile::ReadOnly);
1510 while (!f.atEnd()) {
1511 QByteArray line = f.readLine();
1512 line.resize(line.size() - 1);
1514 int comment = line.indexOf("#");
1515 if (comment >= 0)
1516 line = line.left(comment);
1518 line.replace(" ", "");
1520 if (line.isEmpty())
1521 continue;
1523 int semicolon = line.indexOf(';');
1524 Q_ASSERT(semicolon >= 0);
1525 QByteArray codePoints = line.left(semicolon);
1526 QByteArray blockName = line.mid(semicolon + 1);
1528 int blockIndex = blockNames.indexOf(blockName);
1529 if (blockIndex < 0) {
1530 blockNames.append(blockName);
1531 blockIndex = blockNames.indexOf(blockName);
1532 Q_ASSERT(blockIndex >= 0);
1535 int dotdot = codePoints.indexOf("..");
1536 Q_ASSERT(dotdot >= 0);
1537 bool unused;
1538 int first = codePoints.left(dotdot).toInt(&unused, 16);
1539 int last = codePoints.mid(dotdot + 2).toInt(&unused, 16);
1541 BlockInfo blockInfo = { blockIndex, first, last };
1542 blockInfoList.append(blockInfo);
1545 #endif
// Script names in order of first appearance; readScripts() puts "Common" at index 0.
1547 static QList<QByteArray> scriptNames;
// Maps a code point to the index of its script in scriptNames (filled by readScripts()).
1548 static QHash<int, int> scriptAssignment;
// Maps a scriptNames index to itself for scripts that get special processing and
// to 0 for scripts that are an alias for 'Common' (filled by createScriptEnumDeclaration()).
1549 static QHash<int, int> scriptHash;
// A block of code points that does not map to a single script
// (see createScriptTableDeclaration()).
1551 struct ExtraBlock {
// first code point of the block
1552 int block;
// scriptNames index of every code point in the block
1553 QVector<int> vector;
// All mixed blocks in the order they were encountered.
1556 static QList<ExtraBlock> extraBlockList;
1559 static void readScripts()
1561 scriptNames.append("Common");
1563 static const char *files[] = {
1564 "data/ScriptsInitial.txt",
1565 "data/Scripts.txt",
1566 "data/ScriptsCorrections.txt"
1568 enum { fileCount = sizeof(files) / sizeof(const char *) };
1570 for (int i = 0; i < fileCount; ++i) {
1571 QFile f(files[i]);
1572 if (!f.exists())
1573 qFatal("Couldn't find %s", files[i]);
1576 f.open(QFile::ReadOnly);
1578 while (!f.atEnd()) {
1579 QByteArray line = f.readLine();
1580 line.resize(line.size() - 1);
1582 int comment = line.indexOf("#");
1583 if (comment >= 0)
1584 line = line.left(comment);
1586 line.replace(" ", "");
1587 line.replace("_", "");
1589 if (line.isEmpty())
1590 continue;
1592 int semicolon = line.indexOf(';');
1593 Q_ASSERT(semicolon >= 0);
1594 QByteArray codePoints = line.left(semicolon);
1595 QByteArray scriptName = line.mid(semicolon + 1);
1597 int scriptIndex = scriptNames.indexOf(scriptName);
1598 if (scriptIndex < 0) {
1599 scriptNames.append(scriptName);
1600 scriptIndex = scriptNames.indexOf(scriptName);
1601 Q_ASSERT(scriptIndex >= 0);
1604 int dotdot = codePoints.indexOf("..");
1605 bool unused;
1606 int first = -1, last = -1;
1607 if (dotdot >= 0) {
1608 first = codePoints.left(dotdot).toInt(&unused, 16);
1609 last = codePoints.mid(dotdot + 2).toInt(&unused, 16);
1610 } else {
1611 first = codePoints.toInt(&unused, 16);
1614 if (last != -1) {
1615 for (int i = first; i <= last; ++i)
1616 scriptAssignment[i] = scriptIndex;
1617 } else {
1618 scriptAssignment[first] = scriptIndex;
1625 static int scriptSentinel = 0;
1627 QByteArray createScriptEnumDeclaration()
1629 static const char *specialScripts[] = {
1630 "Common",
1631 "Arabic",
1632 "Armenian",
1633 "Bengali",
1634 "Cyrillic",
1635 "Devanagari",
1636 "Georgian",
1637 "Greek",
1638 "Gujarati",
1639 "Gurmukhi",
1640 "Hangul",
1641 "Hebrew",
1642 "Kannada",
1643 "Khmer",
1644 "Lao",
1645 "Malayalam",
1646 "Myanmar",
1647 "Ogham",
1648 "Oriya",
1649 "Runic",
1650 "Sinhala",
1651 "Syriac",
1652 "Tamil",
1653 "Telugu",
1654 "Thaana",
1655 "Thai",
1656 "Tibetan",
1657 "Inherited"
1659 const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *);
1661 // generate script enum
1662 QByteArray declaration;
1664 declaration += " // See http://www.unicode.org/reports/tr24/tr24-5.html\n\n";
1665 declaration += " enum Script {\n Common";
1667 int uniqueScripts = 1; // Common
1669 // output the ones with special processing first
1670 for (int i = 1; i < scriptNames.size(); ++i) {
1671 QByteArray scriptName = scriptNames.at(i);
1672 // does the script require special processing?
1673 bool special = false;
1674 for (int s = 0; !special && s < specialScriptsCount; ++s) {
1675 if (scriptName == specialScripts[s])
1676 special = true;
1678 if (!special) {
1679 scriptHash[i] = 0; // alias for 'Common'
1680 continue;
1681 } else {
1682 ++uniqueScripts;
1683 scriptHash[i] = i;
1686 declaration += ",\n ";
1687 declaration += scriptName;
1689 declaration += ",\n ScriptCount = Inherited";
1691 // output the ones that are an alias for 'Common'
1692 for (int i = 1; i < scriptNames.size(); ++i) {
1693 if (scriptHash.value(i) != 0)
1694 continue;
1695 QByteArray scriptName = scriptNames.at(i);
1696 scriptName += " = Common";
1697 declaration += ",\n ";
1698 declaration += scriptName;
1701 declaration += "\n };\n";
1703 scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32
1704 declaration += " enum { ScriptSentinel = ";
1705 declaration += QByteArray::number(scriptSentinel);
1706 declaration += " };\n\n";
1707 return declaration;
1710 QByteArray createScriptTableDeclaration()
1712 Q_ASSERT(scriptSentinel > 0);
1714 QByteArray declaration;
1716 const int unicodeBlockCount = 512; // number of unicode blocks
1717 const int unicodeBlockSize = 128; // size of each block
1718 declaration = "enum { UnicodeBlockCount = ";
1719 declaration += QByteArray::number(unicodeBlockCount);
1720 declaration += " }; // number of unicode blocks\n";
1721 declaration += "enum { UnicodeBlockSize = ";
1722 declaration += QByteArray::number(unicodeBlockSize);
1723 declaration += " }; // size of each block\n\n";
1725 // script table
1726 declaration += "namespace QUnicodeTables {\n\nstatic const unsigned char uc_scripts[] = {\n";
1727 for (int i = 0; i < unicodeBlockCount; ++i) {
1728 int block = (((i << 7) & 0xff00) | ((i & 1) * 0x80));
1729 int blockAssignment[unicodeBlockSize];
1730 for (int x = 0; x < unicodeBlockSize; ++x) {
1731 int codePoint = (i << 7) | x;
1732 blockAssignment[x] = scriptAssignment.value(codePoint, 0);
1734 bool allTheSame = true;
1735 const int originalScript = blockAssignment[0];
1736 const int script = scriptHash.value(originalScript);
1737 for (int x = 1; allTheSame && x < unicodeBlockSize; ++x) {
1738 const int s = scriptHash.value(blockAssignment[x]);
1739 if (s != script)
1740 allTheSame = false;
1743 if (allTheSame) {
1744 declaration += " ";
1745 declaration += scriptNames.value(originalScript);
1746 declaration += ", /* U+";
1747 declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1748 declaration += '-';
1749 declaration +=
1750 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1751 declaration += " */\n";
1752 } else {
1753 const int value = extraBlockList.size() + scriptSentinel;
1754 const int offset =
1755 ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
1757 declaration += " ";
1758 declaration += QByteArray::number(value);
1759 declaration += ", /* U+";
1760 declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1761 declaration += '-';
1762 declaration +=
1763 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1764 declaration += " at offset ";
1765 declaration += QByteArray::number(offset);
1766 declaration += " */\n";
1768 ExtraBlock extraBlock;
1769 extraBlock.block = block;
1770 extraBlock.vector.resize(unicodeBlockSize);
1771 for (int x = 0; x < unicodeBlockSize; ++x)
1772 extraBlock.vector[x] = blockAssignment[x];
1774 extraBlockList.append(extraBlock);
1778 for (int i = 0; i < extraBlockList.size(); ++i) {
1779 const int value = i + scriptSentinel;
1780 const int offset =
1781 ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
1782 const ExtraBlock &extraBlock = extraBlockList.at(i);
1783 const int block = extraBlock.block;
1785 declaration += "\n\n /* U+";
1786 declaration += QByteArray::number(block, 16).rightJustified(4, '0');
1787 declaration += '-';
1788 declaration +=
1789 QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0');
1790 declaration += " at offset ";
1791 declaration += QByteArray::number(offset);
1792 declaration += " */\n ";
1794 for (int x = 0; x < extraBlock.vector.size(); ++x) {
1795 const int o = extraBlock.vector.at(x);
1797 declaration += scriptNames.value(o);
1798 if (x < extraBlock.vector.size() - 1 || i < extraBlockList.size() - 1)
1799 declaration += ',';
1800 if ((x & 7) == 7 && x < extraBlock.vector.size() - 1)
1801 declaration += "\n ";
1802 else
1803 declaration += ' ';
1806 declaration += "\n};\n\n} // namespace QUnicodeTables\n\n";
1808 qDebug("createScriptTableDeclaration: table size is %d bytes",
1809 unicodeBlockCount + (extraBlockList.size() * unicodeBlockSize));
1811 return declaration;
1814 #if 0
// Debugging aid: prints selected fields of the UnicodeData entry of every
// code point in [from, to] with qDebug(). Code points without an entry in
// unicodeData are printed with a default constructed UnicodeData(i).
1815 static void dump(int from, int to)
1817 for (int i = from; i <= to; ++i) {
1818 UnicodeData d = unicodeData.value(i, UnicodeData(i));
1819 qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
1820 i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
// the decomposition is only printed for code points that have one
1821 if (d.decompositionType != QChar::NoDecomposition) {
1822 qDebug(" decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
1823 d.decomposition[0]);
1826 qDebug(" ");
1828 #endif
1830 struct PropertyBlock {
1831 PropertyBlock() { index = -1; }
1832 int index;
1833 QList<int> properties;
1834 bool operator ==(const PropertyBlock &other) { return properties == other.properties; }
1837 static QByteArray createPropertyInfo()
1839 qDebug("createPropertyInfo:");
1841 const int BMP_BLOCKSIZE=32;
1842 const int BMP_SHIFT = 5;
1843 const int BMP_END = 0x11000;
1844 const int SMP_END = 0x110000;
1845 const int SMP_BLOCKSIZE = 256;
1846 const int SMP_SHIFT = 8;
1848 QList<PropertyBlock> blocks;
1849 QList<int> blockMap;
1851 int used = 0;
1853 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
1854 PropertyBlock b;
1855 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
1856 int uc = block*BMP_BLOCKSIZE + i;
1857 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
1858 b.properties.append(d.propertyIndex);
1860 int index = blocks.indexOf(b);
1861 if (index == -1) {
1862 index = blocks.size();
1863 b.index = used;
1864 used += BMP_BLOCKSIZE;
1865 blocks.append(b);
1867 blockMap.append(blocks.at(index).index);
1870 int bmp_blocks = blocks.size();
1871 Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
1873 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
1874 PropertyBlock b;
1875 for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
1876 int uc = block*SMP_BLOCKSIZE + i;
1877 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
1878 b.properties.append(d.propertyIndex);
1880 int index = blocks.indexOf(b);
1881 if (index == -1) {
1882 index = blocks.size();
1883 b.index = used;
1884 used += SMP_BLOCKSIZE;
1885 blocks.append(b);
1887 blockMap.append(blocks.at(index).index);
1890 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
1891 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
1892 int bmp_mem = bmp_block_data + bmp_trie;
1893 qDebug(" %d unique blocks in BMP.",blocks.size());
1894 qDebug(" block data uses: %d bytes", bmp_block_data);
1895 qDebug(" trie data uses : %d bytes", bmp_trie);
1897 int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
1898 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
1899 int smp_mem = smp_block_data + smp_trie;
1900 qDebug(" %d unique blocks in SMP.",blocks.size()-bmp_blocks);
1901 qDebug(" block data uses: %d bytes", smp_block_data);
1902 qDebug(" trie data uses : %d bytes", smp_trie);
1904 qDebug("\n properties use : %d bytes", uniqueProperties.size()*20);
1905 qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + uniqueProperties.size()*20);
1907 QByteArray out;
1908 out += "static const unsigned short uc_property_trie[] = {\n";
1910 // first write the map
1911 out += " // 0x" + QByteArray::number(BMP_END, 16);
1912 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
1913 if (!(i % 8)) {
1914 if (out.endsWith(' '))
1915 out.chop(1);
1916 if (!((i*BMP_BLOCKSIZE) % 0x1000))
1917 out += "\n";
1918 out += "\n ";
1920 out += QByteArray::number(blockMap.at(i) + blockMap.size());
1921 out += ", ";
1923 if (out.endsWith(' '))
1924 out.chop(1);
1925 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";;
1926 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
1927 if (!(i % 8)) {
1928 if (out.endsWith(' '))
1929 out.chop(1);
1930 if (!(i % (0x10000/SMP_BLOCKSIZE)))
1931 out += "\n";
1932 out += "\n ";
1934 out += QByteArray::number(blockMap.at(i) + blockMap.size());
1935 out += ", ";
1937 if (out.endsWith(' '))
1938 out.chop(1);
1939 out += "\n";
1940 // write the data
1941 for (int i = 0; i < blocks.size(); ++i) {
1942 if (out.endsWith(' '))
1943 out.chop(1);
1944 out += "\n";
1945 const PropertyBlock &b = blocks.at(i);
1946 for (int j = 0; j < b.properties.size(); ++j) {
1947 if (!(j % 8)) {
1948 if (out.endsWith(' '))
1949 out.chop(1);
1950 out += "\n ";
1952 out += QByteArray::number(b.properties.at(j));
1953 out += ", ";
1957 // we reserve one bit more than in the assert below for the sign
1958 Q_ASSERT(maxMirroredDiff < (1<<12));
1959 Q_ASSERT(maxLowerCaseDiff < (1<<14));
1960 Q_ASSERT(maxUpperCaseDiff < (1<<14));
1961 Q_ASSERT(maxTitleCaseDiff < (1<<14));
1962 Q_ASSERT(maxCaseFoldDiff < (1<<14));
1964 if (out.endsWith(' '))
1965 out.chop(1);
1966 out += "\n};\n\n"
1968 "#define GET_PROP_INDEX(ucs4) \\\n"
1969 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
1970 " ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
1971 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
1972 " : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
1973 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
1974 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
1975 "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
1976 "(uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
1977 "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n"
1980 "static const QUnicodeTables::Properties uc_properties [] = {\n";
1982 // keep in sync with the property declaration
1983 for (int i = 0; i < uniqueProperties.size(); ++i) {
1984 PropertyFlags p = uniqueProperties.at(i);
1985 out += " { ";
1986 // " ushort category : 8;\n"
1987 out += QByteArray::number( p.category );
1988 out += ", ";
1989 // " ushort line_break_class : 8;\n"
1990 out += QByteArray::number( p.line_break_class );
1991 out += ", ";
1992 // " ushort direction : 8;\n"
1993 out += QByteArray::number( p.direction );
1994 out += ", ";
1995 // " ushort combiningClass :8;\n"
1996 out += QByteArray::number( p.combiningClass );
1997 out += ", ";
1998 // " ushort joining : 2;\n"
1999 out += QByteArray::number( p.joining );
2000 out += ", ";
2001 // " signed short digitValue : 6;\n /* 5 needed */"
2002 out += QByteArray::number( p.digitValue );
2003 out += ", ";
2004 // " ushort unicodeVersion : 4;\n"
2005 out += QByteArray::number( p.age );
2006 out += ", ";
2007 // " ushort lowerCaseSpecial : 1;\n"
2008 // " ushort upperCaseSpecial : 1;\n"
2009 // " ushort titleCaseSpecial : 1;\n"
2010 // " ushort caseFoldSpecial : 1;\n"
2011 out += QByteArray::number( p.lowerCaseSpecial );
2012 out += ", ";
2013 out += QByteArray::number( p.upperCaseSpecial );
2014 out += ", ";
2015 out += QByteArray::number( p.titleCaseSpecial );
2016 out += ", ";
2017 out += QByteArray::number( p.caseFoldSpecial );
2018 out += ", ";
2019 // " signed short mirrorDiff : 16;\n"
2020 // " signed short lowerCaseDiff : 16;\n"
2021 // " signed short upperCaseDiff : 16;\n"
2022 // " signed short titleCaseDiff : 16;\n"
2023 // " signed short caseFoldDiff : 16;\n"
2024 out += QByteArray::number( p.mirrorDiff );
2025 out += ", ";
2026 out += QByteArray::number( p.lowerCaseDiff );
2027 out += ", ";
2028 out += QByteArray::number( p.upperCaseDiff );
2029 out += ", ";
2030 out += QByteArray::number( p.titleCaseDiff );
2031 out += ", ";
2032 out += QByteArray::number( p.caseFoldDiff );
2033 out += ", ";
2034 out += QByteArray::number( p.graphemeBreak );
2035 out += ", ";
2036 out += QByteArray::number( p.wordBreak );
2037 out += ", ";
2038 out += QByteArray::number( p.sentenceBreak );
2039 out += "},\n";
2041 out += "};\n\n";
2043 out += "static inline const QUnicodeTables::Properties *qGetProp(uint ucs4)\n"
2044 "{\n"
2045 " int index = GET_PROP_INDEX(ucs4);\n"
2046 " return uc_properties + index;\n"
2047 "}\n"
2048 "\n"
2049 "static inline const QUnicodeTables::Properties *qGetProp(ushort ucs2)\n"
2050 "{\n"
2051 " int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2052 " return uc_properties + index;\n"
2053 "}\n"
2054 "\n"
2055 "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(uint ucs4)\n"
2056 "{\n"
2057 " int index = GET_PROP_INDEX(ucs4);\n"
2058 " return uc_properties + index;\n"
2059 "}\n"
2060 "\n"
2061 "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(ushort ucs2)\n"
2062 "{\n"
2063 " int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2064 " return uc_properties + index;\n"
2065 "}\n\n";
2067 out += "#define CURRENT_VERSION "CURRENT_UNICODE_VERSION"\n\n";
2069 out += "static const ushort specialCaseMap [] = {";
2070 for (int i = 0; i < specialCaseMap.size(); ++i) {
2071 if (!(i % 16))
2072 out += "\n ";
2073 out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16);
2074 if (i < specialCaseMap.size() - 1)
2075 out += ",";
2077 out += "\n};\n";
2078 out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n";
2080 qDebug() << "Special case map uses " << specialCaseMap.size()*2 << "bytes";
2082 return out;
2086 struct DecompositionBlock {
2087 DecompositionBlock() { index = -1; }
2088 int index;
2089 QList<int> decompositionPositions;
2090 bool operator ==(const DecompositionBlock &other)
2091 { return decompositionPositions == other.decompositionPositions; }
2094 static QByteArray createCompositionInfo()
2096 qDebug("createCompositionInfo:");
2098 const int BMP_BLOCKSIZE=16;
2099 const int BMP_SHIFT = 4;
2100 const int BMP_END = 0x3400; // start of Han
2101 const int SMP_END = 0x30000;
2102 const int SMP_BLOCKSIZE = 256;
2103 const int SMP_SHIFT = 8;
2105 if(SMP_END <= highestComposedCharacter)
2106 qFatal("end of table smaller than highest composed character at %x", highestComposedCharacter);
2108 QList<DecompositionBlock> blocks;
2109 QList<int> blockMap;
2110 QList<unsigned short> decompositions;
2112 int used = 0;
2113 int tableIndex = 0;
2115 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2116 DecompositionBlock b;
2117 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2118 int uc = block*BMP_BLOCKSIZE + i;
2119 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2120 if (!d.decomposition.isEmpty()) {
2121 int utf16Chars = 0;
2122 for (int j = 0; j < d.decomposition.size(); ++j)
2123 utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1;
2124 decompositions.append(d.decompositionType + (utf16Chars<<8));
2125 for (int j = 0; j < d.decomposition.size(); ++j) {
2126 int code = d.decomposition.at(j);
2127 if (code > 0x10000) {
2128 // save as surrogate pair
2129 code -= 0x10000;
2130 ushort high = code/0x400 + 0xd800;
2131 ushort low = code%0x400 + 0xdc00;
2132 decompositions.append(high);
2133 decompositions.append(low);
2134 } else {
2135 decompositions.append(code);
2138 b.decompositionPositions.append(tableIndex);
2139 tableIndex += utf16Chars + 1;
2140 } else {
2141 b.decompositionPositions.append(0xffff);
2144 int index = blocks.indexOf(b);
2145 if (index == -1) {
2146 index = blocks.size();
2147 b.index = used;
2148 used += BMP_BLOCKSIZE;
2149 blocks.append(b);
2151 blockMap.append(blocks.at(index).index);
2154 int bmp_blocks = blocks.size();
2155 Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
2157 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2158 DecompositionBlock b;
2159 for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2160 int uc = block*SMP_BLOCKSIZE + i;
2161 UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
2162 if (!d.decomposition.isEmpty()) {
2163 int utf16Chars = 0;
2164 for (int j = 0; j < d.decomposition.size(); ++j)
2165 utf16Chars += d.decomposition.at(j) > 0x10000 ? 2 : 1;
2166 decompositions.append(d.decompositionType + (utf16Chars<<8));
2167 for (int j = 0; j < d.decomposition.size(); ++j) {
2168 int code = d.decomposition.at(j);
2169 if (code > 0x10000) {
2170 // save as surrogate pair
2171 code -= 0x10000;
2172 ushort high = code/0x400 + 0xd800;
2173 ushort low = code%0x400 + 0xdc00;
2174 decompositions.append(high);
2175 decompositions.append(low);
2176 } else {
2177 decompositions.append(code);
2180 b.decompositionPositions.append(tableIndex);
2181 tableIndex += utf16Chars + 1;
2182 } else {
2183 b.decompositionPositions.append(0xffff);
2186 int index = blocks.indexOf(b);
2187 if (index == -1) {
2188 index = blocks.size();
2189 b.index = used;
2190 used += SMP_BLOCKSIZE;
2191 blocks.append(b);
2193 blockMap.append(blocks.at(index).index);
2196 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2197 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2198 int bmp_mem = bmp_block_data + bmp_trie;
2199 qDebug(" %d unique blocks in BMP.",blocks.size());
2200 qDebug(" block data uses: %d bytes", bmp_block_data);
2201 qDebug(" trie data uses : %d bytes", bmp_trie);
2202 qDebug(" memory usage: %d bytes", bmp_mem);
2204 int smp_block_data = (blocks.size()- bmp_blocks)*SMP_BLOCKSIZE*2;
2205 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
2206 int smp_mem = smp_block_data + smp_trie;
2207 qDebug(" %d unique blocks in SMP.",blocks.size()-bmp_blocks);
2208 qDebug(" block data uses: %d bytes", smp_block_data);
2209 qDebug(" trie data uses : %d bytes", smp_trie);
2211 qDebug("\n decomposition table use : %d bytes", decompositions.size()*2);
2212 qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2);
2214 QByteArray out;
2216 out += "static const unsigned short uc_decomposition_trie[] = {\n";
2218 // first write the map
2219 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
2220 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2221 if (!(i % 8)) {
2222 if (out.endsWith(' '))
2223 out.chop(1);
2224 if (!((i*BMP_BLOCKSIZE) % 0x1000))
2225 out += "\n";
2226 out += "\n ";
2228 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2229 out += ", ";
2231 if (out.endsWith(' '))
2232 out.chop(1);
2233 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";;
2234 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2235 if (!(i % 8)) {
2236 if (out.endsWith(' '))
2237 out.chop(1);
2238 if (!(i % (0x10000/SMP_BLOCKSIZE)))
2239 out += "\n";
2240 out += "\n ";
2242 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2243 out += ", ";
2245 if (out.endsWith(' '))
2246 out.chop(1);
2247 out += "\n";
2248 // write the data
2249 for (int i = 0; i < blocks.size(); ++i) {
2250 if (out.endsWith(' '))
2251 out.chop(1);
2252 out += "\n";
2253 const DecompositionBlock &b = blocks.at(i);
2254 for (int j = 0; j < b.decompositionPositions.size(); ++j) {
2255 if (!(j % 8)) {
2256 if (out.endsWith(' '))
2257 out.chop(1);
2258 out += "\n ";
2260 out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
2261 out += ", ";
2265 if (out.endsWith(' '))
2266 out.chop(1);
2267 out += "\n};\n\n"
2269 "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
2270 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2271 " ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2272 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2273 " : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
2274 " ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2275 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2276 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
2277 " : 0xffff))\n\n"
2279 "static const unsigned short uc_decomposition_map[] = {\n";
2281 for (int i = 0; i < decompositions.size(); ++i) {
2282 if (!(i % 8)) {
2283 if (out.endsWith(' '))
2284 out.chop(1);
2285 out += "\n ";
2287 out += "0x" + QByteArray::number(decompositions.at(i), 16);
2288 out += ", ";
2291 if (out.endsWith(' '))
2292 out.chop(1);
2293 out += "\n};\n\n";
2295 return out;
2298 static QByteArray createLigatureInfo()
2300 qDebug("createLigatureInfo: numLigatures=%d", numLigatures);
2302 QList<DecompositionBlock> blocks;
2303 QList<int> blockMap;
2304 QList<unsigned short> ligatures;
2306 const int BMP_BLOCKSIZE = 32;
2307 const int BMP_SHIFT = 5;
2308 const int BMP_END = 0x3100;
2309 Q_ASSERT(highestLigature < BMP_END);
2311 int used = 0;
2312 int tableIndex = 0;
2314 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2315 DecompositionBlock b;
2316 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2317 int uc = block*BMP_BLOCKSIZE + i;
2318 QList<Ligature> l = ligatureHashes.value(uc);
2319 if (!l.isEmpty()) {
2320 b.decompositionPositions.append(tableIndex);
2321 qSort(l);
2323 ligatures.append(l.size());
2324 for (int i = 0; i < l.size(); ++i) {
2325 Q_ASSERT(l.at(i).u2 == uc);
2326 ligatures.append(l.at(i).u1);
2327 ligatures.append(l.at(i).ligature);
2329 tableIndex += 2*l.size() + 1;
2330 } else {
2331 b.decompositionPositions.append(0xffff);
2334 int index = blocks.indexOf(b);
2335 if (index == -1) {
2336 index = blocks.size();
2337 b.index = used;
2338 used += BMP_BLOCKSIZE;
2339 blocks.append(b);
2341 blockMap.append(blocks.at(index).index);
2344 int bmp_blocks = blocks.size();
2345 Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
2347 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
2348 int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
2349 int bmp_mem = bmp_block_data + bmp_trie;
2350 qDebug(" %d unique blocks in BMP.",blocks.size());
2351 qDebug(" block data uses: %d bytes", bmp_block_data);
2352 qDebug(" trie data uses : %d bytes", bmp_trie);
2353 qDebug(" ligature data uses : %d bytes", ligatures.size()*2);
2354 qDebug(" memory usage: %d bytes", bmp_mem + ligatures.size() * 2);
2356 QByteArray out;
2359 out += "static const unsigned short uc_ligature_trie[] = {\n";
2361 // first write the map
2362 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
2363 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2364 if (!(i % 8)) {
2365 if (out.endsWith(' '))
2366 out.chop(1);
2367 if (!((i*BMP_BLOCKSIZE) % 0x1000))
2368 out += "\n";
2369 out += "\n ";
2371 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2372 out += ", ";
2374 if (out.endsWith(' '))
2375 out.chop(1);
2376 out += "\n";
2377 // write the data
2378 for (int i = 0; i < blocks.size(); ++i) {
2379 if (out.endsWith(' '))
2380 out.chop(1);
2381 out += "\n";
2382 const DecompositionBlock &b = blocks.at(i);
2383 for (int j = 0; j < b.decompositionPositions.size(); ++j) {
2384 if (!(j % 8)) {
2385 if (out.endsWith(' '))
2386 out.chop(1);
2387 out += "\n ";
2389 out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
2390 out += ", ";
2393 if (out.endsWith(' '))
2394 out.chop(1);
2395 out += "\n};\n\n"
2397 "#define GET_LIGATURE_INDEX(u2) "
2398 "(u2 < 0x" + QByteArray::number(BMP_END, 16) + " ? "
2399 "uc_ligature_trie[uc_ligature_trie[u2>>" + QByteArray::number(BMP_SHIFT) +
2400 "] + (u2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")] : 0xffff);\n\n"
2402 "static const unsigned short uc_ligature_map [] = {\n";
2404 for (int i = 0; i < ligatures.size(); ++i) {
2405 if (!(i % 8)) {
2406 if (out.endsWith(' '))
2407 out.chop(1);
2408 out += "\n ";
2410 out += "0x" + QByteArray::number(ligatures.at(i), 16);
2411 out += ", ";
2414 if (out.endsWith(' '))
2415 out.chop(1);
2416 out += "\n};\n\n";
2418 return out;
2421 QByteArray createCasingInfo()
2423 QByteArray out;
2425 out += "struct CasingInfo {\n"
2426 " uint codePoint : 16;\n"
2427 " uint flags : 8;\n"
2428 " uint offset : 8;\n"
2429 "};\n\n";
2431 return out;
2434 int main(int, char **)
2436 initCategoryMap();
2437 initDirectionMap();
2438 initDecompositionMap();
2439 initGraphemeBreak();
2440 initWordBreak();
2441 initSentenceBreak();
2443 readUnicodeData();
2444 readBidiMirroring();
2445 readArabicShaping();
2446 readDerivedAge();
2447 readCompositionExclusion();
2448 readLineBreak();
2449 readSpecialCasing();
2450 readCaseFolding();
2451 // readBlocks();
2452 readScripts();
2453 readGraphemeBreak();
2454 readWordBreak();
2455 readSentenceBreak();
2457 computeUniqueProperties();
2458 QByteArray properties = createPropertyInfo();
2459 QByteArray compositions = createCompositionInfo();
2460 QByteArray ligatures = createLigatureInfo();
2461 QByteArray normalizationCorrections = createNormalizationCorrections();
2462 QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
2463 QByteArray scriptTableDeclaration = createScriptTableDeclaration();
2465 QFile f("../../src/corelib/tools/qunicodetables.cpp");
2466 f.open(QFile::WriteOnly|QFile::Truncate);
2468 QByteArray header =
2469 "/****************************************************************************\n"
2470 "**\n"
2471 "** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).\n"
2472 "** All rights reserved.\n"
2473 "** Contact: Nokia Corporation (qt-info@nokia.com)\n"
2474 "**\n"
2475 "** This file is part of the QtCore module of the Qt Toolkit.\n"
2476 "**\n"
2477 "** $QT_BEGIN_LICENSE:LGPL$\n"
2478 "** No Commercial Usage\n"
2479 "** This file contains pre-release code and may not be distributed.\n"
2480 "** You may use this file in accordance with the terms and conditions\n"
2481 "** contained in the Technology Preview License Agreement accompanying\n"
2482 "** this package.\n"
2483 "**\n"
2484 "** GNU Lesser General Public License Usage\n"
2485 "** Alternatively, this file may be used under the terms of the GNU Lesser\n"
2486 "** General Public License version 2.1 as published by the Free Software\n"
2487 "** Foundation and appearing in the file LICENSE.LGPL included in the\n"
2488 "** packaging of this file. Please review the following information to\n"
2489 "** ensure the GNU Lesser General Public License version 2.1 requirements\n"
2490 "** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
2491 "**\n"
2492 "** In addition, as a special exception, Nokia gives you certain additional\n"
2493 "** rights. These rights are described in the Nokia Qt LGPL Exception\n"
2494 "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
2495 "**\n"
2496 "** If you have questions regarding the use of this file, please contact\n"
2497 "** Nokia at qt-info@nokia.com.\n"
2498 "**\n"
2499 "**\n"
2500 "**\n"
2501 "**\n"
2502 "**\n"
2503 "**\n"
2504 "**\n"
2505 "**\n"
2506 "** $QT_END_LICENSE$\n"
2507 "**\n"
2508 "****************************************************************************/\n\n"
2510 "/* This file is autogenerated from the Unicode 5.0 database. Do not edit */\n\n";
2512 QByteArray warning =
2513 "//\n"
2514 "// W A R N I N G\n"
2515 "// -------------\n"
2516 "//\n"
2517 "// This file is not part of the Qt API. It exists for the convenience\n"
2518 "// of internal files. This header file may change from version to version\n"
2519 "// without notice, or even be removed.\n"
2520 "//\n"
2521 "// We mean it.\n"
2522 "//\n\n";
2524 f.write(header);
2525 f.write("QT_BEGIN_NAMESPACE\n\n");
2526 f.write(properties);
2527 f.write(compositions);
2528 f.write(ligatures);
2529 f.write(normalizationCorrections);
2530 f.write(scriptTableDeclaration);
2531 f.write("\nQT_END_NAMESPACE\n");
2532 f.close();
2534 f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
2535 f.open(QFile::WriteOnly | QFile::Truncate);
2536 f.write(header);
2537 f.write(warning);
2538 f.write("#ifndef QUNICODETABLES_P_H\n"
2539 "#define QUNICODETABLES_P_H\n\n"
2540 "#include <QtCore/qchar.h>\n\n"
2541 "QT_BEGIN_NAMESPACE\n\n");
2542 f.write("namespace QUnicodeTables {\n");
2543 f.write(property_string);
2544 f.write("\n");
2545 f.write(scriptEnumDeclaration);
2546 f.write("\n");
2547 f.write(lineBreakClass);
2548 f.write("\n");
2549 f.write(methods);
2550 f.write("\n");
2551 f.write(grapheme_break_string);
2552 f.write("\n");
2553 f.write(word_break_string);
2554 f.write("\n");
2555 f.write(sentence_break_string);
2556 f.write("\n}\n\n"
2557 "QT_END_NAMESPACE\n\n"
2558 "#endif\n");
2559 f.close();
2561 qDebug() << "maxMirroredDiff = " << hex << maxMirroredDiff;
2562 qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
2563 qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
2564 qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
2565 qDebug() << "maxCaseFoldDiff = " << hex << maxCaseFoldDiff;
2566 #if 0
2567 // dump(0, 0x7f);
2568 // dump(0x620, 0x640);
2569 // dump(0x10000, 0x10020);
2570 // dump(0x10800, 0x10820);
2572 qDebug("decompositionLength used:");
2573 int totalcompositions = 0;
2574 int sum = 0;
2575 for (int i = 1; i < 20; ++i) {
2576 qDebug(" length %d used %d times", i, decompositionLength.value(i, 0));
2577 totalcompositions += i*decompositionLength.value(i, 0);
2578 sum += decompositionLength.value(i, 0);
2580 qDebug(" len decomposition map %d, average length %f, num composed chars %d",
2581 totalcompositions, (float)totalcompositions/(float)sum, sum);
2582 qDebug("highest composed character %x", highestComposedCharacter);
2583 qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
2585 qBubbleSort(ligatures);
2586 for (int i = 0; i < ligatures.size(); ++i)
2587 qDebug("%s", ligatures.at(i).data());
2589 // qDebug("combiningClass usage:");
2590 // int numClasses = 0;
2591 // for (int i = 0; i < 255; ++i) {
2592 // int num = combiningClassUsage.value(i, 0);
2593 // if (num) {
2594 // ++numClasses;
2595 // qDebug(" combiningClass %d used %d times", i, num);
2596 // }
2597 // }
2598 // qDebug("total of %d combining classes used", numClasses);
2600 #endif