1 /****************************************************************************
3 ** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
4 ** All rights reserved.
5 ** Contact: Nokia Corporation (qt-info@nokia.com)
7 ** This file is part of the utils of the Qt Toolkit.
9 ** $QT_BEGIN_LICENSE:LGPL$
10 ** No Commercial Usage
11 ** This file contains pre-release code and may not be distributed.
12 ** You may use this file in accordance with the terms and conditions
13 ** contained in the Technology Preview License Agreement accompanying
16 ** GNU Lesser General Public License Usage
17 ** Alternatively, this file may be used under the terms of the GNU Lesser
18 ** General Public License version 2.1 as published by the Free Software
19 ** Foundation and appearing in the file LICENSE.LGPL included in the
20 ** packaging of this file. Please review the following information to
21 ** ensure the GNU Lesser General Public License version 2.1 requirements
22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
24 ** In addition, as a special exception, Nokia gives you certain additional
25 ** rights. These rights are described in the Nokia Qt LGPL Exception
26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
28 ** If you have questions regarding the use of this file, please contact
29 ** Nokia at qt-info@nokia.com.
40 ****************************************************************************/
46 #include <private/qunicodetables_p.h>
// Lookup table pairing a Unicode version string with the matching
// QChar::UnicodeVersion value. readDerivedAge() walks this table and compares
// each entry's string (accessed there as map->age) against the version field
// parsed from data/DerivedAge.txt.
51 static struct AgeMap
{
53 const QChar::UnicodeVersion version
;
55 { "1.1", QChar::Unicode_1_1
},
56 { "2.0", QChar::Unicode_2_0
},
// Note: the "2.1" version string is mapped to QChar::Unicode_2_1_2.
57 { "2.1", QChar::Unicode_2_1_2
},
58 { "3.0", QChar::Unicode_3_0
},
59 { "3.1", QChar::Unicode_3_1
},
60 { "3.2", QChar::Unicode_3_2
},
61 { "4.0", QChar::Unicode_4_0
},
62 { "4.1", QChar::Unicode_4_1
},
63 { "5.0", QChar::Unicode_5_0
},
// Entry with a null string and Unicode_Unassigned; presumably the end-of-table
// sentinel -- the loop that consumes the table is not fully visible here,
// TODO confirm.
64 { 0, QChar::Unicode_Unassigned
}
66 #define CURRENT_UNICODE_VERSION "QChar::Unicode_5_0"
68 static const char *grapheme_break_string
=
69 " enum GraphemeBreak {\n"
70 " GraphemeBreakOther, \n"
73 " GraphemeBreakControl,\n"
74 " GraphemeBreakExtend,\n"
95 QHash
<QByteArray
, GraphemeBreak
> grapheme_break_map
;
97 static void initGraphemeBreak()
99 struct GraphemeBreakList
{
103 { GraphemeBreakOther
, "Other" },
104 { GraphemeBreakCR
, "CR" },
105 { GraphemeBreakLF
, "LF" },
106 { GraphemeBreakControl
, "Control" },
107 { GraphemeBreakExtend
, "Extend" },
108 { GraphemeBreakL
, "L" },
109 { GraphemeBreakV
, "V" },
110 { GraphemeBreakT
, "T" },
111 { GraphemeBreakLV
, "LV" },
112 { GraphemeBreakLVT
, "LVT" },
113 { GraphemeBreakOther
, 0 }
115 GraphemeBreakList
*d
= breaks
;
117 grapheme_break_map
.insert(d
->name
, d
->brk
);
122 const char *word_break_string
=
123 " enum WordBreak {\n"
125 " WordBreakFormat,\n"
126 " WordBreakKatakana,\n"
127 " WordBreakALetter,\n"
128 " WordBreakMidLetter,\n"
129 " WordBreakMidNum,\n"
130 " WordBreakNumeric,\n"
131 " WordBreakExtendNumLet\n"
142 WordBreakExtendNumLet
146 QHash
<QByteArray
, WordBreak
> word_break_map
;
148 static void initWordBreak()
150 struct WordBreakList
{
154 { WordBreakFormat
, "Format" },
155 { WordBreakFormat
, "Extend" }, // these are copied in from GraphemeBreakProperty.txt
156 { WordBreakKatakana
, "Katakana" },
157 { WordBreakALetter
, "ALetter" },
158 { WordBreakMidLetter
, "MidLetter" },
159 { WordBreakMidNum
, "MidNum" },
160 { WordBreakNumeric
, "Numeric" },
161 { WordBreakExtendNumLet
, "ExtendNumLet" },
162 { WordBreakFormat
, 0 }
164 WordBreakList
*d
= breaks
;
166 word_break_map
.insert(d
->name
, d
->brk
);
172 static const char *sentence_break_string
=
173 " enum SentenceBreak {\n"
174 " SentenceBreakOther,\n"
175 " SentenceBreakSep,\n"
176 " SentenceBreakFormat,\n"
177 " SentenceBreakSp,\n"
178 " SentenceBreakLower,\n"
179 " SentenceBreakUpper,\n"
180 " SentenceBreakOLetter,\n"
181 " SentenceBreakNumeric,\n"
182 " SentenceBreakATerm,\n"
183 " SentenceBreakSTerm,\n"
184 " SentenceBreakClose\n"
194 SentenceBreakOLetter
,
195 SentenceBreakNumeric
,
202 QHash
<QByteArray
, SentenceBreak
> sentence_break_map
;
204 static void initSentenceBreak()
206 struct SentenceBreakList
{
210 { SentenceBreakOther
, "Other" },
211 { SentenceBreakSep
, "Sep" },
212 { SentenceBreakFormat
, "Format" },
213 { SentenceBreakSp
, "Sp" },
214 { SentenceBreakLower
, "Lower" },
215 { SentenceBreakUpper
, "Upper" },
216 { SentenceBreakOLetter
, "OLetter" },
217 { SentenceBreakNumeric
, "Numeric" },
218 { SentenceBreakATerm
, "ATerm" },
219 { SentenceBreakSTerm
, "STerm" },
220 { SentenceBreakClose
, "Close" },
221 { SentenceBreakOther
, 0 }
223 SentenceBreakList
*d
= breaks
;
225 sentence_break_map
.insert(d
->name
, d
->brk
);
231 // Keep this one in sync with the code in createPropertyInfo
232 const char *property_string
=
233 " struct Properties {\n"
234 " ushort category : 8;\n"
235 " ushort line_break_class : 8;\n"
236 " ushort direction : 8;\n"
237 " ushort combiningClass :8;\n"
238 " ushort joining : 2;\n"
239 " signed short digitValue : 6; /* 5 needed */\n"
240 " ushort unicodeVersion : 4;\n"
241 " ushort lowerCaseSpecial : 1;\n"
242 " ushort upperCaseSpecial : 1;\n"
243 " ushort titleCaseSpecial : 1;\n"
244 " ushort caseFoldSpecial : 1; /* currently unused */\n"
245 " signed short mirrorDiff : 16;\n"
246 " signed short lowerCaseDiff : 16;\n"
247 " signed short upperCaseDiff : 16;\n"
248 " signed short titleCaseDiff : 16;\n"
249 " signed short caseFoldDiff : 16;\n"
250 " ushort graphemeBreak : 8;\n"
251 " ushort wordBreak : 8;\n"
252 " ushort sentenceBreak : 8;\n"
254 " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
255 " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
257 const char *lineBreakClass
=
258 " // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
259 " // we don't use the XX, AI and CB properties and map them to AL instead.\n"
260 " // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
261 " enum LineBreakClass {\n"
262 " LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n"
263 " LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n"
264 " LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n"
265 " LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n"
266 " LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n"
267 " LineBreak_JT, LineBreak_SA, LineBreak_SG,\n"
268 " LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
271 const char *methods
=
272 " Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
273 " inline int lineBreakClass(const QChar &ch) {\n"
274 " return QUnicodeTables::lineBreakClass(ch.unicode());\n"
277 " Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
278 " Q_CORE_EXPORT_INLINE int QT_FASTCALL script(const QChar &ch) {\n"
279 " return script(ch.unicode());\n"
// The set of per-code-point properties that ends up in the generated tables.
// Values of this type are collected in the QList<PropertyFlags>
// uniqueProperties; computeUniqueProperties() uses uniqueProperties.indexOf()
// on them, which relies on the operator== below.
283 struct PropertyFlags
{
// Field-by-field equality.
// NOTE(review): the operator is not const-qualified although the visible body
// only reads members -- consider declaring it const.
// NOTE(review): no comparison of `age` is visible in this excerpt; confirm
// that it is compared, otherwise entries differing only in age would be
// treated as equal.
284 bool operator ==(const PropertyFlags
&o
) {
285 return (combiningClass
== o
.combiningClass
286 && category
== o
.category
287 && direction
== o
.direction
288 && joining
== o
.joining
290 && digitValue
== o
.digitValue
291 && line_break_class
== o
.line_break_class
292 && mirrorDiff
== o
.mirrorDiff
293 && lowerCaseDiff
== o
.lowerCaseDiff
294 && upperCaseDiff
== o
.upperCaseDiff
295 && titleCaseDiff
== o
.titleCaseDiff
296 && caseFoldDiff
== o
.caseFoldDiff
297 && lowerCaseSpecial
== o
.lowerCaseSpecial
298 && upperCaseSpecial
== o
.upperCaseSpecial
299 && titleCaseSpecial
== o
.titleCaseSpecial
300 && caseFoldSpecial
== o
.caseFoldSpecial
301 && graphemeBreak
== o
.graphemeBreak
302 && wordBreak
== o
.wordBreak
303 && sentenceBreak
== o
.sentenceBreak
306 // from UnicodeData.txt
307 uchar combiningClass
: 8;
308 QChar::Category category
: 5;
309 QChar::Direction direction
: 5;
310 // from ArabicShaping.txt
311 QChar::Joining joining
: 2;
312 // from DerivedAge.txt
313 QChar::UnicodeVersion age
: 4;
// Set by readLineBreak() from data/LineBreak.txt.
315 uint line_break_class
: 5;
// When one of these flags is set (see readSpecialCasing / readCaseFolding),
// the corresponding *CaseDiff member holds the value returned by
// appendToSpecialCaseMap() instead of a plain code point difference.
323 bool lowerCaseSpecial
;
324 bool upperCaseSpecial
;
325 bool titleCaseSpecial
;
326 bool caseFoldSpecial
;
// Set by readGraphemeBreak() / readSentenceBreak() respectively.
327 GraphemeBreak graphemeBreak
;
329 SentenceBreak sentenceBreak
;
332 QList
<int> specialCaseMap
;
333 int specialCaseMaxLen
= 0;
// Stores the code point sequence `map` in the global specialCaseMap list and
// yields an int that callers (readSpecialCasing, readCaseFolding) write into
// the lowerCaseDiff / titleCaseDiff / upperCaseDiff / caseFoldDiff fields.
// The sequence is first converted into the local list utf16map.
335 static int appendToSpecialCaseMap(const QList
<int> &map
)
338 for (int i
= 0; i
< map
.size(); ++i
) {
// Values emitted as a UTF-16 surrogate pair (high surrogate first).
341 utf16map
<< QChar::highSurrogate(val
);
342 utf16map
<< QChar::lowSurrogate(val
);
// Track the longest UTF-16 sequence seen so far.
347 specialCaseMaxLen
= qMax(specialCaseMaxLen
, utf16map
.size());
// Look for an existing occurrence of utf16map inside specialCaseMap so the
// sequence is not stored twice.
// NOTE(review): the upper bound `size() - utf16map.size() - 1` stops before
// the last positions at which utf16map would still fit; confirm that skipping
// them is intended.
350 for (int i
= 0; i
< specialCaseMap
.size() - utf16map
.size() - 1; ++i
) {
352 for (j
= 0; j
< utf16map
.size(); ++j
) {
353 if (specialCaseMap
.at(i
+j
) != utf16map
.at(j
))
// j reaching utf16map.size() means every element matched at offset i;
// presumably i is returned here (the statement is not visible in this excerpt).
356 if (j
== utf16map
.size())
// No match found: remember the current end of specialCaseMap and append the
// new sequence there.
360 int pos
= specialCaseMap
.size();
361 specialCaseMap
<< utf16map
;
// Constructs the default record for `codepoint`: no category, combining class
// 0, no decomposition, age Unicode_Unassigned, line break class AL and the
// "Other" grapheme/word/sentence break classes. Only the bidi direction
// depends on the code point: it defaults to DirL and is overridden for the
// ranges listed below. The reader functions use this as the fallback value in
// unicodeData.value(codepoint, UnicodeData(codepoint)).
366 UnicodeData(int codepoint
= 0) {
367 p
.category
= QChar::NoCategory
;
368 p
.combiningClass
= 0;
370 p
.direction
= QChar::DirL
;
371 // DirR for: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF
372 if ((codepoint
>= 0x590 && codepoint
<= 0x5ff)
373 || (codepoint
>= 0x7c0 && codepoint
<= 0x8ff)
374 || (codepoint
>= 0xfb1d && codepoint
<= 0xfb4f)
375 || (codepoint
>= 0x10800 && codepoint
<= 0x10fff))
376 p
.direction
= QChar::DirR
;
377 // DirAL for: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE
378 if ((codepoint
>= 0x600 && codepoint
<= 0x7bf)
379 || (codepoint
>= 0xfb50 && codepoint
<= 0xfdcf)
380 || (codepoint
>= 0xfdf0 && codepoint
<= 0xfdff)
381 || (codepoint
>= 0xfe70 && codepoint
<= 0xfefe))
382 p
.direction
= QChar::DirAL
;
385 decompositionType
= QChar::NoDecomposition
;
386 p
.joining
= QChar::OtherJoining
;
387 p
.age
= QChar::Unicode_Unassigned
;
// AL is also the class readLineBreak() uses for classes it does not track.
390 p
.line_break_class
= QUnicodeTables::LineBreak_AL
;
// No special (multi-character) case mappings by default.
395 p
.lowerCaseSpecial
= 0;
396 p
.upperCaseSpecial
= 0;
397 p
.titleCaseSpecial
= 0;
398 p
.caseFoldSpecial
= 0;
399 p
.graphemeBreak
= GraphemeBreakOther
;
400 p
.wordBreak
= WordBreakOther
;
401 p
.sentenceBreak
= SentenceBreakOther
;
// Set to true by readCompositionExclusion() for listed code points.
403 excludedComposition
= false;
407 // from UnicodeData.txt
408 QChar::Decomposition decompositionType
;
409 QList
<int> decomposition
;
411 QList
<int> specialFolding
;
413 // from BidiMirroring.txt
416 // CompositionExclusions.txt
417 bool excludedComposition
;
419 // computed position of unicode property set
430 UD_DecimalDigitValue
,
441 QHash
<QByteArray
, QChar::Category
> categoryMap
;
443 static void initCategoryMap()
449 { QChar::Mark_NonSpacing
, "Mn" },
450 { QChar::Mark_SpacingCombining
, "Mc" },
451 { QChar::Mark_Enclosing
, "Me" },
453 { QChar::Number_DecimalDigit
, "Nd" },
454 { QChar::Number_Letter
, "Nl" },
455 { QChar::Number_Other
, "No" },
457 { QChar::Separator_Space
, "Zs" },
458 { QChar::Separator_Line
, "Zl" },
459 { QChar::Separator_Paragraph
, "Zp" },
461 { QChar::Other_Control
, "Cc" },
462 { QChar::Other_Format
, "Cf" },
463 { QChar::Other_Surrogate
, "Cs" },
464 { QChar::Other_PrivateUse
, "Co" },
465 { QChar::Other_NotAssigned
, "Cn" },
467 { QChar::Letter_Uppercase
, "Lu" },
468 { QChar::Letter_Lowercase
, "Ll" },
469 { QChar::Letter_Titlecase
, "Lt" },
470 { QChar::Letter_Modifier
, "Lm" },
471 { QChar::Letter_Other
, "Lo" },
473 { QChar::Punctuation_Connector
, "Pc" },
474 { QChar::Punctuation_Dash
, "Pd" },
475 { QChar::Punctuation_Open
, "Ps" },
476 { QChar::Punctuation_Close
, "Pe" },
477 { QChar::Punctuation_InitialQuote
, "Pi" },
478 { QChar::Punctuation_FinalQuote
, "Pf" },
479 { QChar::Punctuation_Other
, "Po" },
481 { QChar::Symbol_Math
, "Sm" },
482 { QChar::Symbol_Currency
, "Sc" },
483 { QChar::Symbol_Modifier
, "Sk" },
484 { QChar::Symbol_Other
, "So" },
485 { QChar::NoCategory
, 0 }
488 while (c
->cat
!= QChar::NoCategory
) {
489 categoryMap
.insert(c
->name
, c
->cat
);
494 QHash
<QByteArray
, QChar::Direction
> directionMap
;
496 static void initDirectionMap()
499 QChar::Direction dir
;
502 { QChar::DirL
, "L" },
503 { QChar::DirR
, "R" },
504 { QChar::DirEN
, "EN" },
505 { QChar::DirES
, "ES" },
506 { QChar::DirET
, "ET" },
507 { QChar::DirAN
, "AN" },
508 { QChar::DirCS
, "CS" },
509 { QChar::DirB
, "B" },
510 { QChar::DirS
, "S" },
511 { QChar::DirWS
, "WS" },
512 { QChar::DirON
, "ON" },
513 { QChar::DirLRE
, "LRE" },
514 { QChar::DirLRO
, "LRO" },
515 { QChar::DirAL
, "AL" },
516 { QChar::DirRLE
, "RLE" },
517 { QChar::DirRLO
, "RLO" },
518 { QChar::DirPDF
, "PDF" },
519 { QChar::DirNSM
, "NSM" },
520 { QChar::DirBN
, "BN" },
525 directionMap
.insert(d
->name
, d
->dir
);
531 QHash
<QByteArray
, QChar::Decomposition
> decompositionMap
;
533 static void initDecompositionMap()
536 QChar::Decomposition dec
;
538 } decompositions
[] = {
539 { QChar::Canonical
, "<canonical>" },
540 { QChar::Font
, "<font>" },
541 { QChar::NoBreak
, "<noBreak>" },
542 { QChar::Initial
, "<initial>" },
543 { QChar::Medial
, "<medial>" },
544 { QChar::Final
, "<final>" },
545 { QChar::Isolated
, "<isolated>" },
546 { QChar::Circle
, "<circle>" },
547 { QChar::Super
, "<super>" },
548 { QChar::Sub
, "<sub>" },
549 { QChar::Vertical
, "<vertical>" },
550 { QChar::Wide
, "<wide>" },
551 { QChar::Narrow
, "<narrow>" },
552 { QChar::Small
, "<small>" },
553 { QChar::Square
, "<square>" },
554 { QChar::Compat
, "<compat>" },
555 { QChar::Fraction
, "<fraction>" },
556 { QChar::NoDecomposition
, 0 }
558 Dec
*d
= decompositions
;
560 decompositionMap
.insert(d
->name
, d
->dec
);
566 QHash
<int, UnicodeData
> unicodeData
;
567 QList
<PropertyFlags
> uniqueProperties
;
570 QHash
<int, int> decompositionLength
;
571 int highestComposedCharacter
= 0;
572 int numLigatures
= 0;
573 int highestLigature
= 0;
// One canonical composition: the pair (u1, u2) composes to `ligature`.
// readCompositionExclusion() fills these from two-element canonical
// decompositions (u1 = decomposition.at(0), u2 = decomposition.at(1)).
// NOTE(review): `ligature` is a ushort, but it is initialised there from a
// loop index that runs up to 0x110000 -- confirm that no composed character
// above 0xFFFF can reach this struct, otherwise the value is truncated.
575 struct Ligature
{ushort u1
; ushort u2
; ushort ligature
;};
576 // Ordering by the first component (u1) only, so ligatures can be sorted for fast lookup
577 bool operator < (const Ligature
&l1
, const Ligature
&l2
) {
578 return l1
.u1
< l2
.u1
;
581 QHash
<ushort
, QList
<Ligature
> > ligatureHashes
;
583 QHash
<int, int> combiningClassUsage
;
585 int maxLowerCaseDiff
= 0;
586 int maxUpperCaseDiff
= 0;
587 int maxTitleCaseDiff
= 0;
589 static void readUnicodeData()
591 QFile
f("data/UnicodeData.txt");
593 qFatal("Couldn't find UnicodeData.txt");
595 f
.open(QFile::ReadOnly
);
600 int len
= f
.readLine(line
.data(), 1024);
601 line
.truncate(len
-1);
603 int comment
= line
.indexOf('#');
605 line
= line
.left(comment
);
609 QList
<QByteArray
> properties
= line
.split(';');
611 int codepoint
= properties
[UD_Value
].toInt(&ok
, 16);
612 int lastCodepoint
= codepoint
;
614 QByteArray name
= properties
[UD_Name
];
615 if (name
.startsWith('<') && name
.contains("First")) {
617 nextLine
.resize(1024);
618 f
.readLine(nextLine
.data(), 1024);
619 QList
<QByteArray
> properties
= nextLine
.split(';');
620 lastCodepoint
= properties
[UD_Value
].toInt(&ok
, 16);
623 UnicodeData
data(codepoint
);
624 data
.p
.category
= categoryMap
.value(properties
[UD_Category
], QChar::NoCategory
);
625 data
.p
.combiningClass
= properties
[UD_CombiningClass
].toInt();
627 if (!combiningClassUsage
.contains(data
.p
.combiningClass
))
628 combiningClassUsage
[data
.p
.combiningClass
] = 1;
630 ++combiningClassUsage
[data
.p
.combiningClass
];
632 data
.p
.direction
= directionMap
.value(properties
[UD_BidiCategory
], data
.p
.direction
);
634 if (!properties
[UD_UpperCase
].isEmpty()) {
635 int upperCase
= properties
[UD_UpperCase
].toInt(&ok
, 16);
637 data
.p
.upperCaseDiff
= upperCase
- codepoint
;
638 maxUpperCaseDiff
= qMax(maxUpperCaseDiff
, qAbs(data
.p
.upperCaseDiff
));
639 if (codepoint
> 0xffff) {
640 // if the condition below doesn't hold anymore we need to modify our case folding code
641 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
642 Q_ASSERT(QChar::highSurrogate(codepoint
) == QChar::highSurrogate(upperCase
));
645 if (!properties
[UD_LowerCase
].isEmpty()) {
646 int lowerCase
= properties
[UD_LowerCase
].toInt(&ok
, 16);
648 data
.p
.lowerCaseDiff
= lowerCase
- codepoint
;
649 maxLowerCaseDiff
= qMax(maxLowerCaseDiff
, qAbs(data
.p
.lowerCaseDiff
));
650 if (codepoint
> 0xffff) {
651 // if the condition below doesn't hold anymore we need to modify our case folding code
652 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
653 Q_ASSERT(QChar::highSurrogate(codepoint
) == QChar::highSurrogate(lowerCase
));
656 // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
657 if (properties
[UD_TitleCase
].isEmpty())
658 properties
[UD_TitleCase
] = properties
[UD_UpperCase
];
659 if (!properties
[UD_TitleCase
].isEmpty()) {
660 int titleCase
= properties
[UD_TitleCase
].toInt(&ok
, 16);
662 data
.p
.titleCaseDiff
= titleCase
- codepoint
;
663 maxTitleCaseDiff
= qMax(maxTitleCaseDiff
, qAbs(data
.p
.titleCaseDiff
));
664 if (codepoint
> 0xffff) {
665 // if the condition below doesn't hold anymore we need to modify our case folding code
666 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
667 Q_ASSERT(QChar::highSurrogate(codepoint
) == QChar::highSurrogate(titleCase
));
671 if (!properties
[UD_DigitValue
].isEmpty())
672 data
.p
.digitValue
= properties
[UD_DigitValue
].toInt();
675 QByteArray decomposition
= properties
[UD_Decomposition
];
676 if (!decomposition
.isEmpty()) {
677 highestComposedCharacter
= qMax(highestComposedCharacter
, codepoint
);
678 QList
<QByteArray
> d
= decomposition
.split(' ');
679 if (d
[0].contains('<')) {
680 data
.decompositionType
= decompositionMap
.value(d
[0], QChar::Canonical
);
683 data
.decompositionType
= QChar::Canonical
;
685 for (int i
= 0; i
< d
.size(); ++i
)
686 data
.decomposition
.append(d
[i
].toInt(&ok
, 16));
687 if (!decompositionLength
.contains(data
.decomposition
.size()))
688 decompositionLength
[data
.decomposition
.size()] = 1;
690 ++decompositionLength
[data
.decomposition
.size()];
693 for (int i
= codepoint
; i
<= lastCodepoint
; ++i
)
694 unicodeData
.insert(i
, data
);
699 static int maxMirroredDiff
= 0;
// Reads data/BidiMirroring.txt and records, for every listed code point, its
// mirrored character (UnicodeData::mirroredChar) and the signed distance to it
// (p.mirrorDiff). Also keeps maxMirroredDiff at the largest absolute distance
// seen. Takes no arguments and returns nothing; it updates the global
// unicodeData hash.
701 static void readBidiMirroring()
703 QFile
f("data/BidiMirroring.txt");
705 qFatal("Couldn't find BidiMirroring.txt");
// NOTE(review): the result of open() is not checked.
707 f
.open(QFile::ReadOnly
);
// Lines are read into a buffer of at most 1024 bytes.
712 int len
= f
.readLine(line
.data(), 1024);
// Drop everything from the first '#' (comment) onwards.
715 int comment
= line
.indexOf('#');
717 line
= line
.left(comment
);
721 line
= line
.replace(" ", "");
// Each remaining line is expected to be "<code point>;<mirrored code point>",
// both in hexadecimal.
723 QList
<QByteArray
> pair
= line
.split(';');
724 Q_ASSERT(pair
.size() == 2);
727 int codepoint
= pair
[0].toInt(&ok
, 16);
728 int mirror
= pair
[1].toInt(&ok
, 16);
// Fetch the existing record (or a default one), update it, and store it back.
730 UnicodeData d
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
731 d
.mirroredChar
= mirror
;
732 if (qAbs(codepoint
-d
.mirroredChar
) > maxMirroredDiff
)
733 maxMirroredDiff
= qAbs(codepoint
- d
.mirroredChar
);
735 d
.p
.mirrorDiff
= d
.mirroredChar
- codepoint
;
736 unicodeData
.insert(codepoint
, d
);
740 static void readArabicShaping()
742 QFile
f("data/ArabicShaping.txt");
744 qFatal("Couldn't find ArabicShaping.txt");
746 f
.open(QFile::ReadOnly
);
751 int len
= f
.readLine(line
.data(), 1024);
754 int comment
= line
.indexOf('#');
756 line
= line
.left(comment
);
757 line
= line
.trimmed();
762 QList
<QByteArray
> shaping
= line
.split(';');
763 Q_ASSERT(shaping
.size() == 4);
766 int codepoint
= shaping
[0].toInt(&ok
, 16);
767 QChar::Joining j
= QChar::OtherJoining
;
768 QByteArray shape
= shaping
[2].trimmed();
771 else if (shape
== "D")
773 else if (shape
== "C")
776 UnicodeData d
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
778 unicodeData
.insert(codepoint
, d
);
782 static void readDerivedAge()
784 QFile
f("data/DerivedAge.txt");
786 qFatal("Couldn't find DerivedAge.txt");
788 f
.open(QFile::ReadOnly
);
793 int len
= f
.readLine(line
.data(), 1024);
796 int comment
= line
.indexOf('#');
798 line
= line
.left(comment
);
799 line
.replace(" ", "");
804 QList
<QByteArray
> l
= line
.split(';');
805 Q_ASSERT(l
.size() == 2);
807 QByteArray codes
= l
[0];
808 codes
.replace("..", ".");
809 QList
<QByteArray
> cl
= codes
.split('.');
812 int from
= cl
[0].toInt(&ok
, 16);
815 to
= cl
[1].toInt(&ok
, 16);
817 QChar::UnicodeVersion age
= QChar::Unicode_Unassigned
;
818 QByteArray ba
= l
[1];
819 AgeMap
*map
= ageMap
;
821 if (ba
== map
->age
) {
827 //qDebug() << hex << from << ".." << to << ba << age;
828 Q_ASSERT(age
!= QChar::Unicode_Unassigned
);
830 for (int codepoint
= from
; codepoint
<= to
; ++codepoint
) {
831 UnicodeData d
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
833 unicodeData
.insert(codepoint
, d
);
839 static void readCompositionExclusion()
841 QFile
f("data/CompositionExclusions.txt");
843 qFatal("Couldn't find CompositionExclusions.txt");
845 f
.open(QFile::ReadOnly
);
850 int len
= f
.readLine(line
.data(), 1024);
853 int comment
= line
.indexOf('#');
855 line
= line
.left(comment
);
856 line
.replace(" ", "");
861 Q_ASSERT(!line
.contains(".."));
864 int codepoint
= line
.toInt(&ok
, 16);
866 UnicodeData d
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
867 d
.excludedComposition
= true;
868 unicodeData
.insert(codepoint
, d
);
871 for (int i
= 0; i
< 0x110000; ++i
) {
872 UnicodeData data
= unicodeData
.value(i
, UnicodeData(i
));
873 if (!data
.excludedComposition
874 && data
.decompositionType
== QChar::Canonical
875 && data
.decomposition
.size() > 1) {
876 Q_ASSERT(data
.decomposition
.size() == 2);
878 uint part1
= data
.decomposition
.at(0);
879 uint part2
= data
.decomposition
.at(1);
880 UnicodeData first
= unicodeData
.value(part1
, UnicodeData(part1
));
881 if (first
.p
.combiningClass
!= 0)
885 highestLigature
= qMax(highestLigature
, (int)part1
);
886 Ligature l
= {(ushort
)part1
, (ushort
)part2
, i
};
887 ligatureHashes
[part2
].append(l
);
892 struct NormalizationCorrection
{
898 static QByteArray
createNormalizationCorrections()
900 QFile
f("data/NormalizationCorrections.txt");
902 qFatal("Couldn't find NormalizationCorrections.txt");
904 f
.open(QFile::ReadOnly
);
908 out
+= "struct NormalizationCorrection {\n"
910 " uint old_mapping;\n"
914 "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
916 int numCorrections
= 0;
920 int len
= f
.readLine(line
.data(), 1024);
923 int comment
= line
.indexOf('#');
925 line
= line
.left(comment
);
926 line
.replace(" ", "");
931 Q_ASSERT(!line
.contains(".."));
933 QList
<QByteArray
> fields
= line
.split(';');
934 Q_ASSERT(fields
.size() == 4);
936 NormalizationCorrection c
;
938 c
.codepoint
= fields
.at(0).toInt(&ok
, 16);
939 c
.mapped
= fields
.at(1).toInt(&ok
, 16);
940 if (fields
.at(3) == "3.2.0")
941 c
.version
= QChar::Unicode_3_2
;
942 else if (fields
.at(3) == "4.0.0")
943 c
.version
= QChar::Unicode_4_0
;
945 qFatal("unknown unicode version in NormalizationCorrection.txt");
947 out
+= " { 0x" + QByteArray::number(c
.codepoint
, 16) + ", 0x" + QByteArray::number(c
.mapped
, 16)
948 + ", " + QString::number(c
.version
) + " },\n";
954 "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections
) + " };\n\n";
// Deduplicates the property sets of all code points in [0, 0x110000): each
// distinct PropertyFlags value is stored once in uniqueProperties, and every
// UnicodeData record gets the position of its property set in that list as
// propertyIndex. Code points without an entry in unicodeData use a
// default-constructed UnicodeData(uc).
961 static void computeUniqueProperties()
963 qDebug("computeUniqueProperties:");
964 for (int uc
= 0; uc
< 0x110000; ++uc
) {
965 UnicodeData d
= unicodeData
.value(uc
, UnicodeData(uc
));
// NOTE(review): QList::indexOf is a linear search and runs once per code
// point; acceptable for an offline generator, but worth knowing.
967 int index
= uniqueProperties
.indexOf(d
.p
);
// Presumably only reached when indexOf() found nothing (the guarding
// condition is not visible in this excerpt): the new set is appended and its
// position becomes the index.
969 index
= uniqueProperties
.size();
970 uniqueProperties
.append(d
.p
);
972 d
.propertyIndex
= index
;
973 unicodeData
.insert(uc
, d
);
975 qDebug(" %d unicode properties found", uniqueProperties
.size());
979 static void readLineBreak()
981 QFile
f("data/LineBreak.txt");
983 qFatal("Couldn't find LineBreak.txt");
985 f
.open(QFile::ReadOnly
);
990 int len
= f
.readLine(line
.data(), 1024);
993 int comment
= line
.indexOf('#');
995 line
= line
.left(comment
);
996 line
.replace(" ", "");
1001 QList
<QByteArray
> l
= line
.split(';');
1002 Q_ASSERT(l
.size() == 2);
1004 QByteArray codes
= l
[0];
1005 codes
.replace("..", ".");
1006 QList
<QByteArray
> cl
= codes
.split('.');
1009 int from
= cl
[0].toInt(&ok
, 16);
1012 to
= cl
[1].toInt(&ok
, 16);
1014 // ### Classes XX, AI, NL and CB are left out and mapped to AL for now
1015 QUnicodeTables::LineBreakClass lb
= QUnicodeTables::LineBreak_AL
;
1016 QByteArray ba
= l
[1];
1018 if (ba
== "AI") lb
= QUnicodeTables::LineBreak_AL
;
1019 else if (ba
== "XX") lb
= QUnicodeTables::LineBreak_AL
;
1020 else if (ba
== "NL") lb
= QUnicodeTables::LineBreak_AL
;
1021 else if (ba
== "OP") lb
= QUnicodeTables::LineBreak_OP
;
1022 else if (ba
== "CL") lb
= QUnicodeTables::LineBreak_CL
;
1023 else if (ba
== "QU") lb
= QUnicodeTables::LineBreak_QU
;
1024 else if (ba
== "GL") lb
= QUnicodeTables::LineBreak_GL
;
1025 else if (ba
== "NS") lb
= QUnicodeTables::LineBreak_NS
;
1026 else if (ba
== "EX") lb
= QUnicodeTables::LineBreak_EX
;
1027 else if (ba
== "SY") lb
= QUnicodeTables::LineBreak_SY
;
1028 else if (ba
== "IS") lb
= QUnicodeTables::LineBreak_IS
;
1029 else if (ba
== "PR") lb
= QUnicodeTables::LineBreak_PR
;
1030 else if (ba
== "PO") lb
= QUnicodeTables::LineBreak_PO
;
1031 else if (ba
== "NU") lb
= QUnicodeTables::LineBreak_NU
;
1032 else if (ba
== "AL") lb
= QUnicodeTables::LineBreak_AL
;
1033 else if (ba
== "ID") lb
= QUnicodeTables::LineBreak_ID
;
1034 else if (ba
== "IN") lb
= QUnicodeTables::LineBreak_IN
;
1035 else if (ba
== "HY") lb
= QUnicodeTables::LineBreak_HY
;
1036 else if (ba
== "BA") lb
= QUnicodeTables::LineBreak_BA
;
1037 else if (ba
== "BB") lb
= QUnicodeTables::LineBreak_BB
;
1038 else if (ba
== "B2") lb
= QUnicodeTables::LineBreak_B2
;
1039 else if (ba
== "ZW") lb
= QUnicodeTables::LineBreak_ZW
;
1040 else if (ba
== "CM") lb
= QUnicodeTables::LineBreak_CM
;
1041 else if (ba
== "SA") lb
= QUnicodeTables::LineBreak_SA
;
1042 else if (ba
== "BK") lb
= QUnicodeTables::LineBreak_BK
;
1043 else if (ba
== "CR") lb
= QUnicodeTables::LineBreak_CR
;
1044 else if (ba
== "LF") lb
= QUnicodeTables::LineBreak_LF
;
1045 else if (ba
== "SG") lb
= QUnicodeTables::LineBreak_SG
;
1046 else if (ba
== "CB") lb
= QUnicodeTables::LineBreak_AL
;
1047 else if (ba
== "SP") lb
= QUnicodeTables::LineBreak_SP
;
1048 else if (ba
== "WJ") lb
= QUnicodeTables::LineBreak_WJ
;
1049 else if (ba
== "H2") lb
= QUnicodeTables::LineBreak_H2
;
1050 else if (ba
== "H3") lb
= QUnicodeTables::LineBreak_H3
;
1051 else if (ba
== "JL") lb
= QUnicodeTables::LineBreak_JL
;
1052 else if (ba
== "JV") lb
= QUnicodeTables::LineBreak_JV
;
1053 else if (ba
== "JT") lb
= QUnicodeTables::LineBreak_JT
;
1055 qDebug() << "unhandled line break class:" << ba
;
1058 for (int codepoint
= from
; codepoint
<= to
; ++codepoint
) {
1059 UnicodeData d
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
1060 d
.p
.line_break_class
= lb
;
1061 unicodeData
.insert(codepoint
, d
);
1067 static void readSpecialCasing()
1069 // qDebug() << "Reading SpecialCasing.txt";
1070 QFile
f("data/SpecialCasing.txt");
1072 qFatal("Couldn't find SpecialCasing.txt");
1074 f
.open(QFile::ReadOnly
);
1076 while (!f
.atEnd()) {
1079 int len
= f
.readLine(line
.data(), 1024);
1082 int comment
= line
.indexOf('#');
1084 line
= line
.left(comment
);
1089 QList
<QByteArray
> l
= line
.split(';');
1091 QByteArray condition
= l
.size() < 5 ? QByteArray() : l
[4].trimmed();
1092 if (!condition
.isEmpty())
1097 int codepoint
= l
[0].trimmed().toInt(&ok
, 16);
1099 Q_ASSERT(codepoint
<= 0xffff);
1101 // qDebug() << "codepoint" << hex << codepoint;
1102 // qDebug() << line;
1104 QList
<QByteArray
> lower
= l
[1].trimmed().split(' ');
1105 QList
<int> lowerMap
;
1106 for (int i
= 0; i
< lower
.size(); ++i
) {
1108 lowerMap
.append(lower
.at(i
).toInt(&ok
, 16));
1112 QList
<QByteArray
> title
= l
[2].trimmed().split(' ');
1113 QList
<int> titleMap
;
1114 for (int i
= 0; i
< title
.size(); ++i
) {
1116 titleMap
.append(title
.at(i
).toInt(&ok
, 16));
1118 qDebug() << line
<< title
.at(i
);
1122 QList
<QByteArray
> upper
= l
[3].trimmed().split(' ');
1123 QList
<int> upperMap
;
1124 for (int i
= 0; i
< upper
.size(); ++i
) {
1126 upperMap
.append(upper
.at(i
).toInt(&ok
, 16));
1131 UnicodeData ud
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
1133 Q_ASSERT(lowerMap
.size() > 1 || lowerMap
.at(0) == codepoint
+ ud
.p
.lowerCaseDiff
);
1134 Q_ASSERT(titleMap
.size() > 1 || titleMap
.at(0) == codepoint
+ ud
.p
.titleCaseDiff
);
1135 Q_ASSERT(upperMap
.size() > 1 || upperMap
.at(0) == codepoint
+ ud
.p
.upperCaseDiff
);
1137 if (lowerMap
.size() > 1) {
1138 ud
.p
.lowerCaseSpecial
= true;
1139 ud
.p
.lowerCaseDiff
= appendToSpecialCaseMap(lowerMap
);
1141 if (titleMap
.size() > 1) {
1142 ud
.p
.titleCaseSpecial
= true;
1143 ud
.p
.titleCaseDiff
= appendToSpecialCaseMap(titleMap
);
1145 if (upperMap
.size() > 1) {
1146 ud
.p
.upperCaseSpecial
= true;
1147 ud
.p
.upperCaseDiff
= appendToSpecialCaseMap(upperMap
);;
1150 unicodeData
.insert(codepoint
, ud
);
1154 int maxCaseFoldDiff
= 0;
1156 static void readCaseFolding()
1158 qDebug() << "Reading CaseFolding.txt";
1159 QFile
f("data/CaseFolding.txt");
1161 qFatal("Couldn't find CaseFolding.txt");
1163 f
.open(QFile::ReadOnly
);
1165 while (!f
.atEnd()) {
1168 int len
= f
.readLine(line
.data(), 1024);
1171 int comment
= line
.indexOf('#');
1173 line
= line
.left(comment
);
1178 QList
<QByteArray
> l
= line
.split(';');
1181 uint codepoint
= l
[0].trimmed().toInt(&ok
, 16);
1185 l
[1] = l
[1].trimmed();
1186 if (l
[1] == "F" || l
[1] == "T")
1189 // qDebug() << "codepoint" << hex << codepoint;
1190 // qDebug() << line;
1191 QList
<QByteArray
> fold
= l
[2].trimmed().split(' ');
1193 for (int i
= 0; i
< fold
.size(); ++i
) {
1195 foldMap
.append(fold
.at(i
).toInt(&ok
, 16));
1199 UnicodeData ud
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
1200 if (foldMap
.size() == 1) {
1201 ud
.p
.caseFoldDiff
= foldMap
.at(0) - codepoint
;
1202 maxCaseFoldDiff
= qMax(maxCaseFoldDiff
, ud
.p
.caseFoldDiff
);
1203 if (codepoint
> 0xffff) {
1204 // if the condition below doesn't hold anymore we need to modify our case folding code
1205 //qDebug() << codepoint << QChar::highSurrogate(codepoint) << QChar::highSurrogate(foldMap.at(0));
1206 Q_ASSERT(QChar::highSurrogate(codepoint
) == QChar::highSurrogate(foldMap
.at(0)));
1208 if (foldMap
.at(0) != codepoint
+ ud
.p
.lowerCaseDiff
)
1209 qDebug() << hex
<< codepoint
;
1211 Q_ASSERT(false); // we currently don't support full case foldings
1212 // qDebug() << "special" << hex << foldMap;
1213 ud
.p
.caseFoldSpecial
= true;
1214 ud
.p
.caseFoldDiff
= appendToSpecialCaseMap(foldMap
);
1216 unicodeData
.insert(codepoint
, ud
);
// Reads data/GraphemeBreakProperty.txt and stores the grapheme break class of
// every listed code point (or code point range) in unicodeData[...].p.graphemeBreak.
// Takes no arguments and returns nothing; it updates the global unicodeData hash.
1220 static void readGraphemeBreak()
1222 qDebug() << "Reading GraphemeBreakProperty.txt";
1223 QFile
f("data/GraphemeBreakProperty.txt");
1225 qFatal("Couldn't find GraphemeBreakProperty.txt");
// NOTE(review): the result of open() is not checked.
1227 f
.open(QFile::ReadOnly
);
1229 while (!f
.atEnd()) {
// Lines are read into a buffer of at most 1024 bytes.
1232 int len
= f
.readLine(line
.data(), 1024);
// Drop everything from the first '#' (comment) onwards.
1235 int comment
= line
.indexOf('#');
1237 line
= line
.left(comment
);
// Fields are separated by ';': field 0 is a code point or range, field 1 the
// property name.
1242 QList
<QByteArray
> l
= line
.split(';');
// A range "AAAA..BBBB" is collapsed to "AAAA.BBBB" so that a single split on
// '.' yields one element (single code point) or two (range).
1244 QByteArray codes
= l
[0].trimmed();
1245 codes
.replace("..", ".");
1246 QList
<QByteArray
> cl
= codes
.split('.');
1249 int from
= cl
[0].toInt(&ok
, 16);
1252 if (cl
.size() == 2) {
1253 to
= cl
[1].toInt(&ok
, 16);
// Property names missing from grapheme_break_map fall back to
// GraphemeBreakOther.
1257 GraphemeBreak brk
= grapheme_break_map
.value(l
[1].trimmed(), GraphemeBreakOther
);
// Apply the class to every code point in the inclusive range [from, to].
1259 for (int codepoint
= from
; codepoint
<= to
; ++codepoint
) {
1260 UnicodeData ud
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
1261 ud
.p
.graphemeBreak
= brk
;
1262 unicodeData
.insert(codepoint
, ud
);
1267 static void readWordBreak()
1269 qDebug() << "Reading WordBreakProperty.txt";
1270 QFile
f("data/WordBreakProperty.txt");
1272 qFatal("Couldn't find WordBreakProperty.txt");
1274 f
.open(QFile::ReadOnly
);
1276 while (!f
.atEnd()) {
1279 int len
= f
.readLine(line
.data(), 1024);
1282 int comment
= line
.indexOf('#');
1284 line
= line
.left(comment
);
1289 QList
<QByteArray
> l
= line
.split(';');
1291 QByteArray codes
= l
[0].trimmed();
1292 codes
.replace("..", ".");
1293 QList
<QByteArray
> cl
= codes
.split('.');
1296 int from
= cl
[0].toInt(&ok
, 16);
1299 if (cl
.size() == 2) {
1300 to
= cl
[1].toInt(&ok
, 16);
1304 WordBreak brk
= word_break_map
.value(l
[1].trimmed(), WordBreakOther
);
1305 Q_ASSERT(brk
!= WordBreakOther
);
1307 for (int codepoint
= from
; codepoint
<= to
; ++codepoint
) {
1308 UnicodeData ud
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
1309 ud
.p
.wordBreak
= brk
;
1310 unicodeData
.insert(codepoint
, ud
);
1315 static void readSentenceBreak()
1317 qDebug() << "Reading SentenceBreakProperty.txt";
1318 QFile
f("data/SentenceBreakProperty.txt");
1320 qFatal("Couldn't find SentenceBreakProperty.txt");
1322 f
.open(QFile::ReadOnly
);
1324 while (!f
.atEnd()) {
1327 int len
= f
.readLine(line
.data(), 1024);
1330 int comment
= line
.indexOf('#');
1332 line
= line
.left(comment
);
1337 QList
<QByteArray
> l
= line
.split(';');
1339 QByteArray codes
= l
[0].trimmed();
1340 codes
.replace("..", ".");
1341 QList
<QByteArray
> cl
= codes
.split('.');
1344 int from
= cl
[0].toInt(&ok
, 16);
1347 if (cl
.size() == 2) {
1348 to
= cl
[1].toInt(&ok
, 16);
1352 SentenceBreak brk
= sentence_break_map
.value(l
[1].trimmed(), SentenceBreakOther
);
1353 Q_ASSERT(brk
!= SentenceBreakOther
);
1355 for (int codepoint
= from
; codepoint
<= to
; ++codepoint
) {
1356 UnicodeData ud
= unicodeData
.value(codepoint
, UnicodeData(codepoint
));
1357 ud
.p
.sentenceBreak
= brk
;
1358 unicodeData
.insert(codepoint
, ud
);
1364 // this piece of code does full case folding and comparison. We currently
1365 // don't use it, since this gives lots of issues with things as case insensitive
1366 // search and replace.
// foldCase: looks up the properties of `ch` with qGetProp() and writes the
// case-folded form of `ch` through `out`.
1367 static inline void foldCase(uint ch
, ushort
*out
)
1369 const QUnicodeTables::Properties
*p
= qGetProp(ch
);
// Simple folding: the folded character is `ch` plus the per-property offset.
1370 if (!p
->caseFoldSpecial
) {
1371 *(out
++) = ch
+ p
->caseFoldDiff
;
// Special folding: caseFoldDiff is used as an offset into specialCaseMap
// rather than as a difference to add to the character.
1373 const ushort
*folded
= specialCaseMap
+ p
->caseFoldDiff
;
// Case-insensitive comparison of the UTF-16 ranges [a, ae) and [b, be),
// built on the caseFoldSpecial / caseFoldDiff properties of each code unit.
1380 static int ucstricmp(const ushort
*a
, const ushort
*ae
, const ushort
*b
, const ushort
*be
)
// Fast path: advance through both ranges while neither is exhausted.
1389 while (a
!= ae
&& b
!= be
) {
1390 const QUnicodeTables::Properties
*pa
= qGetProp(*a
);
1391 const QUnicodeTables::Properties
*pb
= qGetProp(*b
);
// Bitwise | on the two flags: true as soon as either character has a
// special folding (presumably leaving the fast path — TODO confirm).
1392 if (pa
->caseFoldSpecial
| pb
->caseFoldSpecial
)
// Difference of the two simply-folded code units.
1394 int diff
= (int)(*a
+ pa
->caseFoldDiff
) - (int)(*b
+ pb
->caseFoldDiff
);
// Scratch buffers receiving the folding of one character from each side;
// both start out holding a single 0.
1408 ushort abuf
[SPECIAL_CASE_MAX_LEN
+ 1];
1409 ushort bbuf
[SPECIAL_CASE_MAX_LEN
+ 1];
1410 abuf
[0] = bbuf
[0] = 0;
1416 if (!*bp
&& b
== be
)
// Fold the next character of each side into its scratch buffer.
1420 foldCase(*(a
++), abuf
);
1426 foldCase(*(b
++), bbuf
);
1430 return (int)*ap
- (int)*bp
;
// Overload comparing the UTF-16 range [a, ae) case-insensitively against the
// 0-terminated 8-bit string `b`; the loop below stops at b's terminating 0.
1437 static int ucstricmp(const ushort
*a
, const ushort
*ae
, const uchar
*b
)
1444 while (a
!= ae
&& *b
) {
1445 const QUnicodeTables::Properties
*pa
= qGetProp(*a
);
// Each byte of b is widened to ushort before the property lookup.
1446 const QUnicodeTables::Properties
*pb
= qGetProp((ushort
)*b
);
// Bitwise | on the two flags: true as soon as either character has a
// special folding (presumably leaving the fast path — TODO confirm).
1447 if (pa
->caseFoldSpecial
| pb
->caseFoldSpecial
)
// Difference of the two simply-folded code units.
1449 int diff
= (int)(*a
+ pa
->caseFoldDiff
) - (int)(*b
+ pb
->caseFoldDiff
);
// Scratch buffers receiving the folding of one character from each side;
// both start out holding a single 0.
1463 ushort abuf
[SPECIAL_CASE_MAX_LEN
+ 1];
1464 ushort bbuf
[SPECIAL_CASE_MAX_LEN
+ 1];
1465 abuf
[0] = bbuf
[0] = 0;
1475 foldCase(*(a
++), abuf
);
1481 foldCase(*(b
++), bbuf
);
1485 return (int)*ap
- (int)*bp
;
// Names of the Unicode blocks in the order they were first seen by
// readBlocks(); a name's position in this list is its block index.
1493 static QList
<QByteArray
> blockNames
;
// One entry per range read by readBlocks(), each initialised from
// { blockIndex, first, last }.
1500 static QList
<BlockInfo
> blockInfoList
;
1502 static void readBlocks()
1504 QFile
f("data/Blocks.txt");
1506 qFatal("Couldn't find Blocks.txt");
1508 f
.open(QFile::ReadOnly
);
1510 while (!f
.atEnd()) {
1511 QByteArray line
= f
.readLine();
1512 line
.resize(line
.size() - 1);
1514 int comment
= line
.indexOf("#");
1516 line
= line
.left(comment
);
1518 line
.replace(" ", "");
1523 int semicolon
= line
.indexOf(';');
1524 Q_ASSERT(semicolon
>= 0);
1525 QByteArray codePoints
= line
.left(semicolon
);
1526 QByteArray blockName
= line
.mid(semicolon
+ 1);
1528 int blockIndex
= blockNames
.indexOf(blockName
);
1529 if (blockIndex
< 0) {
1530 blockNames
.append(blockName
);
1531 blockIndex
= blockNames
.indexOf(blockName
);
1532 Q_ASSERT(blockIndex
>= 0);
1535 int dotdot
= codePoints
.indexOf("..");
1536 Q_ASSERT(dotdot
>= 0);
1538 int first
= codePoints
.left(dotdot
).toInt(&unused
, 16);
1539 int last
= codePoints
.mid(dotdot
+ 2).toInt(&unused
, 16);
1541 BlockInfo blockInfo
= { blockIndex
, first
, last
};
1542 blockInfoList
.append(blockInfo
);
// Script names in order of first appearance; readScripts() registers "Common"
// first, so it has index 0.
1547 static QList
<QByteArray
> scriptNames
;
// Code point -> index into scriptNames, filled by readScripts().
1548 static QHash
<int, int> scriptAssignment
;
// Keyed by script index; createScriptEnumDeclaration() stores 0 for scripts
// that are treated as an alias for 'Common'.
1549 static QHash
<int, int> scriptHash
;
// Per-code-point script assignments of one block (accessed as
// ExtraBlock::vector by createScriptTableDeclaration()).
1553 QVector
<int> vector
;
// Blocks whose code points do not all share one script; filled by
// createScriptTableDeclaration().
1556 static QList
<ExtraBlock
> extraBlockList
;
// Fills scriptNames and scriptAssignment from the script data files listed in
// `files`, processing the files in array order. "Common" is registered before
// any file is read.
1559 static void readScripts()
1561 scriptNames
.append("Common");
1563 static const char *files
[] = {
1564 "data/ScriptsInitial.txt",
1566 "data/ScriptsCorrections.txt"
1568 enum { fileCount
= sizeof(files
) / sizeof(const char *) };
1570 for (int i
= 0; i
< fileCount
; ++i
) {
1573 qFatal("Couldn't find %s", files
[i
]);
1576 f
.open(QFile::ReadOnly
);
1578 while (!f
.atEnd()) {
1579 QByteArray line
= f
.readLine();
// Drops the last character of the line that was read (normally the '\n').
1580 line
.resize(line
.size() - 1);
1582 int comment
= line
.indexOf("#");
1584 line
= line
.left(comment
);
// Spaces and underscores are removed from the whole line, so script names are
// stored without them.
1586 line
.replace(" ", "");
1587 line
.replace("_", "");
// The part before ';' holds the code point(s), the part after it the script name.
1592 int semicolon
= line
.indexOf(';');
1593 Q_ASSERT(semicolon
>= 0);
1594 QByteArray codePoints
= line
.left(semicolon
);
1595 QByteArray scriptName
= line
.mid(semicolon
+ 1);
// Register the script on first sight; its position in scriptNames is its index.
1597 int scriptIndex
= scriptNames
.indexOf(scriptName
);
1598 if (scriptIndex
< 0) {
1599 scriptNames
.append(scriptName
);
1600 scriptIndex
= scriptNames
.indexOf(scriptName
);
1601 Q_ASSERT(scriptIndex
>= 0);
// ".." separates the two ends of a range; a field without it is parsed as a
// single hexadecimal code point.
1604 int dotdot
= codePoints
.indexOf("..");
1606 int first
= -1, last
= -1;
1608 first
= codePoints
.left(dotdot
).toInt(&unused
, 16);
1609 last
= codePoints
.mid(dotdot
+ 2).toInt(&unused
, 16);
1611 first
= codePoints
.toInt(&unused
, 16);
// NOTE(review): this `i` appears to shadow the file index `i` of the outer
// loop — confirm the nesting before renaming either one.
1615 for (int i
= first
; i
<= last
; ++i
)
1616 scriptAssignment
[i
] = scriptIndex
;
1618 scriptAssignment
[first
] = scriptIndex
;
// Set by createScriptEnumDeclaration(); createScriptTableDeclaration() asserts
// that it is > 0 and adds it to the index of each extra block.
1625 static int scriptSentinel
= 0;
// Builds the text of the `enum Script { ... }` declaration plus the
// `ScriptSentinel` enum, and sets the global scriptSentinel as a side effect.
// Scripts listed in specialScripts are emitted first; the remaining ones are
// emitted as `<Name> = Common` aliases.
1627 QByteArray
createScriptEnumDeclaration()
1629 static const char *specialScripts
[] = {
1659 const int specialScriptsCount
= sizeof(specialScripts
) / sizeof(const char *);
1661 // generate script enum
1662 QByteArray declaration
;
1664 declaration
+= " // See http://www.unicode.org/reports/tr24/tr24-5.html\n\n";
1665 declaration
+= " enum Script {\n Common";
1667 int uniqueScripts
= 1; // Common
1669 // output the ones with special processing first
// Index 0 ("Common") has already been written above, hence the start at 1.
1670 for (int i
= 1; i
< scriptNames
.size(); ++i
) {
1671 QByteArray scriptName
= scriptNames
.at(i
);
1672 // does the script require special processing?
1673 bool special
= false;
1674 for (int s
= 0; !special
&& s
< specialScriptsCount
; ++s
) {
1675 if (scriptName
== specialScripts
[s
])
1679 scriptHash
[i
] = 0; // alias for 'Common'
1686 declaration
+= ",\n ";
1687 declaration
+= scriptName
;
1689 declaration
+= ",\n ScriptCount = Inherited";
1691 // output the ones that are an alias for 'Common'
1692 for (int i
= 1; i
< scriptNames
.size(); ++i
) {
1693 if (scriptHash
.value(i
) != 0)
1695 QByteArray scriptName
= scriptNames
.at(i
);
1696 scriptName
+= " = Common";
1697 declaration
+= ",\n ";
1698 declaration
+= scriptName
;
1701 declaration
+= "\n };\n";
// NOTE(review): (n + 16) / 32 * 32 rounds to the nearest multiple of 32, not
// up; for uniqueScripts < 16 the result is 0, which would trip the
// Q_ASSERT(scriptSentinel > 0) in createScriptTableDeclaration() — confirm
// that this is intended.
1703 scriptSentinel
= ((uniqueScripts
+ 16) / 32) * 32; // a multiple of 32
1704 declaration
+= " enum { ScriptSentinel = ";
1705 declaration
+= QByteArray::number(scriptSentinel
);
1706 declaration
+= " };\n\n";
// Builds the text of the `uc_scripts[]` table (wrapped in namespace
// QUnicodeTables) together with the UnicodeBlockCount / UnicodeBlockSize
// enums. Must run after createScriptEnumDeclaration(), which sets
// scriptSentinel. Blocks that need per-code-point data are collected in
// extraBlockList and written after the per-block entries.
1710 QByteArray
createScriptTableDeclaration()
1712 Q_ASSERT(scriptSentinel
> 0);
1714 QByteArray declaration
;
// 512 blocks of 128 code points cover U+0000..U+FFFF.
1716 const int unicodeBlockCount
= 512; // number of unicode blocks
1717 const int unicodeBlockSize
= 128; // size of each block
1718 declaration
= "enum { UnicodeBlockCount = ";
1719 declaration
+= QByteArray::number(unicodeBlockCount
);
1720 declaration
+= " }; // number of unicode blocks\n";
1721 declaration
+= "enum { UnicodeBlockSize = ";
1722 declaration
+= QByteArray::number(unicodeBlockSize
);
1723 declaration
+= " }; // size of each block\n\n";
1726 declaration
+= "namespace QUnicodeTables {\n\nstatic const unsigned char uc_scripts[] = {\n";
1727 for (int i
= 0; i
< unicodeBlockCount
; ++i
) {
// First code point of block i; for 0 <= i < 512 this equals i << 7, i.e. the
// same base that codePoint below is built from.
1728 int block
= (((i
<< 7) & 0xff00) | ((i
& 1) * 0x80));
1729 int blockAssignment
[unicodeBlockSize
];
// Script index of every code point in the block; unassigned ones get 0.
1730 for (int x
= 0; x
< unicodeBlockSize
; ++x
) {
1731 int codePoint
= (i
<< 7) | x
;
1732 blockAssignment
[x
] = scriptAssignment
.value(codePoint
, 0);
// Scripts are compared through scriptHash, so differences between scripts
// that map to the same scriptHash value do not count.
1734 bool allTheSame
= true;
1735 const int originalScript
= blockAssignment
[0];
1736 const int script
= scriptHash
.value(originalScript
);
1737 for (int x
= 1; allTheSame
&& x
< unicodeBlockSize
; ++x
) {
1738 const int s
= scriptHash
.value(blockAssignment
[x
]);
// Entry written as a plain script name, annotated with the block's range.
1745 declaration
+= scriptNames
.value(originalScript
);
1746 declaration
+= ", /* U+";
1747 declaration
+= QByteArray::number(block
, 16).rightJustified(4, '0');
1750 QByteArray::number(block
+ unicodeBlockSize
- 1, 16).rightJustified(4, '0');
1751 declaration
+= " */\n";
// Entry written as a number: scriptSentinel plus the position the block will
// get in extraBlockList. The offset places extra block k at
// unicodeBlockCount + k * unicodeBlockSize.
1753 const int value
= extraBlockList
.size() + scriptSentinel
;
1755 ((value
- scriptSentinel
) * unicodeBlockSize
) + unicodeBlockCount
;
1758 declaration
+= QByteArray::number(value
);
1759 declaration
+= ", /* U+";
1760 declaration
+= QByteArray::number(block
, 16).rightJustified(4, '0');
1763 QByteArray::number(block
+ unicodeBlockSize
- 1, 16).rightJustified(4, '0');
1764 declaration
+= " at offset ";
1765 declaration
+= QByteArray::number(offset
);
1766 declaration
+= " */\n";
// Keep the full per-code-point assignment so it can be written out below.
1768 ExtraBlock extraBlock
;
1769 extraBlock
.block
= block
;
1770 extraBlock
.vector
.resize(unicodeBlockSize
);
1771 for (int x
= 0; x
< unicodeBlockSize
; ++x
)
1772 extraBlock
.vector
[x
] = blockAssignment
[x
];
1774 extraBlockList
.append(extraBlock
);
// Second part of the table: the collected extra blocks, one script name per
// code point.
1778 for (int i
= 0; i
< extraBlockList
.size(); ++i
) {
1779 const int value
= i
+ scriptSentinel
;
1781 ((value
- scriptSentinel
) * unicodeBlockSize
) + unicodeBlockCount
;
1782 const ExtraBlock
&extraBlock
= extraBlockList
.at(i
);
1783 const int block
= extraBlock
.block
;
1785 declaration
+= "\n\n /* U+";
1786 declaration
+= QByteArray::number(block
, 16).rightJustified(4, '0');
1789 QByteArray::number(block
+ unicodeBlockSize
- 1, 16).rightJustified(4, '0');
1790 declaration
+= " at offset ";
1791 declaration
+= QByteArray::number(offset
);
1792 declaration
+= " */\n ";
1794 for (int x
= 0; x
< extraBlock
.vector
.size(); ++x
) {
1795 const int o
= extraBlock
.vector
.at(x
);
1797 declaration
+= scriptNames
.value(o
);
// True for every element except the very last one of the last extra block.
1798 if (x
< extraBlock
.vector
.size() - 1 || i
< extraBlockList
.size() - 1)
// After every 8th entry, except at the end of a block.
1800 if ((x
& 7) == 7 && x
< extraBlock
.vector
.size() - 1)
1801 declaration
+= "\n ";
1806 declaration
+= "\n};\n\n} // namespace QUnicodeTables\n\n";
// One byte per entry: the element type emitted above is unsigned char.
1808 qDebug("createScriptTableDeclaration: table size is %d bytes",
1809 unicodeBlockCount
+ (extraBlockList
.size() * unicodeBlockSize
));
1815 static void dump(int from
, int to
)
1817 for (int i
= from
; i
<= to
; ++i
) {
1818 UnicodeData d
= unicodeData
.value(i
, UnicodeData(i
));
1819 qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
1820 i
, d
.p
.category
, d
.p
.combiningClass
, d
.p
.direction
, d
.otherCase
, d
.mirroredChar
, d
.p
.joining
, d
.p
.age
);
1821 if (d
.decompositionType
!= QChar::NoDecomposition
) {
1822 qDebug(" decomposition: type=%d, length=%d, first=%x", d
.decompositionType
, d
.decomposition
.size(),
1823 d
.decomposition
[0]);
1830 struct PropertyBlock
{
1831 PropertyBlock() { index
= -1; }
1833 QList
<int> properties
;
1834 bool operator ==(const PropertyBlock
&other
) { return properties
== other
.properties
; }
// Builds the text for the property lookup: the `uc_property_trie[]` array, the
// GET_PROP_INDEX / GET_PROP_INDEX_UCS2 macros, the `uc_properties[]` table, the
// qGetProp() / QUnicodeTables::properties() accessors, CURRENT_VERSION,
// `specialCaseMap[]` and SPECIAL_CASE_MAX_LEN. Size statistics are logged with
// qDebug() along the way.
1837 static QByteArray
createPropertyInfo()
1839 qDebug("createPropertyInfo:");
// Two block sizes are used: 32 code points per block below BMP_END and 256
// above it. Note that BMP_END is 0x11000 here, i.e. the small-block region
// extends past U+FFFF despite the constant's name.
1841 const int BMP_BLOCKSIZE
=32;
1842 const int BMP_SHIFT
= 5;
1843 const int BMP_END
= 0x11000;
1844 const int SMP_END
= 0x110000;
1845 const int SMP_BLOCKSIZE
= 256;
1846 const int SMP_SHIFT
= 8;
1848 QList
<PropertyBlock
> blocks
;
1849 QList
<int> blockMap
;
// First region: collect the property index of every code point of a block,
// look the block up among the ones seen so far, and record its index in
// blockMap.
1853 for (int block
= 0; block
< BMP_END
/BMP_BLOCKSIZE
; ++block
) {
1855 for (int i
= 0; i
< BMP_BLOCKSIZE
; ++i
) {
1856 int uc
= block
*BMP_BLOCKSIZE
+ i
;
1857 UnicodeData d
= unicodeData
.value(uc
, UnicodeData(uc
));
1858 b
.properties
.append(d
.propertyIndex
);
1860 int index
= blocks
.indexOf(b
);
1862 index
= blocks
.size();
1864 used
+= BMP_BLOCKSIZE
;
1867 blockMap
.append(blocks
.at(index
).index
);
1870 int bmp_blocks
= blocks
.size();
1871 Q_ASSERT(blockMap
.size() == BMP_END
/BMP_BLOCKSIZE
);
// Second region: same procedure with SMP_BLOCKSIZE code points per block,
// from BMP_END up to SMP_END.
1873 for (int block
= BMP_END
/SMP_BLOCKSIZE
; block
< SMP_END
/SMP_BLOCKSIZE
; ++block
) {
1875 for (int i
= 0; i
< SMP_BLOCKSIZE
; ++i
) {
1876 int uc
= block
*SMP_BLOCKSIZE
+ i
;
1877 UnicodeData d
= unicodeData
.value(uc
, UnicodeData(uc
));
1878 b
.properties
.append(d
.propertyIndex
);
1880 int index
= blocks
.indexOf(b
);
1882 index
= blocks
.size();
1884 used
+= SMP_BLOCKSIZE
;
1887 blockMap
.append(blocks
.at(index
).index
);
// Size statistics; the factor 2 matches the `unsigned short` element type of
// the emitted trie.
1890 int bmp_block_data
= bmp_blocks
*BMP_BLOCKSIZE
*2;
1891 int bmp_trie
= BMP_END
/BMP_BLOCKSIZE
*2;
1892 int bmp_mem
= bmp_block_data
+ bmp_trie
;
1893 qDebug(" %d unique blocks in BMP.",blocks
.size());
1894 qDebug(" block data uses: %d bytes", bmp_block_data
);
1895 qDebug(" trie data uses : %d bytes", bmp_trie
);
1897 int smp_block_data
= (blocks
.size()- bmp_blocks
)*SMP_BLOCKSIZE
*2;
1898 int smp_trie
= (SMP_END
-BMP_END
)/SMP_BLOCKSIZE
*2;
1899 int smp_mem
= smp_block_data
+ smp_trie
;
1900 qDebug(" %d unique blocks in SMP.",blocks
.size()-bmp_blocks
);
1901 qDebug(" block data uses: %d bytes", smp_block_data
);
1902 qDebug(" trie data uses : %d bytes", smp_trie
);
// NOTE(review): 20 is presumably the size in bytes of one Properties entry —
// confirm against the struct declaration.
1904 qDebug("\n properties use : %d bytes", uniqueProperties
.size()*20);
1905 qDebug(" memory usage: %d bytes", bmp_mem
+smp_mem
+ uniqueProperties
.size()*20);
1908 out
+= "static const unsigned short uc_property_trie[] = {\n";
1910 // first write the map
1911 out
+= " // 0x" + QByteArray::number(BMP_END
, 16);
1912 for (int i
= 0; i
< BMP_END
/BMP_BLOCKSIZE
; ++i
) {
1914 if (out
.endsWith(' '))
1916 if (!((i
*BMP_BLOCKSIZE
) % 0x1000))
// Map entries are shifted by blockMap.size(); the generated macros index
// uc_property_trie with the stored value, so it has to be an offset into the
// same array.
1920 out
+= QByteArray::number(blockMap
.at(i
) + blockMap
.size());
1923 if (out
.endsWith(' '))
1925 out
+= "\n\n // 0x" + QByteArray::number(BMP_END
, 16) + " - 0x" + QByteArray::number(SMP_END
, 16) + "\n";;
1926 for (int i
= BMP_END
/BMP_BLOCKSIZE
; i
< blockMap
.size(); ++i
) {
1928 if (out
.endsWith(' '))
1930 if (!(i
% (0x10000/SMP_BLOCKSIZE
)))
1934 out
+= QByteArray::number(blockMap
.at(i
) + blockMap
.size());
1937 if (out
.endsWith(' '))
// Then the block data: the property indices of every unique block.
1941 for (int i
= 0; i
< blocks
.size(); ++i
) {
1942 if (out
.endsWith(' '))
1945 const PropertyBlock
&b
= blocks
.at(i
);
1946 for (int j
= 0; j
< b
.properties
.size(); ++j
) {
1948 if (out
.endsWith(' '))
1952 out
+= QByteArray::number(b
.properties
.at(j
));
1957 // we reserve one bit more than in the assert below for the sign
1958 Q_ASSERT(maxMirroredDiff
< (1<<12));
1959 Q_ASSERT(maxLowerCaseDiff
< (1<<14));
1960 Q_ASSERT(maxUpperCaseDiff
< (1<<14));
1961 Q_ASSERT(maxTitleCaseDiff
< (1<<14));
1962 Q_ASSERT(maxCaseFoldDiff
< (1<<14));
1964 if (out
.endsWith(' '))
// Lookup macros: both resolve a code point with two reads of
// uc_property_trie, using the shifts and masks defined at the top.
1968 "#define GET_PROP_INDEX(ucs4) \\\n"
1969 " (ucs4 < 0x" + QByteArray::number(BMP_END
, 16) + " \\\n"
1970 " ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT
) +
1971 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE
-1, 16)+ ")]) \\\n"
1972 " : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END
, 16) +
1973 ")>>" + QByteArray::number(SMP_SHIFT
) + ") + 0x" + QByteArray::number(BMP_END
/BMP_BLOCKSIZE
, 16) + "]"
1974 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE
-1, 16) + ")]))\n\n"
1975 "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
1976 "(uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT
) +
1977 "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE
-1, 16)+ ")])\n\n"
1980 "static const QUnicodeTables::Properties uc_properties [] = {\n";
1982 // keep in sync with the property declaration
// One initialiser per unique property set; the fields are written in the
// order of the quoted declaration lines below.
1983 for (int i
= 0; i
< uniqueProperties
.size(); ++i
) {
1984 PropertyFlags p
= uniqueProperties
.at(i
);
1986 // " ushort category : 8;\n"
1987 out
+= QByteArray::number( p
.category
);
1989 // " ushort line_break_class : 8;\n"
1990 out
+= QByteArray::number( p
.line_break_class
);
1992 // " ushort direction : 8;\n"
1993 out
+= QByteArray::number( p
.direction
);
1995 // " ushort combiningClass :8;\n"
1996 out
+= QByteArray::number( p
.combiningClass
);
1998 // " ushort joining : 2;\n"
1999 out
+= QByteArray::number( p
.joining
);
2001 // " signed short digitValue : 6;\n /* 5 needed */"
2002 out
+= QByteArray::number( p
.digitValue
);
2004 // " ushort unicodeVersion : 4;\n"
2005 out
+= QByteArray::number( p
.age
);
2007 // " ushort lowerCaseSpecial : 1;\n"
2008 // " ushort upperCaseSpecial : 1;\n"
2009 // " ushort titleCaseSpecial : 1;\n"
2010 // " ushort caseFoldSpecial : 1;\n"
2011 out
+= QByteArray::number( p
.lowerCaseSpecial
);
2013 out
+= QByteArray::number( p
.upperCaseSpecial
);
2015 out
+= QByteArray::number( p
.titleCaseSpecial
);
2017 out
+= QByteArray::number( p
.caseFoldSpecial
);
2019 // " signed short mirrorDiff : 16;\n"
2020 // " signed short lowerCaseDiff : 16;\n"
2021 // " signed short upperCaseDiff : 16;\n"
2022 // " signed short titleCaseDiff : 16;\n"
2023 // " signed short caseFoldDiff : 16;\n"
2024 out
+= QByteArray::number( p
.mirrorDiff
);
2026 out
+= QByteArray::number( p
.lowerCaseDiff
);
2028 out
+= QByteArray::number( p
.upperCaseDiff
);
2030 out
+= QByteArray::number( p
.titleCaseDiff
);
2032 out
+= QByteArray::number( p
.caseFoldDiff
);
2034 out
+= QByteArray::number( p
.graphemeBreak
);
2036 out
+= QByteArray::number( p
.wordBreak
);
2038 out
+= QByteArray::number( p
.sentenceBreak
);
// Accessor functions emitted into the generated file; each one maps a code
// point to its entry in uc_properties through the macros above.
2043 out
+= "static inline const QUnicodeTables::Properties *qGetProp(uint ucs4)\n"
2045 " int index = GET_PROP_INDEX(ucs4);\n"
2046 " return uc_properties + index;\n"
2049 "static inline const QUnicodeTables::Properties *qGetProp(ushort ucs2)\n"
2051 " int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2052 " return uc_properties + index;\n"
2055 "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(uint ucs4)\n"
2057 " int index = GET_PROP_INDEX(ucs4);\n"
2058 " return uc_properties + index;\n"
2061 "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(ushort ucs2)\n"
2063 " int index = GET_PROP_INDEX_UCS2(ucs2);\n"
2064 " return uc_properties + index;\n"
// CURRENT_UNICODE_VERSION is a string-literal macro, so it is concatenated
// with the adjacent literals at compile time.
2067 out
+= "#define CURRENT_VERSION "CURRENT_UNICODE_VERSION
"\n\n";
2069 out
+= "static const ushort specialCaseMap [] = {";
2070 for (int i
= 0; i
< specialCaseMap
.size(); ++i
) {
2073 out
+= QByteArray(" 0x") + QByteArray::number(specialCaseMap
.at(i
), 16);
2074 if (i
< specialCaseMap
.size() - 1)
2078 out
+= "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen
) + "\n\n";
// Two bytes per entry: the emitted element type is ushort.
2080 qDebug() << "Special case map uses " << specialCaseMap
.size()*2 << "bytes";
2086 struct DecompositionBlock
{
2087 DecompositionBlock() { index
= -1; }
2089 QList
<int> decompositionPositions
;
2090 bool operator ==(const DecompositionBlock
&other
)
2091 { return decompositionPositions
== other
.decompositionPositions
; }
// Builds the text for the decomposition lookup: the `uc_decomposition_trie[]`
// array, the GET_DECOMPOSITION_INDEX macro and the `uc_decomposition_map[]`
// array. Size statistics are logged with qDebug() along the way.
2094 static QByteArray
createCompositionInfo()
2096 qDebug("createCompositionInfo:");
// 16 code points per block below BMP_END, 256 per block from there to SMP_END.
2098 const int BMP_BLOCKSIZE
=16;
2099 const int BMP_SHIFT
= 4;
2100 const int BMP_END
= 0x3400; // start of Han
2101 const int SMP_END
= 0x30000;
2102 const int SMP_BLOCKSIZE
= 256;
2103 const int SMP_SHIFT
= 8;
// The table only covers code points below SMP_END.
2105 if(SMP_END
<= highestComposedCharacter
)
2106 qFatal("end of table smaller than highest composed character at %x", highestComposedCharacter
);
2108 QList
<DecompositionBlock
> blocks
;
2109 QList
<int> blockMap
;
2110 QList
<unsigned short> decompositions
;
// First region: for every code point with a decomposition, append one header
// word (decomposition type in the low byte, UTF-16 length above it) followed
// by the decomposition as UTF-16 to `decompositions`, and remember where it
// starts. Code points without a decomposition get 0xffff.
2115 for (int block
= 0; block
< BMP_END
/BMP_BLOCKSIZE
; ++block
) {
2116 DecompositionBlock b
;
2117 for (int i
= 0; i
< BMP_BLOCKSIZE
; ++i
) {
2118 int uc
= block
*BMP_BLOCKSIZE
+ i
;
2119 UnicodeData d
= unicodeData
.value(uc
, UnicodeData(uc
));
2120 if (!d
.decomposition
.isEmpty()) {
2122 for (int j
= 0; j
< d
.decomposition
.size(); ++j
)
// NOTE(review): `> 0x10000` treats U+10000 itself as a single UTF-16 unit,
// although it lies outside the BMP — confirm whether `>=` is intended here
// and in the matching test below.
2123 utf16Chars
+= d
.decomposition
.at(j
) > 0x10000 ? 2 : 1;
2124 decompositions
.append(d
.decompositionType
+ (utf16Chars
<<8));
2125 for (int j
= 0; j
< d
.decomposition
.size(); ++j
) {
2126 int code
= d
.decomposition
.at(j
);
2127 if (code
> 0x10000) {
2128 // save as surrogate pair
2130 ushort high
= code
/0x400 + 0xd800;
2131 ushort low
= code
%0x400 + 0xdc00;
2132 decompositions
.append(high
);
2133 decompositions
.append(low
);
2135 decompositions
.append(code
);
// +1 accounts for the header word written before the characters.
2138 b
.decompositionPositions
.append(tableIndex
);
2139 tableIndex
+= utf16Chars
+ 1;
2141 b
.decompositionPositions
.append(0xffff);
2144 int index
= blocks
.indexOf(b
);
2146 index
= blocks
.size();
2148 used
+= BMP_BLOCKSIZE
;
2151 blockMap
.append(blocks
.at(index
).index
);
2154 int bmp_blocks
= blocks
.size();
2155 Q_ASSERT(blockMap
.size() == BMP_END
/BMP_BLOCKSIZE
);
// Second region: same procedure with SMP_BLOCKSIZE code points per block.
2157 for (int block
= BMP_END
/SMP_BLOCKSIZE
; block
< SMP_END
/SMP_BLOCKSIZE
; ++block
) {
2158 DecompositionBlock b
;
2159 for (int i
= 0; i
< SMP_BLOCKSIZE
; ++i
) {
2160 int uc
= block
*SMP_BLOCKSIZE
+ i
;
2161 UnicodeData d
= unicodeData
.value(uc
, UnicodeData(uc
));
2162 if (!d
.decomposition
.isEmpty()) {
2164 for (int j
= 0; j
< d
.decomposition
.size(); ++j
)
2165 utf16Chars
+= d
.decomposition
.at(j
) > 0x10000 ? 2 : 1;
2166 decompositions
.append(d
.decompositionType
+ (utf16Chars
<<8));
2167 for (int j
= 0; j
< d
.decomposition
.size(); ++j
) {
2168 int code
= d
.decomposition
.at(j
);
2169 if (code
> 0x10000) {
2170 // save as surrogate pair
2172 ushort high
= code
/0x400 + 0xd800;
2173 ushort low
= code
%0x400 + 0xdc00;
2174 decompositions
.append(high
);
2175 decompositions
.append(low
);
2177 decompositions
.append(code
);
2180 b
.decompositionPositions
.append(tableIndex
);
2181 tableIndex
+= utf16Chars
+ 1;
2183 b
.decompositionPositions
.append(0xffff);
2186 int index
= blocks
.indexOf(b
);
2188 index
= blocks
.size();
2190 used
+= SMP_BLOCKSIZE
;
2193 blockMap
.append(blocks
.at(index
).index
);
// Size statistics; the factor 2 matches the `unsigned short` element type of
// the emitted arrays.
2196 int bmp_block_data
= bmp_blocks
*BMP_BLOCKSIZE
*2;
2197 int bmp_trie
= BMP_END
/BMP_BLOCKSIZE
*2;
2198 int bmp_mem
= bmp_block_data
+ bmp_trie
;
2199 qDebug(" %d unique blocks in BMP.",blocks
.size());
2200 qDebug(" block data uses: %d bytes", bmp_block_data
);
2201 qDebug(" trie data uses : %d bytes", bmp_trie
);
2202 qDebug(" memory usage: %d bytes", bmp_mem
);
2204 int smp_block_data
= (blocks
.size()- bmp_blocks
)*SMP_BLOCKSIZE
*2;
2205 int smp_trie
= (SMP_END
-BMP_END
)/SMP_BLOCKSIZE
*2;
2206 int smp_mem
= smp_block_data
+ smp_trie
;
2207 qDebug(" %d unique blocks in SMP.",blocks
.size()-bmp_blocks
);
2208 qDebug(" block data uses: %d bytes", smp_block_data
);
2209 qDebug(" trie data uses : %d bytes", smp_trie
);
2211 qDebug("\n decomposition table use : %d bytes", decompositions
.size()*2);
2212 qDebug(" memory usage: %d bytes", bmp_mem
+smp_mem
+ decompositions
.size()*2);
2216 out
+= "static const unsigned short uc_decomposition_trie[] = {\n";
2218 // first write the map
2219 out
+= " // 0 - 0x" + QByteArray::number(BMP_END
, 16);
2220 for (int i
= 0; i
< BMP_END
/BMP_BLOCKSIZE
; ++i
) {
2222 if (out
.endsWith(' '))
2224 if (!((i
*BMP_BLOCKSIZE
) % 0x1000))
// Map entries are shifted by blockMap.size(); the generated macro indexes
// uc_decomposition_trie with the stored value, so it has to be an offset into
// the same array.
2228 out
+= QByteArray::number(blockMap
.at(i
) + blockMap
.size());
2231 if (out
.endsWith(' '))
2233 out
+= "\n\n // 0x" + QByteArray::number(BMP_END
, 16) + " - 0x" + QByteArray::number(SMP_END
, 16) + "\n";;
2234 for (int i
= BMP_END
/BMP_BLOCKSIZE
; i
< blockMap
.size(); ++i
) {
2236 if (out
.endsWith(' '))
2238 if (!(i
% (0x10000/SMP_BLOCKSIZE
)))
2242 out
+= QByteArray::number(blockMap
.at(i
) + blockMap
.size());
2245 if (out
.endsWith(' '))
// Then the block data: the decomposition positions of every unique block.
2249 for (int i
= 0; i
< blocks
.size(); ++i
) {
2250 if (out
.endsWith(' '))
2253 const DecompositionBlock
&b
= blocks
.at(i
);
2254 for (int j
= 0; j
< b
.decompositionPositions
.size(); ++j
) {
2256 if (out
.endsWith(' '))
2260 out
+= "0x" + QByteArray::number(b
.decompositionPositions
.at(j
), 16);
2265 if (out
.endsWith(' '))
// Lookup macro: two reads of uc_decomposition_trie, with separate index
// arithmetic below BMP_END and between BMP_END and SMP_END.
2269 "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
2270 " (ucs4 < 0x" + QByteArray::number(BMP_END
, 16) + " \\\n"
2271 " ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT
) +
2272 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE
-1, 16)+ ")]) \\\n"
2273 " : (ucs4 < 0x" + QByteArray::number(SMP_END
, 16) + "\\\n"
2274 " ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END
, 16) +
2275 ")>>" + QByteArray::number(SMP_SHIFT
) + ") + 0x" + QByteArray::number(BMP_END
/BMP_BLOCKSIZE
, 16) + "]"
2276 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE
-1, 16) + ")]\\\n"
2279 "static const unsigned short uc_decomposition_map[] = {\n";
// Finally the decomposition data itself.
2281 for (int i
= 0; i
< decompositions
.size(); ++i
) {
2283 if (out
.endsWith(' '))
2287 out
+= "0x" + QByteArray::number(decompositions
.at(i
), 16);
2291 if (out
.endsWith(' '))
// Builds the text for the ligature lookup: the `uc_ligature_trie[]` array, the
// GET_LIGATURE_INDEX macro and the `uc_ligature_map[]` array. Ligatures are
// looked up in ligatureHashes by code point; each entry is asserted to have
// that code point as its `u2`. Size statistics are logged with qDebug().
2298 static QByteArray
createLigatureInfo()
2300 qDebug("createLigatureInfo: numLigatures=%d", numLigatures
);
2302 QList
<DecompositionBlock
> blocks
;
2303 QList
<int> blockMap
;
2304 QList
<unsigned short> ligatures
;
// Only code points below BMP_END are covered, in blocks of 32.
2306 const int BMP_BLOCKSIZE
= 32;
2307 const int BMP_SHIFT
= 5;
2308 const int BMP_END
= 0x3100;
2309 Q_ASSERT(highestLigature
< BMP_END
);
2314 for (int block
= 0; block
< BMP_END
/BMP_BLOCKSIZE
; ++block
) {
2315 DecompositionBlock b
;
2316 for (int i
= 0; i
< BMP_BLOCKSIZE
; ++i
) {
2317 int uc
= block
*BMP_BLOCKSIZE
+ i
;
2318 QList
<Ligature
> l
= ligatureHashes
.value(uc
);
2320 b
.decompositionPositions
.append(tableIndex
);
// Record layout in `ligatures`: a count, then (u1, ligature) pairs.
2323 ligatures
.append(l
.size());
// NOTE(review): this `i` appears to shadow the `i` of the enclosing
// per-code-point loop — confirm the nesting before renaming either one.
2324 for (int i
= 0; i
< l
.size(); ++i
) {
2325 Q_ASSERT(l
.at(i
).u2
== uc
);
2326 ligatures
.append(l
.at(i
).u1
);
2327 ligatures
.append(l
.at(i
).ligature
);
// Two words per pair plus one for the count.
2329 tableIndex
+= 2*l
.size() + 1;
2331 b
.decompositionPositions
.append(0xffff);
2334 int index
= blocks
.indexOf(b
);
2336 index
= blocks
.size();
2338 used
+= BMP_BLOCKSIZE
;
2341 blockMap
.append(blocks
.at(index
).index
);
2344 int bmp_blocks
= blocks
.size();
2345 Q_ASSERT(blockMap
.size() == BMP_END
/BMP_BLOCKSIZE
);
// Size statistics; the factor 2 matches the `unsigned short` element type of
// the emitted arrays.
2347 int bmp_block_data
= bmp_blocks
*BMP_BLOCKSIZE
*2;
2348 int bmp_trie
= BMP_END
/BMP_BLOCKSIZE
*2;
2349 int bmp_mem
= bmp_block_data
+ bmp_trie
;
2350 qDebug(" %d unique blocks in BMP.",blocks
.size());
2351 qDebug(" block data uses: %d bytes", bmp_block_data
);
2352 qDebug(" trie data uses : %d bytes", bmp_trie
);
2353 qDebug(" ligature data uses : %d bytes", ligatures
.size()*2);
2354 qDebug(" memory usage: %d bytes", bmp_mem
+ ligatures
.size() * 2);
2359 out
+= "static const unsigned short uc_ligature_trie[] = {\n";
2361 // first write the map
2362 out
+= " // 0 - 0x" + QByteArray::number(BMP_END
, 16);
2363 for (int i
= 0; i
< BMP_END
/BMP_BLOCKSIZE
; ++i
) {
2365 if (out
.endsWith(' '))
2367 if (!((i
*BMP_BLOCKSIZE
) % 0x1000))
// Map entries are shifted by blockMap.size(); the generated macro indexes
// uc_ligature_trie with the stored value, so it has to be an offset into the
// same array.
2371 out
+= QByteArray::number(blockMap
.at(i
) + blockMap
.size());
2374 if (out
.endsWith(' '))
// Then the block data: the table positions of every unique block.
2378 for (int i
= 0; i
< blocks
.size(); ++i
) {
2379 if (out
.endsWith(' '))
2382 const DecompositionBlock
&b
= blocks
.at(i
);
2383 for (int j
= 0; j
< b
.decompositionPositions
.size(); ++j
) {
2385 if (out
.endsWith(' '))
2389 out
+= "0x" + QByteArray::number(b
.decompositionPositions
.at(j
), 16);
2393 if (out
.endsWith(' '))
// Lookup macro: yields 0xffff for code points at or above BMP_END.
// NOTE(review): the emitted macro text ends in ");", i.e. the expansion
// carries its own semicolon — confirm that every use site expects that.
2397 "#define GET_LIGATURE_INDEX(u2) "
2398 "(u2 < 0x" + QByteArray::number(BMP_END
, 16) + " ? "
2399 "uc_ligature_trie[uc_ligature_trie[u2>>" + QByteArray::number(BMP_SHIFT
) +
2400 "] + (u2 & 0x" + QByteArray::number(BMP_BLOCKSIZE
-1, 16)+ ")] : 0xffff);\n\n"
2402 "static const unsigned short uc_ligature_map [] = {\n";
// Finally the ligature records themselves.
2404 for (int i
= 0; i
< ligatures
.size(); ++i
) {
2406 if (out
.endsWith(' '))
2410 out
+= "0x" + QByteArray::number(ligatures
.at(i
), 16);
2414 if (out
.endsWith(' '))
// Appends the text of a `struct CasingInfo` declaration to `out`; the three
// bit-fields written below add up to 32 bits (16 + 8 + 8).
2421 QByteArray
createCasingInfo()
2425 out
+= "struct CasingInfo {\n"
2426 " uint codePoint : 16;\n"
2427 " uint flags : 8;\n"
2428 " uint offset : 8;\n"
2434 int main(int, char **)
2438 initDecompositionMap();
2439 initGraphemeBreak();
2441 initSentenceBreak();
2444 readBidiMirroring();
2445 readArabicShaping();
2447 readCompositionExclusion();
2449 readSpecialCasing();
2453 readGraphemeBreak();
2455 readSentenceBreak();
2457 computeUniqueProperties();
2458 QByteArray properties
= createPropertyInfo();
2459 QByteArray compositions
= createCompositionInfo();
2460 QByteArray ligatures
= createLigatureInfo();
2461 QByteArray normalizationCorrections
= createNormalizationCorrections();
2462 QByteArray scriptEnumDeclaration
= createScriptEnumDeclaration();
2463 QByteArray scriptTableDeclaration
= createScriptTableDeclaration();
2465 QFile
f("../../src/corelib/tools/qunicodetables.cpp");
2466 f
.open(QFile::WriteOnly
|QFile::Truncate
);
2469 "/****************************************************************************\n"
2471 "** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).\n"
2472 "** All rights reserved.\n"
2473 "** Contact: Nokia Corporation (qt-info@nokia.com)\n"
2475 "** This file is part of the QtCore module of the Qt Toolkit.\n"
2477 "** $QT_BEGIN_LICENSE:LGPL$\n"
2478 "** No Commercial Usage\n"
2479 "** This file contains pre-release code and may not be distributed.\n"
2480 "** You may use this file in accordance with the terms and conditions\n"
2481 "** contained in the Technology Preview License Agreement accompanying\n"
2482 "** this package.\n"
2484 "** GNU Lesser General Public License Usage\n"
2485 "** Alternatively, this file may be used under the terms of the GNU Lesser\n"
2486 "** General Public License version 2.1 as published by the Free Software\n"
2487 "** Foundation and appearing in the file LICENSE.LGPL included in the\n"
2488 "** packaging of this file. Please review the following information to\n"
2489 "** ensure the GNU Lesser General Public License version 2.1 requirements\n"
2490 "** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
2492 "** In addition, as a special exception, Nokia gives you certain additional\n"
2493 "** rights. These rights are described in the Nokia Qt LGPL Exception\n"
2494 "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
2496 "** If you have questions regarding the use of this file, please contact\n"
2497 "** Nokia at qt-info@nokia.com.\n"
2506 "** $QT_END_LICENSE$\n"
2508 "****************************************************************************/\n\n"
2510 "/* This file is autogenerated from the Unicode 5.0 database. Do not edit */\n\n";
2512 QByteArray warning
=
2514 "// W A R N I N G\n"
2515 "// -------------\n"
2517 "// This file is not part of the Qt API. It exists for the convenience\n"
2518 "// of internal files. This header file may change from version to version\n"
2519 "// without notice, or even be removed.\n"
2525 f
.write("QT_BEGIN_NAMESPACE\n\n");
2526 f
.write(properties
);
2527 f
.write(compositions
);
2529 f
.write(normalizationCorrections
);
2530 f
.write(scriptTableDeclaration
);
2531 f
.write("\nQT_END_NAMESPACE\n");
2534 f
.setFileName("../../src/corelib/tools/qunicodetables_p.h");
2535 f
.open(QFile::WriteOnly
| QFile::Truncate
);
2538 f
.write("#ifndef QUNICODETABLES_P_H\n"
2539 "#define QUNICODETABLES_P_H\n\n"
2540 "#include <QtCore/qchar.h>\n\n"
2541 "QT_BEGIN_NAMESPACE\n\n");
2542 f
.write("namespace QUnicodeTables {\n");
2543 f
.write(property_string
);
2545 f
.write(scriptEnumDeclaration
);
2547 f
.write(lineBreakClass
);
2551 f
.write(grapheme_break_string
);
2553 f
.write(word_break_string
);
2555 f
.write(sentence_break_string
);
2557 "QT_END_NAMESPACE\n\n"
2561 qDebug() << "maxMirroredDiff = " << hex
<< maxMirroredDiff
;
2562 qDebug() << "maxLowerCaseDiff = " << hex
<< maxLowerCaseDiff
;
2563 qDebug() << "maxUpperCaseDiff = " << hex
<< maxUpperCaseDiff
;
2564 qDebug() << "maxTitleCaseDiff = " << hex
<< maxTitleCaseDiff
;
2565 qDebug() << "maxCaseFoldDiff = " << hex
<< maxCaseFoldDiff
;
2568 // dump(0x620, 0x640);
2569 // dump(0x10000, 0x10020);
2570 // dump(0x10800, 0x10820);
2572 qDebug("decompositionLength used:");
2573 int totalcompositions
= 0;
2575 for (int i
= 1; i
< 20; ++i
) {
2576 qDebug(" length %d used %d times", i
, decompositionLength
.value(i
, 0));
2577 totalcompositions
+= i
*decompositionLength
.value(i
, 0);
2578 sum
+= decompositionLength
.value(i
, 0);
2580 qDebug(" len decomposition map %d, average length %f, num composed chars %d",
2581 totalcompositions
, (float)totalcompositions
/(float)sum
, sum
);
2582 qDebug("highest composed character %x", highestComposedCharacter
);
2583 qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures
, highestLigature
, longestLigature
);
2585 qBubbleSort(ligatures
);
2586 for (int i
= 0; i
< ligatures
.size(); ++i
)
2587 qDebug("%s", ligatures
.at(i
).data());
2589 // qDebug("combiningClass usage:");
2590 // int numClasses = 0;
2591 // for (int i = 0; i < 255; ++i) {
2592 // int num = combiningClassUsage.value(i, 0);
2595 // qDebug(" combiningClass %d used %d times", i, num);
2598 // qDebug("total of %d combining classes used", numClasses);