Fix QT_NO_DATASTREAM macro checks and improve readability
[qt-netbsd.git] / src / corelib / tools / qchar.cpp
blob250dad097675f58ca451b6c2d074a6df7e7f1db1
1 /****************************************************************************
2 **
3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4 ** Contact: Qt Software Information (qt-info@nokia.com)
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** No Commercial Usage
10 ** This file contains pre-release code and may not be distributed.
11 ** You may use this file in accordance with the terms and conditions
12 ** contained in the either Technology Preview License Agreement or the
13 ** Beta Release License Agreement.
15 ** GNU Lesser General Public License Usage
16 ** Alternatively, this file may be used under the terms of the GNU Lesser
17 ** General Public License version 2.1 as published by the Free Software
18 ** Foundation and appearing in the file LICENSE.LGPL included in the
19 ** packaging of this file. Please review the following information to
20 ** ensure the GNU Lesser General Public License version 2.1 requirements
21 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23 ** In addition, as a special exception, Nokia gives you certain
24 ** additional rights. These rights are described in the Nokia Qt LGPL
25 ** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26 ** package.
28 ** GNU General Public License Usage
29 ** Alternatively, this file may be used under the terms of the GNU
30 ** General Public License version 3.0 as published by the Free Software
31 ** Foundation and appearing in the file LICENSE.GPL included in the
32 ** packaging of this file. Please review the following information to
33 ** ensure the GNU General Public License version 3.0 requirements will be
34 ** met: http://www.gnu.org/copyleft/gpl.html.
36 ** If you are unsure which license is appropriate for your use, please
37 ** contact the sales department at qt-sales@nokia.com.
38 ** $QT_END_LICENSE$
40 ****************************************************************************/
42 // Don't define it while compiling this module, or USERS of Qt will
43 // not be able to link.
44 #ifdef QT_NO_CAST_FROM_ASCII
45 #undef QT_NO_CAST_FROM_ASCII
46 #endif
47 #ifdef QT_NO_CAST_TO_ASCII
48 #undef QT_NO_CAST_TO_ASCII
49 #endif
50 #include "qchar.h"
51 #include "qdatastream.h"
52 #include "qtextcodec.h"
54 #include "qunicodetables_p.h"
56 #include "qunicodetables.cpp"
58 QT_BEGIN_NAMESPACE
60 #define LAST_UNICODE_CHAR 0x10ffff
62 #ifndef QT_NO_CODEC_FOR_C_STRINGS
63 #ifdef QT_NO_TEXTCODEC
64 #define QT_NO_CODEC_FOR_C_STRINGS
65 #endif
66 #endif
68 #define FLAG(x) (1 << (x))
70 /*! \class QLatin1Char
71 \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
73 \ingroup text
75 This class is only useful to avoid the codec for C strings business
76 in the QChar(ch) constructor. You can avoid it by writing
77 QChar(ch, 0).
79 \sa QChar, QLatin1String, QString
82 /*!
83 \fn const char QLatin1Char::toLatin1() const
85 Converts a Latin-1 character to an 8-bit ASCII representation of
86 the character.
89 /*!
90 \fn const ushort QLatin1Char::unicode() const
92 Converts a Latin-1 character to a 16-bit-encoded Unicode representation
93 of the character.
96 /*!
97 \fn QLatin1Char::QLatin1Char(char c)
99 Constructs a Latin-1 character for \a c. This constructor should be
100 used when the encoding of the input character is known to be Latin-1.
104 \class QChar
105 \brief The QChar class provides a 16-bit Unicode character.
107 \ingroup text
108 \reentrant
110 In Qt, Unicode characters are 16-bit entities without any markup
111 or structure. This class represents such an entity. It is
112 lightweight, so it can be used everywhere. Most compilers treat
113 it like an \c{unsigned short}.
115 QChar provides a full complement of testing/classification
116 functions, converting to and from other formats, converting from
117 composed to decomposed Unicode, and trying to compare and
118 case-convert if you ask it to.
120 The classification functions include functions like those in the
121 standard C++ header \<cctype\> (formerly \<ctype.h\>), but
122 operating on the full range of Unicode characters. They all
123 return true if the character is a certain type of character;
124 otherwise they return false. These classification functions are
125 isNull() (returns true if the character is '\\0'), isPrint()
126 (true if the character is any sort of printable character,
127 including whitespace), isPunct() (any sort of punctuation),
128 isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
129 sort of numeric character, not just 0-9), isLetterOrNumber(), and
130 isDigit() (decimal digits). All of these are wrappers around
131 category() which return the Unicode-defined category of each
132 character.
134 QChar also provides direction(), which indicates the "natural"
135 writing direction of this character. The joining() function
136 indicates how the character joins with its neighbors (needed
137 mostly for Arabic) and finally hasMirrored(), which indicates
138 whether the character needs to be mirrored when it is printed in
139 its "unnatural" writing direction.
141 Composed Unicode characters (like \aring) can be converted to
142 decomposed Unicode ("a" followed by "ring above") by using
143 decomposition().
145 In Unicode, comparison is not necessarily possible and case
146 conversion is very difficult at best. Unicode, covering the
147 "entire" world, also includes most of the world's case and
148 sorting problems. operator==() and friends will do comparison
149 based purely on the numeric Unicode value (code point) of the
150 characters, and toUpper() and toLower() will do case changes when
151 the character has a well-defined uppercase/lowercase equivalent.
152 For locale-dependent comparisons, use
153 QString::localeAwareCompare().
155 The conversion functions include unicode() (to a scalar),
156 toLatin1() (to scalar, but converts all non-Latin-1 characters to
157 0), row() (gives the Unicode row), cell() (gives the Unicode
158 cell), digitValue() (gives the integer value of any of the
159 numerous digit characters), and a host of constructors.
161 QChar provides constructors and cast operators that make it easy
162 to convert to and from traditional 8-bit \c{char}s. If you
163 defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
164 explained in the QString documentation, you will need to
165 explicitly call fromAscii() or fromLatin1(), or use QLatin1Char,
166 to construct a QChar from an 8-bit \c char, and you will need to
167 call toAscii() or toLatin1() to get the 8-bit value back.
169 \sa QString, Unicode, QLatin1Char
173 \enum QChar::UnicodeVersion
175 Specifies which version of the \l{http://www.unicode.org/}{Unicode standard}
176 introduced a certain character.
178 \value Unicode_1_1 Version 1.1
179 \value Unicode_2_0 Version 2.0
180 \value Unicode_2_1_2 Version 2.1.2
181 \value Unicode_3_0 Version 3.0
182 \value Unicode_3_1 Version 3.1
183 \value Unicode_3_2 Version 3.2
184 \value Unicode_4_0 Version 4.0
185 \value Unicode_4_1 Version 4.1
186 \value Unicode_5_0 Version 5.0
187 \value Unicode_Unassigned The value is not assigned to any character
188 in version 5.0 of Unicode.
190 \sa unicodeVersion()
194 \enum QChar::Category
196 This enum maps the Unicode character categories.
198 The following characters are normative in Unicode:
200 \value Mark_NonSpacing Unicode class name Mn
202 \value Mark_SpacingCombining Unicode class name Mc
204 \value Mark_Enclosing Unicode class name Me
206 \value Number_DecimalDigit Unicode class name Nd
208 \value Number_Letter Unicode class name Nl
210 \value Number_Other Unicode class name No
212 \value Separator_Space Unicode class name Zs
214 \value Separator_Line Unicode class name Zl
216 \value Separator_Paragraph Unicode class name Zp
218 \value Other_Control Unicode class name Cc
220 \value Other_Format Unicode class name Cf
222 \value Other_Surrogate Unicode class name Cs
224 \value Other_PrivateUse Unicode class name Co
226 \value Other_NotAssigned Unicode class name Cn
229 The following categories are informative in Unicode:
231 \value Letter_Uppercase Unicode class name Lu
233 \value Letter_Lowercase Unicode class name Ll
235 \value Letter_Titlecase Unicode class name Lt
237 \value Letter_Modifier Unicode class name Lm
239 \value Letter_Other Unicode class name Lo
241 \value Punctuation_Connector Unicode class name Pc
243 \value Punctuation_Dash Unicode class name Pd
245 \value Punctuation_Open Unicode class name Ps
247 \value Punctuation_Close Unicode class name Pe
249 \value Punctuation_InitialQuote Unicode class name Pi
251 \value Punctuation_FinalQuote Unicode class name Pf
253 \value Punctuation_Other Unicode class name Po
255 \value Symbol_Math Unicode class name Sm
257 \value Symbol_Currency Unicode class name Sc
259 \value Symbol_Modifier Unicode class name Sk
261 \value Symbol_Other Unicode class name So
263 \value NoCategory Qt cannot find an appropriate category for the character.
265 \omitvalue Punctuation_Dask
267 \sa category()
271 \enum QChar::Direction
273 This enum type defines the Unicode direction attributes. See the
274 \l{http://www.unicode.org/}{Unicode Standard} for a description
275 of the values.
277 In order to conform to C/C++ naming conventions "Dir" is prepended
278 to the codes used in the Unicode Standard.
280 \value DirAL
281 \value DirAN
282 \value DirB
283 \value DirBN
284 \value DirCS
285 \value DirEN
286 \value DirES
287 \value DirET
288 \value DirL
289 \value DirLRE
290 \value DirLRO
291 \value DirNSM
292 \value DirON
293 \value DirPDF
294 \value DirR
295 \value DirRLE
296 \value DirRLO
297 \value DirS
298 \value DirWS
300 \sa direction()
304 \enum QChar::Decomposition
306 This enum type defines the Unicode decomposition attributes. See
307 the \l{http://www.unicode.org/}{Unicode Standard} for a
308 description of the values.
310 \value NoDecomposition
311 \value Canonical
312 \value Circle
313 \value Compat
314 \value Final
315 \value Font
316 \value Fraction
317 \value Initial
318 \value Isolated
319 \value Medial
320 \value Narrow
321 \value NoBreak
322 \value Small
323 \value Square
324 \value Sub
325 \value Super
326 \value Vertical
327 \value Wide
329 \omitvalue Single
331 \sa decomposition()
335 \enum QChar::Joining
337 This enum type defines the Unicode joining attributes. See the
338 \l{http://www.unicode.org/}{Unicode Standard} for a description
339 of the values.
341 \value Center
342 \value Dual
343 \value OtherJoining
344 \value Right
346 \sa joining()
350 \enum QChar::CombiningClass
352 \internal
354 This enum type defines names for some of the Unicode combining
355 classes. See the \l{http://www.unicode.org/}{Unicode Standard}
356 for a description of the values.
358 \value Combining_Above
359 \value Combining_AboveAttached
360 \value Combining_AboveLeft
361 \value Combining_AboveLeftAttached
362 \value Combining_AboveRight
363 \value Combining_AboveRightAttached
364 \value Combining_Below
365 \value Combining_BelowAttached
366 \value Combining_BelowLeft
367 \value Combining_BelowLeftAttached
368 \value Combining_BelowRight
369 \value Combining_BelowRightAttached
370 \value Combining_DoubleAbove
371 \value Combining_DoubleBelow
372 \value Combining_IotaSubscript
373 \value Combining_Left
374 \value Combining_LeftAttached
375 \value Combining_Right
376 \value Combining_RightAttached
380 \enum QChar::SpecialCharacter
382 \value Null A QChar with this value isNull().
383 \value Nbsp Non-breaking space.
384 \value ReplacementCharacter
385 \value ObjectReplacementCharacter The character shown when a font has no glyph for a certain codepoint. The square character is normally used.
386 \value ByteOrderMark
387 \value ByteOrderSwapped
388 \value ParagraphSeparator
389 \value LineSeparator
391 \omitvalue null
392 \omitvalue replacement
393 \omitvalue byteOrderMark
394 \omitvalue byteOrderSwapped
395 \omitvalue nbsp
399 \fn void QChar::setCell(uchar cell)
400 \internal
404 \fn void QChar::setRow(uchar row)
405 \internal
409 \fn QChar::QChar()
411 Constructs a null QChar ('\\0').
413 \sa isNull()
417 \fn QChar::QChar(QLatin1Char ch)
419 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
423 \fn QChar::QChar(SpecialCharacter ch)
425 Constructs a QChar for the predefined character value \a ch.
429 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
432 QChar::QChar(char ch)
434 #ifndef QT_NO_CODEC_FOR_C_STRINGS
435 if (QTextCodec::codecForCStrings())
436 // #####
437 ucs = QTextCodec::codecForCStrings()->toUnicode(&ch, 1).at(0).unicode();
438 else
439 #endif
440 ucs = uchar(ch);
444 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
446 QChar::QChar(uchar ch)
448 #ifndef QT_NO_CODEC_FOR_C_STRINGS
449 if (QTextCodec::codecForCStrings()) {
450 // #####
451 char c = char(ch);
452 ucs = QTextCodec::codecForCStrings()->toUnicode(&c, 1).at(0).unicode();
453 } else
454 #endif
455 ucs = ch;
459 \fn QChar::QChar(uchar cell, uchar row)
461 Constructs a QChar for Unicode cell \a cell in row \a row.
463 \sa cell(), row()
467 \fn QChar::QChar(ushort code)
469 Constructs a QChar for the character with Unicode code point \a
470 code.
475 \fn QChar::QChar(short code)
477 Constructs a QChar for the character with Unicode code point \a
478 code.
483 \fn QChar::QChar(uint code)
485 Constructs a QChar for the character with Unicode code point \a
486 code.
491 \fn QChar::QChar(int code)
493 Constructs a QChar for the character with Unicode code point \a
494 code.
499 \fn bool QChar::isNull() const
501 Returns true if the character is the Unicode character 0x0000
502 ('\\0'); otherwise returns false.
506 \fn uchar QChar::cell() const
508 Returns the cell (least significant byte) of the Unicode
509 character.
511 \sa row()
515 \fn uchar QChar::row() const
517 Returns the row (most significant byte) of the Unicode character.
519 \sa cell()
523 Returns true if the character is a printable character; otherwise
524 returns false. This is any character not of category Cc or Cn.
526 Note that this gives no indication of whether the character is
527 available in a particular font.
529 bool QChar::isPrint() const
531 const int test = FLAG(Other_Control) |
532 FLAG(Other_NotAssigned);
533 return !(FLAG(qGetProp(ucs)->category) & test);
537 Returns true if the character is a separator character (Separator_*
538 categories) or one of the code points 9 to 13 (tab, line feed, vertical tab, form feed, carriage return); otherwise returns false.
540 bool QChar::isSpace() const
542 if(ucs >= 9 && ucs <=13)
543 return true;
544 const int test = FLAG(Separator_Space) |
545 FLAG(Separator_Line) |
546 FLAG(Separator_Paragraph);
547 return FLAG(qGetProp(ucs)->category) & test;
551 Returns true if the character is a mark (Mark_* categories);
552 otherwise returns false.
554 See QChar::Category for more information regarding marks.
556 bool QChar::isMark() const
558 const int test = FLAG(Mark_NonSpacing) |
559 FLAG(Mark_SpacingCombining) |
560 FLAG(Mark_Enclosing);
561 return FLAG(qGetProp(ucs)->category) & test;
565 Returns true if the character is a punctuation mark (Punctuation_*
566 categories); otherwise returns false.
568 bool QChar::isPunct() const
570 const int test = FLAG(Punctuation_Connector) |
571 FLAG(Punctuation_Dash) |
572 FLAG(Punctuation_Open) |
573 FLAG(Punctuation_Close) |
574 FLAG(Punctuation_InitialQuote) |
575 FLAG(Punctuation_FinalQuote) |
576 FLAG(Punctuation_Other);
577 return FLAG(qGetProp(ucs)->category) & test;
581 Returns true if the character is a letter (Letter_* categories);
582 otherwise returns false.
584 bool QChar::isLetter() const
586 const int test = FLAG(Letter_Uppercase) |
587 FLAG(Letter_Lowercase) |
588 FLAG(Letter_Titlecase) |
589 FLAG(Letter_Modifier) |
590 FLAG(Letter_Other);
591 return FLAG(qGetProp(ucs)->category) & test;
595 Returns true if the character is a number (Number_* categories,
596 not just 0-9); otherwise returns false.
598 \sa isDigit()
600 bool QChar::isNumber() const
602 const int test = FLAG(Number_DecimalDigit) |
603 FLAG(Number_Letter) |
604 FLAG(Number_Other);
605 return FLAG(qGetProp(ucs)->category) & test;
609 Returns true if the character is a letter or number (Letter_* or
610 Number_* categories); otherwise returns false.
612 bool QChar::isLetterOrNumber() const
614 const int test = FLAG(Letter_Uppercase) |
615 FLAG(Letter_Lowercase) |
616 FLAG(Letter_Titlecase) |
617 FLAG(Letter_Modifier) |
618 FLAG(Letter_Other) |
619 FLAG(Number_DecimalDigit) |
620 FLAG(Number_Letter) |
621 FLAG(Number_Other);
622 return FLAG(qGetProp(ucs)->category) & test;
627 Returns true if the character is a decimal digit
628 (Number_DecimalDigit); otherwise returns false.
630 bool QChar::isDigit() const
632 return (qGetProp(ucs)->category == Number_DecimalDigit);
637 Returns true if the character is a symbol (Symbol_* categories);
638 otherwise returns false.
640 bool QChar::isSymbol() const
642 const int test = FLAG(Symbol_Math) |
643 FLAG(Symbol_Currency) |
644 FLAG(Symbol_Modifier) |
645 FLAG(Symbol_Other);
646 return FLAG(qGetProp(ucs)->category) & test;
650 \fn bool QChar::isHighSurrogate() const
652 Returns true if the QChar is the high part of a UTF-16 surrogate
653 (i.e. if its code point is between 0xd800 and 0xdbff).
657 \fn bool QChar::isLowSurrogate() const
659 Returns true if the QChar is the low part of a UTF-16 surrogate
660 (i.e. if its code point is between 0xdc00 and 0xdfff).
664 \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)
666 Converts a UTF16 surrogate pair with the given \a high and \a low values
667 to its UCS-4 code point.
671 \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
673 Converts a utf16 surrogate pair (\a high, \a low) to its ucs4 code
674 point.
678 \fn static ushort QChar::highSurrogate(uint ucs4)
680 Returns the high surrogate value of a ucs4 code point.
681 The returned result is undefined if \a ucs4 is smaller than 0x10000.
685 \fn static ushort QChar::lowSurrogate(uint ucs4)
687 Returns the low surrogate value of a ucs4 code point.
688 The returned result is undefined if \a ucs4 is smaller than 0x10000.
692 Returns the numeric value of the digit, or -1 if the character is
693 not a digit.
695 int QChar::digitValue() const
697 return qGetProp(ucs)->digitValue;
701 \overload
702 Returns the numeric value of the digit, specified by the UCS-2-encoded
703 character, \a ucs2, or -1 if the character is not a digit.
705 int QChar::digitValue(ushort ucs2)
707 return qGetProp(ucs2)->digitValue;
711 \overload
712 Returns the numeric value of the digit specified by the UCS-4-encoded
713 character, \a ucs4, or -1 if the character is not a digit.
715 int QChar::digitValue(uint ucs4)
717 if (ucs4 > LAST_UNICODE_CHAR)
718 return 0;
719 return qGetProp(ucs4)->digitValue;
723 Returns the character's category.
725 QChar::Category QChar::category() const
727 return (QChar::Category) qGetProp(ucs)->category;
730 /*!
731 \overload
732 \since 4.3
733 Returns the category of the UCS-4-encoded character specified by \a ucs4.
735 QChar::Category QChar::category(uint ucs4)
737 if (ucs4 > LAST_UNICODE_CHAR)
738 return QChar::NoCategory;
739 return (QChar::Category) qGetProp(ucs4)->category;
742 /*!
743 \overload
744 Returns the category of the UCS-2-encoded character specified by \a ucs2.
746 QChar::Category QChar::category(ushort ucs2)
748 return (QChar::Category) qGetProp(ucs2)->category;
753 Returns the character's direction.
755 QChar::Direction QChar::direction() const
757 return (QChar::Direction) qGetProp(ucs)->direction;
760 /*!
761 \overload
762 Returns the direction of the UCS-4-encoded character specified by \a ucs4.
764 QChar::Direction QChar::direction(uint ucs4)
766 if (ucs4 > LAST_UNICODE_CHAR)
767 return QChar::DirL;
768 return (QChar::Direction) qGetProp(ucs4)->direction;
771 /*!
772 \overload
773 Returns the direction of the UCS-2-encoded character specified by \a ucs2.
775 QChar::Direction QChar::direction(ushort ucs2)
777 return (QChar::Direction) qGetProp(ucs2)->direction;
781 Returns information about the joining properties of the character
782 (needed for certain languages such as Arabic).
784 QChar::Joining QChar::joining() const
786 return (QChar::Joining) qGetProp(ucs)->joining;
789 /*!
790 \overload
791 Returns information about the joining properties of the UCS-4-encoded
792 character specified by \a ucs4 (needed for certain languages such as
793 Arabic).
795 QChar::Joining QChar::joining(uint ucs4)
797 if (ucs4 > LAST_UNICODE_CHAR)
798 return QChar::OtherJoining;
799 return (QChar::Joining) qGetProp(ucs4)->joining;
802 /*!
803 \overload
804 Returns information about the joining properties of the UCS-2-encoded
805 character specified by \a ucs2 (needed for certain languages such as
806 Arabic).
808 QChar::Joining QChar::joining(ushort ucs2)
810 return (QChar::Joining) qGetProp(ucs2)->joining;
815 Returns true if the character should be reversed if the text
816 direction is reversed; otherwise returns false.
818 Same as (ch.mirroredChar() != ch).
820 \sa mirroredChar()
822 bool QChar::hasMirrored() const
824 return qGetProp(ucs)->mirrorDiff != 0;
828 \fn bool QChar::isLower() const
830 Returns true if the character is a lowercase letter, i.e.
831 category() is Letter_Lowercase.
833 \sa isUpper(), toLower(), toUpper()
837 \fn bool QChar::isUpper() const
839 Returns true if the character is an uppercase letter, i.e.
840 category() is Letter_Uppercase.
842 \sa isLower(), toUpper(), toLower()
846 \fn bool QChar::isTitleCase() const
847 \since 4.3
849 Returns true if the character is a titlecase letter, i.e.
850 category() is Letter_Titlecase.
852 \sa isLower(), toUpper(), toLower(), toTitleCase()
856 Returns the mirrored character if this character is a mirrored
857 character; otherwise returns the character itself.
859 \sa hasMirrored()
861 QChar QChar::mirroredChar() const
863 return ucs + qGetProp(ucs)->mirrorDiff;
866 /*! \overload
867 Returns the mirrored character if the UCS-4-encoded character specified
868 by \a ucs4 is a mirrored character; otherwise returns the character itself.
870 \sa hasMirrored()
872 uint QChar::mirroredChar(uint ucs4)
874 if (ucs4 > LAST_UNICODE_CHAR)
875 return ucs4;
876 return ucs4 + qGetProp(ucs4)->mirrorDiff;
879 /*!
880 \overload
881 Returns the mirrored character if the UCS-2-encoded character specified
882 by \a ucs2 is a mirrored character; otherwise returns the character itself.
884 \sa hasMirrored()
886 ushort QChar::mirroredChar(ushort ucs2)
888 return ucs2 + qGetProp(ucs2)->mirrorDiff;
892 enum {
893 Hangul_SBase = 0xac00,
894 Hangul_LBase = 0x1100,
895 Hangul_VBase = 0x1161,
896 Hangul_TBase = 0x11a7,
897 Hangul_SCount = 11172,
898 Hangul_LCount = 19,
899 Hangul_VCount = 21,
900 Hangul_TCount = 28,
901 Hangul_NCount = 21*28
904 // buffer has to have a length of 3. It's needed for Hangul decomposition
905 static const unsigned short * QT_FASTCALL decompositionHelper
906 (uint ucs4, int *length, int *tag, unsigned short *buffer)
908 *length = 0;
909 if (ucs4 > LAST_UNICODE_CHAR)
910 return 0;
911 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
912 int SIndex = ucs4 - Hangul_SBase;
913 buffer[0] = Hangul_LBase + SIndex / Hangul_NCount; // L
914 buffer[1] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V
915 buffer[2] = Hangul_TBase + SIndex % Hangul_TCount; // T
916 *length = buffer[2] == Hangul_TBase ? 2 : 3;
917 *tag = QChar::Canonical;
918 return buffer;
921 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
922 if (index == 0xffff)
923 return 0;
924 const unsigned short *decomposition = uc_decomposition_map+index;
925 *tag = (*decomposition) & 0xff;
926 *length = (*decomposition) >> 8;
927 return decomposition+1;
931 Decomposes a character into its parts. Returns an empty string if
932 no decomposition exists.
934 QString QChar::decomposition() const
936 return decomposition(ucs);
939 /*!
940 \overload
941 Decomposes the UCS-4-encoded character specified by \a ucs4 into its
942 constituent parts. Returns an empty string if no decomposition exists.
944 QString QChar::decomposition(uint ucs4)
946 unsigned short buffer[3];
947 int length;
948 int tag;
949 const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
950 return QString::fromUtf16(d, length);
954 Returns the tag defining the composition of the character. Returns
955 QChar::NoDecomposition if no decomposition exists.
957 QChar::Decomposition QChar::decompositionTag() const
959 return decompositionTag(ucs);
962 /*!
963 \overload
964 Returns the tag defining the composition of the UCS-4-encoded character
965 specified by \a ucs4. Returns QChar::Single if no decomposition exists.
967 QChar::Decomposition QChar::decompositionTag(uint ucs4)
969 if (ucs4 > LAST_UNICODE_CHAR)
970 return QChar::NoDecomposition;
971 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
972 if (index == 0xffff)
973 return QChar::NoDecomposition;
974 return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
978 Returns the combining class for the character as defined in the
979 Unicode standard. This is mainly useful as a positioning hint for
980 marks attached to a base character.
982 The Qt text rendering engine uses this information to correctly
983 position non-spacing marks around a base character.
985 unsigned char QChar::combiningClass() const
987 return (unsigned char) qGetProp(ucs)->combiningClass;
990 /*! \overload
991 Returns the combining class for the UCS-4-encoded character specified by
992 \a ucs4, as defined in the Unicode standard.
994 unsigned char QChar::combiningClass(uint ucs4)
996 if (ucs4 > LAST_UNICODE_CHAR)
997 return 0;
998 return (unsigned char) qGetProp(ucs4)->combiningClass;
1001 /*! \overload
1002 Returns the combining class for the UCS-2-encoded character specified by
1003 \a ucs2, as defined in the Unicode standard.
1005 unsigned char QChar::combiningClass(ushort ucs2)
1007 return (unsigned char) qGetProp(ucs2)->combiningClass;
1012 Returns the Unicode version that introduced this character.
1014 QChar::UnicodeVersion QChar::unicodeVersion() const
1016 return (QChar::UnicodeVersion) qGetProp(ucs)->unicodeVersion;
1019 /*! \overload
1020 Returns the Unicode version that introduced the character specified in
1021 its UCS-4-encoded form as \a ucs4.
1023 QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4)
1025 if (ucs4 > LAST_UNICODE_CHAR)
1026 return QChar::Unicode_Unassigned;
1027 return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1030 /*! \overload
1031 Returns the Unicode version that introduced the character specified in
1032 its UCS-2-encoded form as \a ucs2.
1034 QChar::UnicodeVersion QChar::unicodeVersion(ushort ucs2)
1036 return (QChar::UnicodeVersion) qGetProp(ucs2)->unicodeVersion;
1041 Returns the lowercase equivalent if the character is uppercase or titlecase;
1042 otherwise returns the character itself.
1044 QChar QChar::toLower() const
1046 const QUnicodeTables::Properties *p = qGetProp(ucs);
1047 if (!p->lowerCaseSpecial)
1048 return ucs + p->lowerCaseDiff;
1049 return ucs;
1052 /*! \overload
1053 Returns the lowercase equivalent of the UCS-4-encoded character specified
1054 by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1055 the character itself.
1057 uint QChar::toLower(uint ucs4)
1059 if (ucs4 > LAST_UNICODE_CHAR)
1060 return ucs4;
1061 const QUnicodeTables::Properties *p = qGetProp(ucs4);
1062 if (!p->lowerCaseSpecial)
1063 return ucs4 + p->lowerCaseDiff;
1064 return ucs4;
1067 /*! \overload
1068 Returns the lowercase equivalent of the UCS-2-encoded character specified
1069 by \a ucs2 if the character is uppercase or titlecase; otherwise returns
1070 the character itself.
1072 ushort QChar::toLower(ushort ucs2)
1074 const QUnicodeTables::Properties *p = qGetProp(ucs2);
1075 if (!p->lowerCaseSpecial)
1076 return ucs2 + p->lowerCaseDiff;
1077 return ucs2;
1081 Returns the uppercase equivalent if the character is lowercase or titlecase;
1082 otherwise returns the character itself.
1084 QChar QChar::toUpper() const
1086 const QUnicodeTables::Properties *p = qGetProp(ucs);
1087 if (!p->upperCaseSpecial)
1088 return ucs + p->upperCaseDiff;
1089 return ucs;
1092 /*! \overload
1093 Returns the uppercase equivalent of the UCS-4-encoded character specified
1094 by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1095 the character itself.
1097 uint QChar::toUpper(uint ucs4)
1099 if (ucs4 > LAST_UNICODE_CHAR)
1100 return ucs4;
1101 const QUnicodeTables::Properties *p = qGetProp(ucs4);
1102 if (!p->upperCaseSpecial)
1103 return ucs4 + p->upperCaseDiff;
1104 return ucs4;
1107 /*! \overload
1108 Returns the uppercase equivalent of the UCS-2-encoded character specified
1109 by \a ucs2 if the character is lowercase or titlecase; otherwise returns
1110 the character itself.
1112 ushort QChar::toUpper(ushort ucs2)
1114 const QUnicodeTables::Properties *p = qGetProp(ucs2);
1115 if (!p->upperCaseSpecial)
1116 return ucs2 + p->upperCaseDiff;
1117 return ucs2;
1121 Returns the title case equivalent if the character is lowercase or uppercase;
1122 otherwise returns the character itself.
1124 QChar QChar::toTitleCase() const
1126 const QUnicodeTables::Properties *p = qGetProp(ucs);
1127 if (!p->titleCaseSpecial)
1128 return ucs + p->titleCaseDiff;
1129 return ucs;
1133 \overload
1134 Returns the title case equivalent of the UCS-4-encoded character specified
1135 by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1136 the character itself.
1138 uint QChar::toTitleCase(uint ucs4)
1140 if (ucs4 > LAST_UNICODE_CHAR)
1141 return ucs4;
1142 const QUnicodeTables::Properties *p = qGetProp(ucs4);
1143 if (!p->titleCaseSpecial)
1144 return ucs4 + p->titleCaseDiff;
1145 return ucs4;
1149 \overload
1150 Returns the title case equivalent of the UCS-2-encoded character specified
1151 by \a ucs2 if the character is lowercase or uppercase; otherwise returns
1152 the character itself.
1154 ushort QChar::toTitleCase(ushort ucs2)
1156 const QUnicodeTables::Properties *p = qGetProp(ucs2);
1157 if (!p->titleCaseSpecial)
1158 return ucs2 + p->titleCaseDiff;
1159 return ucs2;
1163 static inline uint foldCase(const ushort *ch, const ushort *start)
1165 uint c = *ch;
1166 if (QChar(c).isLowSurrogate() && ch > start && QChar(*(ch - 1)).isHighSurrogate())
1167 c = QChar::surrogateToUcs4(*(ch - 1), c);
1168 return *ch + qGetProp(c)->caseFoldDiff;
1171 static inline uint foldCase(uint ch, uint &last)
1173 uint c = ch;
1174 if (QChar(c).isLowSurrogate() && QChar(last).isHighSurrogate())
1175 c = QChar::surrogateToUcs4(last, c);
1176 last = ch;
1177 return ch + qGetProp(c)->caseFoldDiff;
1180 static inline ushort foldCase(ushort ch)
1182 return ch + qGetProp(ch)->caseFoldDiff;
1186 Returns the case folded equivalent of the character. For most Unicode characters this
1187 is the same as toLower().
1189 QChar QChar::toCaseFolded() const
1191 return ucs + qGetProp(ucs)->caseFoldDiff;
1195 \overload
1196 Returns the case folded equivalent of the UCS-4-encoded character specified
1197 by \a ucs4. For most Unicode characters this is the same as toLower().
1199 uint QChar::toCaseFolded(uint ucs4)
1201 if (ucs4 > LAST_UNICODE_CHAR)
1202 return ucs4;
1203 return ucs4 + qGetProp(ucs4)->caseFoldDiff;
1207 \overload
1208 Returns the case folded equivalent of the UCS-2-encoded character specified
1209 by \a ucs2. For most Unicode characters this is the same as toLower().
1211 ushort QChar::toCaseFolded(ushort ucs2)
1213 return ucs2 + qGetProp(ucs2)->caseFoldDiff;
1218 \fn char QChar::latin1() const
1220 Use toLatin1() instead.
1224 \fn char QChar::ascii() const
1226 Use toAscii() instead.
1230 \fn char QChar::toLatin1() const
1232 Returns the Latin-1 character equivalent to the QChar, or 0. This
1233 is mainly useful for non-internationalized software.
1235 \sa toAscii(), unicode(), QTextCodec::codecForCStrings()
1239 \fn char QChar::toAscii() const
1240 Returns the character value of the QChar obtained using the current
1241 codec used to read C strings, or 0 if the character is not representable
1242 using this codec. The default codec handles Latin-1 encoded text,
1243 but this can be changed to assist developers writing source code using
1244 other encodings.
1246 The main purpose of this function is to preserve ASCII characters used
1247 in C strings. This is mainly useful for developers of non-internationalized
1248 software.
1250 \sa toLatin1(), unicode(), QTextCodec::codecForCStrings()
1252 #ifdef Q_COMPILER_MANGLES_RETURN_TYPE
1253 const char QChar::toAscii() const
1254 #else
1255 char QChar::toAscii() const
1256 #endif
1258 #ifndef QT_NO_CODEC_FOR_C_STRINGS
1259 if (QTextCodec::codecForCStrings())
1260 // #####
1261 return QTextCodec::codecForCStrings()->fromUnicode(QString(*this)).at(0);
1262 #endif
1263 return ucs > 0xff ? 0 : char(ucs);
/*!
    \fn QChar QChar::fromLatin1(char c)

    Converts the Latin-1 character \a c to its equivalent QChar. This
    is mainly useful for non-internationalized software.

    \sa fromAscii(), unicode(), QTextCodec::codecForCStrings()
*/
1276 Converts the ASCII character \a c to its equivalent QChar. This
1277 is mainly useful for non-internationalized software.
1279 An alternative is to use QLatin1Char.
1281 \sa fromLatin1(), unicode(), QTextCodec::codecForCStrings()
1283 QChar QChar::fromAscii(char c)
1285 #ifndef QT_NO_CODEC_FOR_C_STRINGS
1286 if (QTextCodec::codecForCStrings())
1287 // #####
1288 return QTextCodec::codecForCStrings()->toUnicode(&c, 1).at(0).unicode();
1289 #endif
1290 return QChar(ushort((uchar)c));
1293 #ifndef QT_NO_DATASTREAM
1295 \relates QChar
1297 Writes the char \a chr to the stream \a out.
1299 \sa {Format of the QDataStream operators}
1302 QDataStream &operator<<(QDataStream &out, const QChar &chr)
1304 out << quint16(chr.unicode());
1305 return out;
1310 \relates QChar
1312 Reads a char from the stream \a in into char \a chr.
1314 \sa {Format of the QDataStream operators}
1317 QDataStream &operator>>(QDataStream &in, QChar &chr)
1319 quint16 u;
1320 in >> u;
1321 chr.unicode() = ushort(u);
1322 return in;
1324 #endif // QT_NO_DATASTREAM
/*!
    \fn ushort & QChar::unicode()

    Returns a reference to the numeric Unicode value of the QChar.
*/

/*!
    \fn ushort QChar::unicode() const

    \overload
*/
1338 /*****************************************************************************
1339 Documentation of QChar related functions
1340 *****************************************************************************/
/*!
    \fn bool operator==(QChar c1, QChar c2)

    \relates QChar

    Returns true if \a c1 and \a c2 are the same Unicode character;
    otherwise returns false.
*/

/*!
    \fn int operator!=(QChar c1, QChar c2)

    \relates QChar

    Returns true if \a c1 and \a c2 are not the same Unicode
    character; otherwise returns false.
*/

/*!
    \fn int operator<=(QChar c1, QChar c2)

    \relates QChar

    Returns true if the numeric Unicode value of \a c1 is less than
    or equal to that of \a c2; otherwise returns false.
*/

/*!
    \fn int operator>=(QChar c1, QChar c2)

    \relates QChar

    Returns true if the numeric Unicode value of \a c1 is greater than
    or equal to that of \a c2; otherwise returns false.
*/

/*!
    \fn int operator<(QChar c1, QChar c2)

    \relates QChar

    Returns true if the numeric Unicode value of \a c1 is less than
    that of \a c2; otherwise returns false.
*/

/*!
    \fn int operator>(QChar c1, QChar c2)

    \relates QChar

    Returns true if the numeric Unicode value of \a c1 is greater than
    that of \a c2; otherwise returns false.
*/
/*!
    \fn bool QChar::mirrored() const

    Use hasMirrored() instead.
*/

/*!
    \fn QChar QChar::lower() const

    Use toLower() instead.
*/

/*!
    \fn QChar QChar::upper() const

    Use toUpper() instead.
*/

/*!
    \fn bool QChar::networkOrdered()

    See if QSysInfo::ByteOrder == QSysInfo::BigEndian instead.
*/
1421 // ---------------------------------------------------------------------------
1424 static QString decomposeHelper
1425 (const QString &str, bool canonical, QChar::UnicodeVersion version)
1427 unsigned short buffer[3];
1429 QString s = str;
1431 const unsigned short *utf16 = s.utf16();
1432 const unsigned short *uc = utf16 + s.length();
1433 while (uc != utf16) {
1434 uint ucs4 = *(--uc);
1435 if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1436 ushort high = *(uc - 1);
1437 if (QChar(high).isHighSurrogate()) {
1438 --uc;
1439 ucs4 = QChar::surrogateToUcs4(high, ucs4);
1442 if (QChar::unicodeVersion(ucs4) > version)
1443 continue;
1444 int length;
1445 int tag;
1446 const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1447 if (!d || (canonical && tag != QChar::Canonical))
1448 continue;
1450 s.replace(uc - utf16, ucs4 > 0x10000 ? 2 : 1, (const QChar *)d, length);
1451 // since the insert invalidates the pointers and we do decomposition recursive
1452 int pos = uc - utf16;
1453 utf16 = s.utf16();
1454 uc = utf16 + pos + length;
1457 return s;
1461 static ushort ligatureHelper(ushort u1, ushort u2)
1463 // hangul L-V pair
1464 int LIndex = u1 - Hangul_LBase;
1465 if (0 <= LIndex && LIndex < Hangul_LCount) {
1466 int VIndex = u2 - Hangul_VBase;
1467 if (0 <= VIndex && VIndex < Hangul_VCount)
1468 return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1471 // hangul LV-T pair
1472 int SIndex = u1 - Hangul_SBase;
1473 if (0 <= SIndex && SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1474 int TIndex = u2 - Hangul_TBase;
1475 if (0 <= TIndex && TIndex <= Hangul_TCount)
1476 return u1 + TIndex;
1479 const unsigned short index = GET_LIGATURE_INDEX(u2);
1480 if (index == 0xffff)
1481 return 0;
1482 const unsigned short *ligatures = uc_ligature_map+index;
1483 ushort length = *ligatures;
1484 ++ligatures;
1485 // ### use bsearch
1486 for (uint i = 0; i < length; ++i)
1487 if (ligatures[2*i] == u1)
1488 return ligatures[2*i+1];
1489 return 0;
1492 static QString composeHelper(const QString &str)
1494 QString s = str;
1496 if (s.length() < 2)
1497 return s;
1499 // the loop can partly ignore high Unicode as all ligatures are in the BMP
1500 int starter = 0;
1501 int lastCombining = 0;
1502 int pos = 0;
1503 while (pos < s.length()) {
1504 uint uc = s.utf16()[pos];
1505 if (QChar(uc).isHighSurrogate() && pos < s.length()-1) {
1506 ushort low = s.utf16()[pos+1];
1507 if (QChar(low).isLowSurrogate()) {
1508 uc = QChar::surrogateToUcs4(uc, low);
1509 ++pos;
1512 int combining = QChar::combiningClass(uc);
1513 if (starter == pos - 1 || combining > lastCombining) {
1514 // allowed to form ligature with S
1515 QChar ligature = ligatureHelper(s.utf16()[starter], uc);
1516 if (ligature.unicode()) {
1517 s[starter] = ligature;
1518 s.remove(pos, 1);
1519 continue;
1522 if (!combining)
1523 starter = pos;
1524 lastCombining = combining;
1525 ++pos;
1527 return s;
1531 static QString canonicalOrderHelper
1532 (const QString &str, QChar::UnicodeVersion version)
1534 QString s = str;
1535 const int l = s.length()-1;
1536 int pos = 0;
1537 while (pos < l) {
1538 int p2 = pos+1;
1539 uint u1 = s.at(pos).unicode();
1540 if (QChar(u1).isHighSurrogate()) {
1541 ushort low = s.at(pos+1).unicode();
1542 if (QChar(low).isLowSurrogate()) {
1543 p2++;
1544 u1 = QChar::surrogateToUcs4(u1, low);
1545 if (p2 >= l)
1546 break;
1549 uint u2 = s.at(p2).unicode();
1550 if (QChar(u2).isHighSurrogate() && p2 < l-1) {
1551 ushort low = s.at(p2+1).unicode();
1552 if (QChar(low).isLowSurrogate()) {
1553 p2++;
1554 u2 = QChar::surrogateToUcs4(u2, low);
1558 int c2 = QChar::combiningClass(u2);
1559 if (QChar::unicodeVersion(u2) > version)
1560 c2 = 0;
1562 if (c2 == 0) {
1563 pos = p2+1;
1564 continue;
1566 int c1 = QChar::combiningClass(u1);
1567 if (QChar::unicodeVersion(u1) > version)
1568 c1 = 0;
1570 if (c1 > c2) {
1571 QChar *uc = s.data();
1572 int p = pos;
1573 // exchange characters
1574 if (u2 < 0x10000) {
1575 uc[p++] = u2;
1576 } else {
1577 uc[p++] = QChar::highSurrogate(u2);
1578 uc[p++] = QChar::lowSurrogate(u2);
1580 if (u1 < 0x10000) {
1581 uc[p++] = u1;
1582 } else {
1583 uc[p++] = QChar::highSurrogate(u1);
1584 uc[p++] = QChar::lowSurrogate(u1);
1586 if (pos > 0)
1587 --pos;
1588 if (pos > 0 && s.at(pos).isLowSurrogate())
1589 --pos;
1590 } else {
1591 ++pos;
1592 if (u1 > 0x10000)
1593 ++pos;
1596 return s;
1599 int QT_FASTCALL QUnicodeTables::script(unsigned int uc)
1601 if (uc > 0xffff)
1602 return Common;
1603 int script = uc_scripts[uc >> 7];
1604 if (script < ScriptSentinel)
1605 return script;
1606 script = (((script - ScriptSentinel) * UnicodeBlockSize) + UnicodeBlockCount);
1607 script = uc_scripts[script + (uc & 0x7f)];
1608 return script;
1612 Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL QUnicodeTables::lineBreakClass(uint ucs4)
1614 return (QUnicodeTables::LineBreakClass) qGetProp(ucs4)->line_break_class;
1618 QT_END_NAMESPACE