1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
5 * You can obtain one at http://mozilla.org/MPL/2.0/. */
7 /* Classes to iterate over grapheme, word, sentence, or line. */
9 #include "mozilla/intl/Segmenter.h"
11 #include "mozilla/intl/LineBreaker.h"
12 #include "mozilla/intl/WordBreaker.h"
13 #include "mozilla/intl/UnicodeProperties.h"
14 #include "nsUnicodeProperties.h"
15 #include "nsCharTraits.h"
17 using namespace mozilla::unicode
;
19 namespace mozilla::intl
{
21 SegmentIteratorUtf16::SegmentIteratorUtf16(Span
<const char16_t
> aText
)
24 Maybe
<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos
) {
// Constructs a line-break iterator over aText.
// aText is forwarded to the SegmentIteratorUtf16 base constructor and
// aOptions is copied into mOptions; the constructor body itself is empty.
31 LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span
<const char16_t
> aText
,
32 const LineBreakOptions
& aOptions
)
33 : SegmentIteratorUtf16(aText
), mOptions(aOptions
) {}
// Queries LineBreaker::Next with the text buffer (mText.Elements()), its
// length, and the current position mPos, and stores the result in nextPos.
// NOTE(review): the rest of this function (the body of the
// NS_LINEBREAKER_NEED_MORE_TEXT branch and whatever is returned) is not
// visible in this listing.
// NOTE(review): mOptions is not passed to LineBreaker::Next in the visible
// lines -- confirm whether the options are meant to influence breaking here.
35 Maybe
<uint32_t> LineBreakIteratorUtf16::Next() {
36 const int32_t nextPos
=
37 LineBreaker::Next(mText
.Elements(), mText
.Length(), mPos
);
// Special-case the sentinel value NS_LINEBREAKER_NEED_MORE_TEXT.
38 if (nextPos
== NS_LINEBREAKER_NEED_MORE_TEXT
) {
// Constructs a word-break iterator over aText by forwarding aText to the
// SegmentIteratorUtf16 base constructor; the constructor body is empty.
45 WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span
<const char16_t
> aText
)
46 : SegmentIteratorUtf16(aText
) {}
// Queries WordBreaker::Next with the text buffer (mText.Elements()), its
// length, and the current position mPos, and stores the result in nextPos.
// NOTE(review): the rest of this function (the body of the
// NS_WORDBREAKER_NEED_MORE_TEXT branch and whatever is returned) is not
// visible in this listing.
48 Maybe
<uint32_t> WordBreakIteratorUtf16::Next() {
49 const int32_t nextPos
=
50 WordBreaker::Next(mText
.Elements(), mText
.Length(), mPos
);
// Special-case the sentinel value NS_WORDBREAKER_NEED_MORE_TEXT.
51 if (nextPos
== NS_WORDBREAKER_NEED_MORE_TEXT
) {
// Constructs a forward grapheme-cluster iterator over aText by forwarding
// aText to the SegmentIteratorUtf16 base constructor; the body is empty.
58 GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
59 Span
<const char16_t
> aText
)
60 : SegmentIteratorUtf16(aText
) {}
// Short HST_* aliases for the U_HST_* Hangul syllable type constants
// (not applicable, leading Jamo, vowel Jamo, trailing Jamo, LV syllable,
// LVT syllable).
// NOTE(review): the enclosing enum declaration is not visible in this
// listing; presumably these are the enumerators of HSType, the type used by
// GetHangulSyllableType() below -- confirm against the full file.
63 HST_NONE
= U_HST_NOT_APPLICABLE
,
64 HST_L
= U_HST_LEADING_JAMO
,
65 HST_V
= U_HST_VOWEL_JAMO
,
66 HST_T
= U_HST_TRAILING_JAMO
,
67 HST_LV
= U_HST_LV_SYLLABLE
,
68 HST_LVT
= U_HST_LVT_SYLLABLE
// Returns the Hangul syllable type of aCh: looks up the
// IntProperty::HangulSyllableType value through
// UnicodeProperties::GetIntPropertyValue and converts the result to HSType.
// File-local helper (declared static).
71 static HSType
GetHangulSyllableType(uint32_t aCh
) {
72 return HSType(UnicodeProperties::GetIntPropertyValue(
73 aCh
, UnicodeProperties::IntProperty::HangulSyllableType
));
// Advances mPos through mText starting at the current position: reads a base
// character (combining a surrogate pair, or tracking conjoining Hangul Jamo
// state), then examines following characters that extend the cluster --
// cluster extenders, and emoji that follow a ZWJ when the base is an emoji.
// NOTE(review): several lines of this function are absent from this listing
// (for example the statement guarded by the "already reached the end"
// comment, the declarations of extendCluster and chLen, the control flow
// that selects among the three Hangul checks, and the return statements).
// The comments below describe only the lines that are visible.
76 Maybe
<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
77 const auto len
= mText
.Length();
79 // The iterator has already reached the end.
// Read the code unit at mPos and step past it.
83 uint32_t ch
= mText
[mPos
++];
// If that unit and the next one form a surrogate pair, merge them into a
// single code point and step past the second unit as well.
85 if (mPos
< len
&& NS_IS_SURROGATE_PAIR(ch
, mText
[mPos
])) {
86 ch
= SURROGATE_TO_UCS4(ch
, mText
[mPos
++]);
// Otherwise test whether ch lies in one of the ranges U+1100..U+11FF,
// U+A960..U+A97F or U+AC00..U+D7FF, which receive the Hangul handling below.
87 } else if ((ch
& ~0xff) == 0x1100 || (ch
>= 0xa960 && ch
<= 0xa97f) ||
88 (ch
>= 0xac00 && ch
<= 0xd7ff)) {
89 // Handle conjoining Jamo that make Hangul syllables
// hangulState starts as the syllable type of the base character.
90 HSType hangulState
= GetHangulSyllableType(ch
);
// hangulType is the syllable type of the character currently held in ch.
// NOTE(review): ch is presumably re-read from mText between these two
// statements; that code is not visible here.
93 HSType hangulType
= GetHangulSyllableType(ch
);
// The three checks below each adopt hangulType as the new state when the
// current state permits it. Which check applies to which hangulType value is
// decided by control flow not visible in this listing.
// Check 1: accepted only when the current state is a leading Jamo (HST_L).
98 if (hangulState
== HST_L
) {
99 hangulState
= hangulType
;
// Check 2: accepted unless the current state is HST_NONE, HST_T or HST_LVT.
105 if ((hangulState
!= HST_NONE
) && (hangulState
!= HST_T
) &&
106 (hangulState
!= HST_LVT
)) {
107 hangulState
= hangulType
;
// Check 3: accepted unless the current state is HST_NONE or HST_L.
113 if (hangulState
!= HST_NONE
&& hangulState
!= HST_L
) {
114 hangulState
= hangulType
;
// Code points used by the emoji handling: variation selector U+FE0F and the
// joiner U+200D.
126 const uint32_t kVS16
= 0xfe0f;
127 const uint32_t kZWJ
= 0x200d;
128 // UTF-16 surrogate values for Fitzpatrick type modifiers
129 const uint32_t kFitzpatrickHigh
= 0xD83C;
130 const uint32_t kFitzpatrickLowFirst
= 0xDFFB;
131 const uint32_t kFitzpatrickLowLast
= 0xDFFF;
// The base counts as an emoji when its presentation is EmojiDefault, or when
// it is TextDefault and is immediately followed either by kVS16 or by a
// Fitzpatrick modifier encoded as the surrogate pair
// kFitzpatrickHigh + [kFitzpatrickLowFirst, kFitzpatrickLowLast].
// The bounds checks (mPos < len, mPos + 1 < len) precede each mText read.
133 bool baseIsEmoji
= (GetEmojiPresentation(ch
) == EmojiDefault
) ||
134 (GetEmojiPresentation(ch
) == TextDefault
&&
135 ((mPos
< len
&& mText
[mPos
] == kVS16
) ||
136 (mPos
+ 1 < len
&& mText
[mPos
] == kFitzpatrickHigh
&&
137 mText
[mPos
+ 1] >= kFitzpatrickLowFirst
&&
138 mText
[mPos
+ 1] <= kFitzpatrickLowLast
)));
// Tracks whether the previously examined character was kZWJ.
139 bool prevWasZwj
= false;
145 // Check for surrogate pairs; note that isolated surrogates will just
146 // be treated as generic (non-cluster-extending) characters here,
147 // which is fine for cluster-iterating purposes
148 if (mPos
< len
- 1 && NS_IS_SURROGATE_PAIR(ch
, mText
[mPos
+ 1])) {
149 ch
= SURROGATE_TO_UCS4(ch
, mText
[mPos
+ 1]);
// The expression below (assigned to extendCluster, whose declaration is not
// visible) is true when ch is a cluster extender, or when the base is an
// emoji, the previous character was kZWJ, and ch is itself an emoji:
// EmojiDefault, or TextDefault followed by kVS16 at mPos + chLen.
154 IsClusterExtender(ch
) ||
155 (baseIsEmoji
&& prevWasZwj
&&
156 ((GetEmojiPresentation(ch
) == EmojiDefault
) ||
157 (GetEmojiPresentation(ch
) == TextDefault
&& mPos
+ chLen
< len
&&
158 mText
[mPos
+ chLen
] == kVS16
)));
// ch does not extend the cluster.
// NOTE(review): the body of this branch is not visible in this listing.
159 if (!extendCluster
) {
// Remember whether this character was the joiner, for the next character's
// emoji check.
163 prevWasZwj
= (ch
== kZWJ
);
// Sanity check: the position must never have moved past the end of mText.
167 MOZ_ASSERT(mPos
<= len
, "Next() has overshot the string!");
// Constructs a reverse grapheme-cluster iterator over aText. aText is
// forwarded to the SegmentIteratorUtf16 base constructor, then mPos is set
// to mText.Length() so the iterator starts at the end of the text.
171 GraphemeClusterBreakReverseIteratorUtf16::
172 GraphemeClusterBreakReverseIteratorUtf16(Span
<const char16_t
> aText
)
173 : SegmentIteratorUtf16(aText
) {
174 mPos
= mText
.Length();
// Walks mPos backwards through mText, combining surrogate pairs and testing
// each character with IsClusterExtender.
// NOTE(review): the declaration of ch, the loop structure, the body of the
// !IsClusterExtender branch and the return statements are not visible in
// this listing; the comments below describe only the visible lines.
177 Maybe
<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() {
// If the code unit just before mPos and ch form a surrogate pair (that unit
// first, ch second), step mPos back onto it and merge both into one code
// point. The mPos > 0 test guards the mText[mPos - 1] read.
186 if (mPos
> 0 && NS_IS_SURROGATE_PAIR(mText
[mPos
- 1], ch
)) {
187 ch
= SURROGATE_TO_UCS4(mText
[--mPos
], ch
);
// ch is not a cluster extender.
190 if (!IsClusterExtender(ch
)) {
195 // XXX May need to handle conjoining Jamo
200 Maybe
<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos
) {
// Creates a Segmenter for the given locale and options.
// Returns Err(ICUError::InternalError) when aOptions.mGranularity is
// SegmenterGranularity::Sentence; for every other granularity returns a new
// Segmenter constructed from aLocale and aOptions via MakeUnique.
207 Result
<UniquePtr
<Segmenter
>, ICUError
> Segmenter::TryCreate(
208 Span
<const char> aLocale
, const SegmenterOptions
& aOptions
) {
209 if (aOptions
.mGranularity
== SegmenterGranularity::Sentence
) {
210 // The Sentence iterator is not yet implemented.
211 return Err(ICUError::InternalError
);
213 return MakeUnique
<Segmenter
>(aLocale
, aOptions
);
// Returns a newly allocated iterator over aText, chosen by
// mOptions.mGranularity:
//   Grapheme -> GraphemeClusterBreakIteratorUtf16
//   Word     -> WordBreakIteratorUtf16
//   Line     -> LineBreakIteratorUtf16
// The Sentence case asserts as unreachable; TryCreate() in this file refuses
// to create a Segmenter with that granularity.
// NOTE(review): the statements following the two MOZ_ASSERT_UNREACHABLE
// calls are not visible in this listing.
216 UniquePtr
<SegmentIteratorUtf16
> Segmenter::Segment(
217 Span
<const char16_t
> aText
) const {
218 switch (mOptions
.mGranularity
) {
219 case SegmenterGranularity::Grapheme
:
220 return MakeUnique
<GraphemeClusterBreakIteratorUtf16
>(aText
);
221 case SegmenterGranularity::Sentence
:
222 MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
224 case SegmenterGranularity::Word
:
225 return MakeUnique
<WordBreakIteratorUtf16
>(aText
);
226 case SegmenterGranularity::Line
:
// NOTE(review): only aText is passed here, while the LineBreakIteratorUtf16
// constructor defined in this file also takes a LineBreakOptions argument --
// presumably defaulted in the header; confirm.
227 return MakeUnique
<LineBreakIteratorUtf16
>(aText
);
// Reached only if a granularity value is not covered by the switch above.
229 MOZ_ASSERT_UNREACHABLE("All granularities must be handled!");
233 } // namespace mozilla::intl