1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
5 * You can obtain one at http://mozilla.org/MPL/2.0/. */
7 /* Classes to iterate over graphemes, words, sentences, or lines. */
9 #ifndef intl_components_Segmenter_h_
10 #define intl_components_Segmenter_h_
12 #include "mozilla/intl/ICUError.h"
13 #include "mozilla/Maybe.h"
14 #include "mozilla/Result.h"
15 #include "mozilla/Span.h"
16 #include "mozilla/UniquePtr.h"
// Forward declarations of the ICU4X segmenter / UTF-16 break-iterator types
// that the iterator classes below hold pointers to. Compiled only when both
// MOZ_ICU4X and JS_HAS_INTL_API are defined.
// NOTE(review): the matching #endif and any enclosing namespace (the members
// below spell these types as capi::...) are not visible in this excerpt.
18 #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
20 struct ICU4XLineSegmenter
;
21 struct ICU4XLineBreakIteratorUtf16
;
22 struct ICU4XWordSegmenter
;
23 struct ICU4XWordBreakIteratorUtf16
;
24 struct ICU4XGraphemeClusterSegmenter
;
25 struct ICU4XGraphemeClusterBreakIteratorUtf16
;
26 struct ICU4XSentenceSegmenter
;
27 struct ICU4XSentenceBreakIteratorUtf16
;
31 namespace mozilla::intl
{
// Granularity at which a Segmenter splits text; the underlying type is
// uint8_t. SegmenterOptions::mGranularity defaults to Grapheme.
// NOTE(review): the enumerator list is not visible in this excerpt.
33 enum class SegmenterGranularity
: uint8_t {
// Options consumed by Segmenter (see Segmenter::TryCreate and its constructor).
40 struct SegmenterOptions final
{
// Segmentation granularity; Grapheme unless the caller overrides it.
41 SegmenterGranularity mGranularity
= SegmenterGranularity::Grapheme
;
45 * Interface of segment iterators. Subclass this class to implement an iterator for
// Abstract base class of the UTF-16 segment iterators declared in this header.
// Next() is pure virtual, so a concrete subclass must implement it.
48 class SegmentIteratorUtf16
{
50 virtual ~SegmentIteratorUtf16() = default;
52 // Disable copy or move semantics. Move semantics could be enabled in the
54 SegmentIteratorUtf16(SegmentIteratorUtf16
&&) = delete;
55 SegmentIteratorUtf16
& operator=(SegmentIteratorUtf16
&&) = delete;
56 SegmentIteratorUtf16(const SegmentIteratorUtf16
&) = delete;
57 SegmentIteratorUtf16
& operator=(const SegmentIteratorUtf16
&) = delete;
60 * Advance the iterator to the next break position.
62 * @return the break position. If there's no further break position, return
// Pure virtual: subclasses must override Next().
65 virtual Maybe
<uint32_t> Next() = 0;
68 * Advance the iterator to the first break position following the specified
71 * Note: if this iterator's current position is already >= aPos, this method
72 * behaves the same as Next().
// Virtual but not pure; every iterator subclass in this header overrides it.
74 virtual Maybe
<uint32_t> Seek(uint32_t aPos
);
// NOTE(review): presumably stores aText in mText; the definition is not
// visible in this excerpt.
77 explicit SegmentIteratorUtf16(Span
<const char16_t
> aText
);
79 // The text to iterate over.
80 Span
<const char16_t
> mText
;
82 // The current break position within mText.
86 // Each enum value has the same meaning with respect to the `word-break`
87 // property values in the CSS Text spec. See the details in
88 // https://drafts.csswg.org/css-text-3/#word-break-property
// The underlying type is uint8_t. LineBreakOptions::mWordBreakRule defaults to
// WordBreakRule::Normal.
// NOTE(review): the enumerator list is not visible in this excerpt.
89 enum class WordBreakRule
: uint8_t {
95 // Each enum value has the same meaning with respect to the `line-break`
96 // property values in the CSS Text spec. See the details in
97 // https://drafts.csswg.org/css-text-3/#line-break-property.
// The underlying type is uint8_t. LineBreakOptions::mLineBreakRule defaults to
// LineBreakRule::Auto.
// NOTE(review): the enumerator list is not visible in this excerpt.
98 enum class LineBreakRule
: uint8_t {
106 // Extra options for line break iterator.
// A default-constructed instance uses WordBreakRule::Normal,
// LineBreakRule::Auto, and mScriptIsChineseOrJapanese == false.
107 struct LineBreakOptions final
{
108 WordBreakRule mWordBreakRule
= WordBreakRule::Normal
;
109 LineBreakRule mLineBreakRule
= LineBreakRule::Auto
;
110 bool mScriptIsChineseOrJapanese
= false;
114 * Line break iterator for UTF-16 text.
// Concrete SegmentIteratorUtf16 for line breaks. Accepts optional
// LineBreakOptions (default-constructed when omitted) and overrides both
// Next() and Seek().
116 class LineBreakIteratorUtf16 final
: public SegmentIteratorUtf16
{
118 explicit LineBreakIteratorUtf16(Span
<const char16_t
> aText
,
119 const LineBreakOptions
& aOptions
= {});
120 ~LineBreakIteratorUtf16() override
;
122 Maybe
<uint32_t> Next() override
;
123 Maybe
<uint32_t> Seek(uint32_t aPos
) override
;
126 LineBreakOptions mOptions
;
// ICU4X handles, null by default.
// NOTE(review): cleanup is presumably done by the destructor declared above;
// confirm in the implementation file. Any preprocessor guard around these
// members is not visible in this excerpt.
129 capi::ICU4XLineSegmenter
* mSegmenter
= nullptr;
130 capi::ICU4XLineBreakIteratorUtf16
* mIterator
= nullptr;
135 * Word break iterator for UTF-16 text.
// Concrete SegmentIteratorUtf16 for word breaks; overrides both Next() and
// Seek().
137 class WordBreakIteratorUtf16 final
: public SegmentIteratorUtf16
{
139 explicit WordBreakIteratorUtf16(Span
<const char16_t
> aText
);
140 ~WordBreakIteratorUtf16() override
;
142 Maybe
<uint32_t> Next() override
;
143 Maybe
<uint32_t> Seek(uint32_t aPos
) override
;
// The ICU4X handles below exist only when both MOZ_ICU4X and JS_HAS_INTL_API
// are defined; both pointers are null by default.
// NOTE(review): the matching #endif is not visible in this excerpt.
145 #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
147 capi::ICU4XWordSegmenter
* mSegmenter
= nullptr;
148 capi::ICU4XWordBreakIteratorUtf16
* mIterator
= nullptr;
153 * Grapheme cluster break iterator for UTF-16 text.
// Concrete SegmentIteratorUtf16 for grapheme cluster breaks; overrides both
// Next() and Seek().
155 class GraphemeClusterBreakIteratorUtf16 final
: public SegmentIteratorUtf16
{
157 explicit GraphemeClusterBreakIteratorUtf16(Span
<const char16_t
> aText
);
158 ~GraphemeClusterBreakIteratorUtf16() override
;
160 Maybe
<uint32_t> Next() override
;
161 Maybe
<uint32_t> Seek(uint32_t aPos
) override
;
// ICU4X state, present only when both MOZ_ICU4X and JS_HAS_INTL_API are
// defined. Unlike the line/word/sentence iterators, the segmenter here is a
// static member (one per class rather than per instance); only mIterator is
// per-instance.
// NOTE(review): where sSegmenter is defined and initialized is not visible in
// this excerpt.
163 #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
165 static capi::ICU4XGraphemeClusterSegmenter
* sSegmenter
;
166 capi::ICU4XGraphemeClusterBreakIteratorUtf16
* mIterator
= nullptr;
171 * Grapheme cluster break reverse iterator for UTF-16 text.
173 * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it
// Concrete SegmentIteratorUtf16 that walks grapheme cluster breaks in reverse;
// overrides both Next() and Seek().
// NOTE(review): no destructor override is declared in the visible lines, and
// the contents of the trailing #if block are not visible in this excerpt.
176 class GraphemeClusterBreakReverseIteratorUtf16 final
177 : public SegmentIteratorUtf16
{
179 explicit GraphemeClusterBreakReverseIteratorUtf16(Span
<const char16_t
> aText
);
181 Maybe
<uint32_t> Next() override
;
182 Maybe
<uint32_t> Seek(uint32_t aPos
) override
;
185 #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
187 * Sentence break iterator for UTF-16 text.
// Concrete SegmentIteratorUtf16 for sentence breaks; overrides both Next() and
// Seek().
189 class SentenceBreakIteratorUtf16 final
: public SegmentIteratorUtf16
{
191 explicit SentenceBreakIteratorUtf16(Span
<const char16_t
> aText
);
192 ~SentenceBreakIteratorUtf16() override
;
194 Maybe
<uint32_t> Next() override
;
195 Maybe
<uint32_t> Seek(uint32_t aPos
) override
;
// ICU4X handles, null by default.
// NOTE(review): cleanup is presumably done by the destructor declared above;
// confirm in the implementation file. Any preprocessor guard around these
// members is not visible in this excerpt.
198 capi::ICU4XSentenceSegmenter
* mSegmenter
= nullptr;
199 capi::ICU4XSentenceBreakIteratorUtf16
* mIterator
= nullptr;
204 * This component is a Mozilla-focused API for working with segmenters in
205 * internationalization code.
207 * This is a factory class. Call Segment() to create an iterator over a text
208 * of the given granularity.
// Holds a SegmenterOptions and hands out segment iterators via Segment().
210 class Segmenter final
{
212 // NOTE: aLocale is a no-op currently.
// Returns either an owned Segmenter or an ICUError.
213 static Result
<UniquePtr
<Segmenter
>, ICUError
> TryCreate(
214 Span
<const char> aLocale
, const SegmenterOptions
& aOptions
);
// Copies aOptions into mOptions; aLocale is not used by this constructor.
216 explicit Segmenter(Span
<const char> aLocale
, const SegmenterOptions
& aOptions
)
217 : mOptions(aOptions
) {}
219 // Creates an iterator over aText of a given granularity in mOptions.
220 UniquePtr
<SegmentIteratorUtf16
> Segment(Span
<const char16_t
> aText
) const;
222 // TODO: Implement an iterator for Latin1 text.
223 // UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const;
// Options captured at construction time; read by Segment().
226 SegmenterOptions mOptions
;
229 } // namespace mozilla::intl