Bug 1843499 - Part 2: Rethrow with exception stack in for-of loop. r=iain
[gecko.git] / intl / lwbrk / Segmenter.h
bloba3233dc8ed9115884189577b26685419e588891a
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
5 * You can obtain one at http://mozilla.org/MPL/2.0/. */
7 /* Classes to iterate over grapheme, word, sentence, or line. */
9 #ifndef intl_components_Segmenter_h_
10 #define intl_components_Segmenter_h_
12 #include "mozilla/intl/ICUError.h"
13 #include "mozilla/Maybe.h"
14 #include "mozilla/Result.h"
15 #include "mozilla/Span.h"
16 #include "mozilla/UniquePtr.h"
// Forward declarations of the ICU4X C API segmenter and break-iterator types.
// The iterator classes in this header only hold pointers to these types, so
// the declarations are sufficient here.
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
namespace capi {
struct ICU4XLineSegmenter;
struct ICU4XLineBreakIteratorUtf16;
struct ICU4XWordSegmenter;
struct ICU4XWordBreakIteratorUtf16;
struct ICU4XGraphemeClusterSegmenter;
struct ICU4XGraphemeClusterBreakIteratorUtf16;
struct ICU4XSentenceSegmenter;
struct ICU4XSentenceBreakIteratorUtf16;
}  // namespace capi
#endif
31 namespace mozilla::intl {
// The unit of text a segment iterator breaks on: grapheme, word, sentence, or
// line.
enum class SegmenterGranularity : uint8_t {
  Grapheme,
  Word,
  Sentence,
  Line,
};
40 struct SegmenterOptions final {
41 SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme;
44 /**
45 * Interface of segment iterators. Subclass this class to implement iterator for
46 * UTF-16 text.
48 class SegmentIteratorUtf16 {
49 public:
50 virtual ~SegmentIteratorUtf16() = default;
52 // Disable copy or move semantics. Move semantic could be enabled in the
53 // future if needed.
54 SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete;
55 SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete;
56 SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete;
57 SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete;
59 /**
60 * Advance the iterator to the next break position.
62 * @return the break position. If there's no further break position, return
63 * Nothing().
65 virtual Maybe<uint32_t> Next() = 0;
67 /**
68 * Advance the iterator to the first break position following the specified
69 * position aPos.
71 * Note: if this iterator's current position is already >= aPos, this method
72 * behaves the same as Next().
74 virtual Maybe<uint32_t> Seek(uint32_t aPos);
76 protected:
77 explicit SegmentIteratorUtf16(Span<const char16_t> aText);
79 // The text to iterate over.
80 Span<const char16_t> mText;
82 // The current break position within mText.
83 uint32_t mPos = 0;
// Each enum value has the same meaning with respect to the `word-break`
// property values in the CSS Text spec. See the details in
// https://drafts.csswg.org/css-text-3/#word-break-property
enum class WordBreakRule : uint8_t {
  Normal = 0,
  BreakAll,
  KeepAll,
};
// Each enum value has the same meaning with respect to the `line-break`
// property values in the CSS Text spec. See the details in
// https://drafts.csswg.org/css-text-3/#line-break-property.
enum class LineBreakRule : uint8_t {
  Auto = 0,
  Loose,
  Normal,
  Strict,
  Anywhere,
};
106 // Extra options for line break iterator.
107 struct LineBreakOptions final {
108 WordBreakRule mWordBreakRule = WordBreakRule::Normal;
109 LineBreakRule mLineBreakRule = LineBreakRule::Auto;
110 bool mScriptIsChineseOrJapanese = false;
114 * Line break iterator for UTF-16 text.
116 class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
117 public:
118 explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
119 const LineBreakOptions& aOptions = {});
120 ~LineBreakIteratorUtf16() override;
122 Maybe<uint32_t> Next() override;
123 Maybe<uint32_t> Seek(uint32_t aPos) override;
125 private:
126 LineBreakOptions mOptions;
128 #ifdef MOZ_ICU4X
129 capi::ICU4XLineSegmenter* mSegmenter = nullptr;
130 capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr;
131 #endif
135 * Word break iterator for UTF-16 text.
137 class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
138 public:
139 explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
140 ~WordBreakIteratorUtf16() override;
142 Maybe<uint32_t> Next() override;
143 Maybe<uint32_t> Seek(uint32_t aPos) override;
145 #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
146 private:
147 capi::ICU4XWordSegmenter* mSegmenter = nullptr;
148 capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr;
149 #endif
153 * Grapheme cluster break iterator for UTF-16 text.
155 class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
156 public:
157 explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
158 ~GraphemeClusterBreakIteratorUtf16() override;
160 Maybe<uint32_t> Next() override;
161 Maybe<uint32_t> Seek(uint32_t aPos) override;
163 #if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
164 private:
165 static capi::ICU4XGraphemeClusterSegmenter* sSegmenter;
166 capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr;
167 #endif
171 * Grapheme cluster break reverse iterator for UTF-16 text.
173 * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it
174 * at your own risk.
176 class GraphemeClusterBreakReverseIteratorUtf16 final
177 : public SegmentIteratorUtf16 {
178 public:
179 explicit GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText);
181 Maybe<uint32_t> Next() override;
182 Maybe<uint32_t> Seek(uint32_t aPos) override;
// The sentence break iterator is only declared when ICU4X and the JS Intl API
// are both enabled.
#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API)
/**
 * Sentence break iterator for UTF-16 text.
 */
class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
 public:
  explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText);
  ~SentenceBreakIteratorUtf16() override;

  Maybe<uint32_t> Next() override;
  Maybe<uint32_t> Seek(uint32_t aPos) override;

 private:
  capi::ICU4XSentenceSegmenter* mSegmenter = nullptr;
  capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr;
};
#endif
204 * This component is a Mozilla-focused API for working with segmenters in
205 * internationalization code.
207 * This is a factor class. Calling Segment() to create an iterator over a text
208 * of given granularity.
210 class Segmenter final {
211 public:
212 // NOTE: aLocale is a no-op currently.
213 static Result<UniquePtr<Segmenter>, ICUError> TryCreate(
214 Span<const char> aLocale, const SegmenterOptions& aOptions);
216 explicit Segmenter(Span<const char> aLocale, const SegmenterOptions& aOptions)
217 : mOptions(aOptions) {}
219 // Creates an iterator over aText of a given granularity in mOptions.
220 UniquePtr<SegmentIteratorUtf16> Segment(Span<const char16_t> aText) const;
222 // TODO: Implement an iterator for Latin1 text.
223 // UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const;
225 private:
226 SegmenterOptions mOptions;
229 } // namespace mozilla::intl
231 #endif