1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
5 use icu_segmenter::LineBreakOptions;
6 use icu_segmenter::LineBreakStrictness;
7 use icu_segmenter::LineBreakWordOption;
11 use crate::errors::ffi::ICU4XError;
12 use crate::provider::ffi::ICU4XDataProvider;
13 use alloc::boxed::Box;
14 use core::convert::TryFrom;
16 LineBreakIteratorLatin1, LineBreakIteratorPotentiallyIllFormedUtf8, LineBreakIteratorUtf16,
21 /// An ICU4X line-break segmenter, capable of finding breakpoints in strings.
22 #[diplomat::rust_link(icu::segmenter::LineSegmenter, Struct)]
// Newtype over the Rust `LineSegmenter`; the `segment_*` methods forward to the inner value.
23 pub struct ICU4XLineSegmenter(LineSegmenter);
25 #[diplomat::rust_link(icu::segmenter::LineBreakStrictness, Enum)]
// Each variant converts to the same-named `LineBreakStrictness` variant
// (Loose, Normal, Strict, Anywhere) through the `From` impl further down this file.
26 pub enum ICU4XLineBreakStrictness {
33 #[diplomat::rust_link(icu::segmenter::LineBreakWordOption, Enum)]
// Each variant converts to the same-named `LineBreakWordOption` variant
// (Normal, BreakAll, KeepAll) through the `From` impl further down this file.
34 pub enum ICU4XLineBreakWordOption {
40 #[diplomat::rust_link(icu::segmenter::LineBreakOptions, Struct)]
41 #[diplomat::attr(dart, rename = "LineBreakOptions")]
// Options bundle accepted by the `*_with_options_v1` constructors; converted to
// `LineBreakOptions` by the `From` impl further down this file.
42 pub struct ICU4XLineBreakOptionsV1 {
// Converted with `.into()` to `LineBreakStrictness`.
43 pub strictness: ICU4XLineBreakStrictness,
// Converted with `.into()` to `LineBreakWordOption`.
44 pub word_option: ICU4XLineBreakWordOption,
49 #[diplomat::rust_link(icu::segmenter::LineBreakIterator, Struct)]
50 #[diplomat::rust_link(
51 icu::segmenter::LineBreakIteratorPotentiallyIllFormedUtf8,
55 #[diplomat::rust_link(icu::segmenter::LineBreakIteratorUtf8, Typedef, hidden)]
// Breakpoint iterator returned by `ICU4XLineSegmenter::segment_utf8`; a newtype over
// `LineBreakIteratorPotentiallyIllFormedUtf8` with both lifetime parameters tied to `'a`.
56 pub struct ICU4XLineBreakIteratorUtf8<'a>(LineBreakIteratorPotentiallyIllFormedUtf8<'a, 'a>);
59 #[diplomat::rust_link(icu::segmenter::LineBreakIterator, Struct)]
60 #[diplomat::rust_link(icu::segmenter::LineBreakIteratorUtf16, Typedef, compact)]
// Breakpoint iterator returned by `ICU4XLineSegmenter::segment_utf16`; a newtype over
// `LineBreakIteratorUtf16` with both lifetime parameters tied to `'a`.
61 pub struct ICU4XLineBreakIteratorUtf16<'a>(LineBreakIteratorUtf16<'a, 'a>);
64 #[diplomat::rust_link(icu::segmenter::LineBreakIterator, Struct)]
65 #[diplomat::rust_link(icu::segmenter::LineBreakIteratorLatin1, Typedef, compact)]
// Breakpoint iterator returned by `ICU4XLineSegmenter::segment_latin1`; a newtype over
// `LineBreakIteratorLatin1` with both lifetime parameters tied to `'a`.
66 pub struct ICU4XLineBreakIteratorLatin1<'a>(LineBreakIteratorLatin1<'a, 'a>);
68 impl ICU4XLineSegmenter {
69 /// Construct an [`ICU4XLineSegmenter`] with default options. It automatically loads the best
70 /// available payload data for Burmese, Khmer, Lao, and Thai.
71 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)]
72 #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "auto")]
74 provider: &ICU4XDataProvider,
75 ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
// `call_constructor!` receives `new_auto` (its result adapted with `r => Ok(r)`) together with
// the any-provider and buffer-provider constructors.
// NOTE(review): the macro is defined elsewhere; presumably it picks one of the three based on
// what `provider` holds. Confirm against the macro definition.
76 Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
77 LineSegmenter::new_auto [r => Ok(r)],
78 LineSegmenter::try_new_auto_with_any_provider,
79 LineSegmenter::try_new_auto_with_buffer_provider,
84 /// Construct an [`ICU4XLineSegmenter`] with default options and LSTM payload data for
85 /// Burmese, Khmer, Lao, and Thai.
86 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)]
87 #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm")]
89 provider: &ICU4XDataProvider,
90 ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
// `call_constructor!` receives `new_lstm` (its result adapted with `r => Ok(r)`) together with
// the any-provider and buffer-provider constructors.
// NOTE(review): the macro is defined elsewhere; presumably it picks one of the three based on
// what `provider` holds. Confirm against the macro definition.
91 Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
92 LineSegmenter::new_lstm [r => Ok(r)],
93 LineSegmenter::try_new_lstm_with_any_provider,
94 LineSegmenter::try_new_lstm_with_buffer_provider,
99 /// Construct an [`ICU4XLineSegmenter`] with default options and dictionary payload data for
100 /// Burmese, Khmer, Lao, and Thai.
101 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)]
102 #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary")]
103 pub fn create_dictionary(
104 provider: &ICU4XDataProvider,
105 ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
// `call_constructor!` receives `new_dictionary` (its result adapted with `r => Ok(r)`) together
// with the any-provider and buffer-provider constructors.
// NOTE(review): the macro is defined elsewhere; presumably it picks one of the three based on
// what `provider` holds. Confirm against the macro definition.
106 Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
107 LineSegmenter::new_dictionary [r => Ok(r)],
108 LineSegmenter::try_new_dictionary_with_any_provider,
109 LineSegmenter::try_new_dictionary_with_buffer_provider,
114 /// Construct an [`ICU4XLineSegmenter`] with custom options. It automatically loads the best
115 /// available payload data for Burmese, Khmer, Lao, and Thai.
116 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto_with_options, FnInStruct)]
117 #[diplomat::attr(dart, rename = "auto_with_options")]
118 #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_options_v1")]
119 pub fn create_auto_with_options_v1(
120 provider: &ICU4XDataProvider,
121 options: ICU4XLineBreakOptionsV1,
122 ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
// `call_constructor!` receives `new_auto_with_options` (its result adapted with `r => Ok(r)`)
// together with the any-provider and buffer-provider constructors.
// NOTE(review): the macro is defined elsewhere; presumably it picks one of the three based on
// what `provider` holds. Confirm against the macro definition.
123 Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
124 LineSegmenter::new_auto_with_options [r => Ok(r)],
125 LineSegmenter::try_new_auto_with_options_with_any_provider,
126 LineSegmenter::try_new_auto_with_options_with_buffer_provider,
132 /// Construct an [`ICU4XLineSegmenter`] with custom options and LSTM payload data for
133 /// Burmese, Khmer, Lao, and Thai.
134 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm_with_options, FnInStruct)]
135 #[diplomat::attr(dart, rename = "lstm_with_options")]
136 #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_options_v1")]
137 pub fn create_lstm_with_options_v1(
138 provider: &ICU4XDataProvider,
139 options: ICU4XLineBreakOptionsV1,
140 ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
// `call_constructor!` receives `new_lstm_with_options` (its result adapted with `r => Ok(r)`)
// together with the any-provider and buffer-provider constructors.
// NOTE(review): the macro is defined elsewhere; presumably it picks one of the three based on
// what `provider` holds. Confirm against the macro definition.
141 Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
142 LineSegmenter::new_lstm_with_options [r => Ok(r)],
143 LineSegmenter::try_new_lstm_with_options_with_any_provider,
144 LineSegmenter::try_new_lstm_with_options_with_buffer_provider,
150 /// Construct an [`ICU4XLineSegmenter`] with custom options and dictionary payload data for
151 /// Burmese, Khmer, Lao, and Thai.
152 #[diplomat::rust_link(
153 icu::segmenter::LineSegmenter::new_dictionary_with_options,
156 #[diplomat::attr(dart, rename = "dictionary_with_options")]
157 #[diplomat::attr(all(supports = constructors, supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_options_v1")]
158 pub fn create_dictionary_with_options_v1(
159 provider: &ICU4XDataProvider,
160 options: ICU4XLineBreakOptionsV1,
161 ) -> Result<Box<ICU4XLineSegmenter>, ICU4XError> {
// `call_constructor!` receives `new_dictionary_with_options` (its result adapted with
// `r => Ok(r)`) together with the any-provider and buffer-provider constructors.
// NOTE(review): the macro is defined elsewhere; presumably it picks one of the three based on
// what `provider` holds. Confirm against the macro definition.
162 Ok(Box::new(ICU4XLineSegmenter(call_constructor!(
163 LineSegmenter::new_dictionary_with_options [r => Ok(r)],
164 LineSegmenter::try_new_dictionary_with_options_with_any_provider,
165 LineSegmenter::try_new_dictionary_with_options_with_buffer_provider,
171 /// Segments a UTF-8 string.
173 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
174 /// to the WHATWG Encoding Standard.
175 #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_utf8, FnInStruct)]
176 #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_str, FnInStruct, hidden)]
177 #[diplomat::attr(dart, disable)]
178 pub fn segment_utf8<'a>(
180 input: &'a DiplomatStr,
181 ) -> Box<ICU4XLineBreakIteratorUtf8<'a>> {
// Forwards to the wrapped `LineSegmenter` and boxes the iterator; the returned iterator
// carries the same `'a` lifetime as `input`.
182 Box::new(ICU4XLineBreakIteratorUtf8(self.0.segment_utf8(input)))
185 /// Segments a UTF-16 string.
187 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
188 /// to the WHATWG Encoding Standard.
189 #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_utf16, FnInStruct)]
190 #[diplomat::attr(dart, rename = "segment")]
191 pub fn segment_utf16<'a>(
193 input: &'a DiplomatStr16,
194 ) -> Box<ICU4XLineBreakIteratorUtf16<'a>> {
// Forwards to the wrapped `LineSegmenter` and boxes the iterator; the returned iterator
// carries the same `'a` lifetime as `input`.
195 Box::new(ICU4XLineBreakIteratorUtf16(self.0.segment_utf16(input)))
198 /// Segments a Latin-1 string.
199 #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_latin1, FnInStruct)]
200 #[diplomat::attr(dart, disable)]
201 pub fn segment_latin1<'a>(
204 ) -> Box<ICU4XLineBreakIteratorLatin1<'a>> {
// Forwards to the wrapped `LineSegmenter` and boxes the resulting iterator.
205 Box::new(ICU4XLineBreakIteratorLatin1(self.0.segment_latin1(input)))
209 impl<'a> ICU4XLineBreakIteratorUtf8<'a> {
210 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
211 /// out of range of a 32-bit signed integer.
212 #[diplomat::rust_link(icu::segmenter::LineBreakIterator::next, FnInStruct)]
213 #[diplomat::rust_link(
214 icu::segmenter::LineBreakIterator::Item,
215 AssociatedTypeInStruct,
218 pub fn next(&mut self) -> i32 {
// Narrow the index to `i32` with a checked conversion: a value that does not fit becomes
// `None` (via `.ok()`) instead of wrapping.
221 .and_then(|u| i32::try_from(u).ok())
226 impl<'a> ICU4XLineBreakIteratorUtf16<'a> {
227 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
228 /// out of range of a 32-bit signed integer.
229 #[diplomat::rust_link(icu::segmenter::LineBreakIterator::next, FnInStruct)]
230 #[diplomat::rust_link(
231 icu::segmenter::LineBreakIterator::Item,
232 AssociatedTypeInStruct,
235 pub fn next(&mut self) -> i32 {
// Narrow the index to `i32` with a checked conversion: a value that does not fit becomes
// `None` (via `.ok()`) instead of wrapping.
238 .and_then(|u| i32::try_from(u).ok())
243 impl<'a> ICU4XLineBreakIteratorLatin1<'a> {
244 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
245 /// out of range of a 32-bit signed integer.
246 #[diplomat::rust_link(icu::segmenter::LineBreakIterator::next, FnInStruct)]
247 #[diplomat::rust_link(
248 icu::segmenter::LineBreakIterator::Item,
249 AssociatedTypeInStruct,
252 pub fn next(&mut self) -> i32 {
// Narrow the index to `i32` with a checked conversion: a value that does not fit becomes
// `None` (via `.ok()`) instead of wrapping.
255 .and_then(|u| i32::try_from(u).ok())
261 impl From<ffi::ICU4XLineBreakStrictness> for LineBreakStrictness {
// Maps each FFI variant onto the identically named `LineBreakStrictness` variant.
262 fn from(other: ffi::ICU4XLineBreakStrictness) -> Self {
264 ffi::ICU4XLineBreakStrictness::Loose => Self::Loose,
265 ffi::ICU4XLineBreakStrictness::Normal => Self::Normal,
266 ffi::ICU4XLineBreakStrictness::Strict => Self::Strict,
267 ffi::ICU4XLineBreakStrictness::Anywhere => Self::Anywhere,
272 impl From<ffi::ICU4XLineBreakWordOption> for LineBreakWordOption {
// Maps each FFI variant onto the identically named `LineBreakWordOption` variant.
273 fn from(other: ffi::ICU4XLineBreakWordOption) -> Self {
275 ffi::ICU4XLineBreakWordOption::Normal => Self::Normal,
276 ffi::ICU4XLineBreakWordOption::BreakAll => Self::BreakAll,
277 ffi::ICU4XLineBreakWordOption::KeepAll => Self::KeepAll,
282 impl From<ffi::ICU4XLineBreakOptionsV1> for LineBreakOptions {
// Builds `LineBreakOptions` from the FFI options struct.
283 fn from(other: ffi::ICU4XLineBreakOptionsV1) -> Self {
// Start from the defaults and overwrite field by field.
// NOTE(review): presumably `LineBreakOptions` cannot be built with a struct literal here
// (e.g. it is `#[non_exhaustive]`); confirm against the `icu_segmenter` definition.
284 let mut options = LineBreakOptions::default();
285 options.strictness = other.strictness.into();
286 options.word_option = other.word_option.into();
287 options.ja_zh = other.ja_zh;