1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #include "nsLineBreaker.h"
8 #include "nsContentUtils.h"
9 #include "gfxTextRun.h" // for the gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_* values
10 #include "nsHyphenationManager.h"
11 #include "nsHyphenator.h"
12 #include "mozilla/AutoRestore.h"
13 #include "mozilla/gfx/2D.h"
14 #include "mozilla/intl/LineBreaker.h" // for LineBreaker::ComputeBreakPositions
15 #include "mozilla/intl/Locale.h"
16 #include "mozilla/intl/UnicodeProperties.h"
18 using mozilla::AutoRestore
;
19 using mozilla::intl::LineBreaker
;
20 using mozilla::intl::LineBreakRule
;
21 using mozilla::intl::Locale
;
22 using mozilla::intl::LocaleParser
;
23 using mozilla::intl::UnicodeProperties
;
24 using mozilla::intl::WordBreakRule
;
// Default constructor. Starts with no current-word language, every state flag
// listed below cleared, and the rules WordBreakRule::Normal and
// LineBreakRule::Auto.
// NOTE(review): the embedded numbering jumps from 31 to 33, so one initializer
// may be missing from this listing -- confirm against the canonical file.
26 nsLineBreaker::nsLineBreaker()
27 : mCurrentWordLanguage(nullptr),
28 mCurrentWordContainsMixedLang(false),
29 mCurrentWordContainsComplexChar(false),
30 mScriptIsChineseOrJapanese(false),
31 mAfterBreakableSpace(false),
33 mWordBreak(WordBreakRule::Normal
),
34 mLineBreak(LineBreakRule::Auto
),
35 mWordContinuation(false) {}
// Destructor. Asserts (NS_ASSERTION) that no partial word is still buffered in
// mCurrentWord; the assertion message says Reset() should have been called
// before destruction.
37 nsLineBreaker::~nsLineBreaker() {
38 NS_ASSERTION(mCurrentWord
.Length() == 0,
39 "Should have Reset() before destruction!");
// Marks which UTF-16 code units of a word should be capitalized.
//
//   aWord           - UTF-16 text of the word.
//   aLength         - number of code units in aWord.
//   aCapitalization - output array indexed by code-unit position; entries are
//                     only ever set to true here, so the caller is expected to
//                     have cleared it (the callers in this file memset it).
//
// A letter or number is flagged when capitalizeNextChar is set, which then
// clears the flag. Separators, dash and initial punctuation set the flag
// again; other punctuation sets it unless the character is the ASCII
// apostrophe or U+00B7.
// NOTE(review): the embedded numbering shows gaps (53, 65-66, 74, 79, 81, 85,
// 87-88, 94-104), so the switch header, the break statements, a comment
// terminator and the end of the loop are not visible in this listing.
42 static void SetupCapitalization(const char16_t
* aWord
, uint32_t aLength
,
43 bool* aCapitalization
) {
44 // Capitalize the first alphanumeric character after a space or punctuation.
45 using mozilla::intl::GeneralCategory
;
// Starts true so the first alphanumeric character of the word is flagged.
46 bool capitalizeNextChar
= true;
47 for (uint32_t i
= 0; i
< aLength
; ++i
) {
48 uint32_t ch
= aWord
[i
];
// Combine a surrogate pair into one code point before classifying it.
49 if (i
+ 1 < aLength
&& NS_IS_SURROGATE_PAIR(ch
, aWord
[i
+ 1])) {
50 ch
= SURROGATE_TO_UCS4(ch
, aWord
[i
+ 1]);
52 auto category
= UnicodeProperties::CharType(ch
);
// Letters and numbers: flag this position if a capital is pending.
54 case GeneralCategory::Uppercase_Letter
:
55 case GeneralCategory::Lowercase_Letter
:
56 case GeneralCategory::Titlecase_Letter
:
57 case GeneralCategory::Modifier_Letter
:
58 case GeneralCategory::Other_Letter
:
59 case GeneralCategory::Decimal_Number
:
60 case GeneralCategory::Letter_Number
:
61 case GeneralCategory::Other_Number
:
62 if (capitalizeNextChar
) {
63 aCapitalization
[i
] = true;
64 capitalizeNextChar
= false;
// Separators, dashes and opening quotes: the next alphanumeric is capitalized.
67 case GeneralCategory::Space_Separator
:
68 case GeneralCategory::Line_Separator
:
69 case GeneralCategory::Paragraph_Separator
:
70 case GeneralCategory::Dash_Punctuation
:
71 case GeneralCategory::Initial_Punctuation
:
72 /* These punctuation categories are excluded, for examples like
73 * "what colo[u]r" -> "What Colo[u]r?" (rather than "What Colo[U]R?")
75 * "snake_case" -> "Snake_case" (to match word selection behavior)
76 case GeneralCategory::Open_Punctuation:
77 case GeneralCategory::Close_Punctuation:
78 case GeneralCategory::Connector_Punctuation:
80 capitalizeNextChar
= true;
82 case GeneralCategory::Final_Punctuation
:
83 /* Special-case: exclude Unicode single-close-quote/apostrophe,
84 for examples like "Lowe’s" etc. */
86 capitalizeNextChar
= true;
89 case GeneralCategory::Other_Punctuation
:
90 /* Special-case: exclude ASCII apostrophe, for "Lowe's" etc.,
91 and MIDDLE DOT, for Catalan "l·l". */
92 if (ch
!= '\'' && ch
!= 0x00B7) {
93 capitalizeNextChar
= true;
// Computes break (and, when requested, capitalization) data for the word
// buffered in mCurrentWord, hands each contributing TextItem's slice to its
// sink, then clears the per-word state.
//
// Returns NS_ERROR_OUT_OF_MEMORY when a fallible array append fails.
// NOTE(review): the embedded numbering shows gaps throughout this function
// (closing braces, some guards, local declarations and the final return are
// not visible) -- confirm control flow against the canonical file.
105 nsresult
nsLineBreaker::FlushCurrentWord() {
106 uint32_t length
= mCurrentWord
.Length();
// One break-state byte per UTF-16 code unit of the buffered word.
107 AutoTArray
<uint8_t, 4000> breakState
;
108 if (!breakState
.AppendElements(length
, mozilla::fallible
)) {
109 return NS_ERROR_OUT_OF_MEMORY
;
// line-break: anywhere -> every position gets the normal break flag.
112 if (mLineBreak
== LineBreakRule::Anywhere
) {
113 memset(breakState
.Elements(),
114 gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL
,
115 length
* sizeof(uint8_t));
// No complex characters: fill uniformly, NORMAL for BreakAll, NONE otherwise.
116 } else if (!mCurrentWordContainsComplexChar
) {
117 // For break-strict set everything internal to "break", otherwise
119 memset(breakState
.Elements(),
120 mWordBreak
== WordBreakRule::BreakAll
121 ? gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL
122 : gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NONE
,
123 length
* sizeof(uint8_t));
// Remaining case: let LineBreaker compute the positions for the whole word.
125 LineBreaker::ComputeBreakPositions(
126 mCurrentWord
.Elements(), length
, mWordBreak
, mLineBreak
,
127 mScriptIsChineseOrJapanese
, breakState
.Elements());
// Auto-hyphenation needs a single known language for the word...
130 bool autoHyphenate
= mCurrentWordLanguage
&& !mCurrentWordContainsMixedLang
;
// ...and is turned off if any contributing item lacks the auto-hyphenation flag.
132 for (i
= 0; autoHyphenate
&& i
< mTextItems
.Length(); ++i
) {
133 TextItem
* ti
= &mTextItems
[i
];
134 if (!(ti
->mFlags
& BREAK_USE_AUTO_HYPHENATION
)) {
135 autoHyphenate
= false;
// NOTE(review): the conditions guarding the hyphenator lookup and the call
// below are on lines missing from this listing.
139 RefPtr
<nsHyphenator
> hyphenator
=
140 nsHyphenationManager::Instance()->GetHyphenator(mCurrentWordLanguage
);
142 FindHyphenationPoints(hyphenator
, mCurrentWord
.Elements(),
143 mCurrentWord
.Elements() + length
,
144 breakState
.Elements());
// Filled lazily, only if some item asks for capitalization.
148 nsTArray
<bool> capitalizationState
;
// Walk the items that contributed to the word; `offset` is advanced by each
// item's mLength at the bottom of the loop.
150 for (i
= 0; i
< mTextItems
.Length(); ++i
) {
151 TextItem
* ti
= &mTextItems
[i
];
152 NS_ASSERTION(ti
->mLength
> 0, "Zero length word contribution?");
// Suppress a break before the item when it starts at sink offset 0.
154 if ((ti
->mFlags
& BREAK_SUPPRESS_INITIAL
) && ti
->mSinkOffset
== 0) {
155 breakState
[offset
] = gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NONE
;
// Suppress breaks inside the item; its first position is left untouched when
// the item starts at sink offset 0.
157 if (ti
->mFlags
& BREAK_SUPPRESS_INSIDE
) {
158 uint32_t exclude
= ti
->mSinkOffset
== 0 ? 1 : 0;
159 memset(breakState
.Elements() + offset
+ exclude
,
160 gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NONE
,
161 (ti
->mLength
- exclude
) * sizeof(uint8_t));
164 // Don't set the break state for the first character of the word, because
165 // it was already set correctly earlier and we don't know what the true
167 uint32_t skipSet
= i
== 0 ? 1 : 0;
169 ti
->mSink
->SetBreaks(ti
->mSinkOffset
+ skipSet
, ti
->mLength
- skipSet
,
170 breakState
.Elements() + offset
+ skipSet
);
// Capitalization is skipped for word continuations; the state array is
// built once for the whole word on first use.
172 if (!mWordContinuation
&& (ti
->mFlags
& BREAK_NEED_CAPITALIZATION
)) {
173 if (capitalizationState
.Length() == 0) {
174 if (!capitalizationState
.AppendElements(length
, mozilla::fallible
)) {
175 return NS_ERROR_OUT_OF_MEMORY
;
177 memset(capitalizationState
.Elements(), false, length
* sizeof(bool));
178 SetupCapitalization(mCurrentWord
.Elements(), length
,
179 capitalizationState
.Elements());
181 ti
->mSink
->SetCapitalization(ti
->mSinkOffset
, ti
->mLength
,
182 capitalizationState
.Elements() + offset
);
186 offset
+= ti
->mLength
;
// Reset per-word state so the next word starts clean.
189 mCurrentWord
.Clear();
191 mCurrentWordContainsComplexChar
= false;
192 mCurrentWordContainsMixedLang
= false;
193 mCurrentWordLanguage
= nullptr;
194 mWordContinuation
= false;
198 // If the aFlags parameter to AppendText has all these bits set,
199 // then we don't need to worry about finding break opportunities
200 // in the appended text.
// Both AppendText overloads additionally require that neither mBreakHere nor
// mAfterBreakableSpace is set (or that there is no sink at all) before they
// treat a chunk as needing no break computation.
201 #define NO_BREAKS_NEEDED_FLAGS \
202 (BREAK_SUPPRESS_INITIAL | BREAK_SUPPRESS_INSIDE | \
203 BREAK_SKIP_SETTING_NO_BREAKS)
// Appends a chunk of UTF-16 text to the line breaker.
//
//   aHyphenationLanguage - language used for hyphenator lookup and passed to
//                          UpdateCurrentWordLanguage; tested for null below.
//   aText, aLength       - the text; aLength is asserted to be > 0.
//   aFlags               - BREAK_* bit flags.
//   aSink                - receives SetBreaks / SetCapitalization; tested for
//                          null below.
//
// Visible flow: first extend any word left pending in mCurrentWord up to the
// next space and flush it if a space was found; then scan the rest of the
// chunk word by word filling breakState / capitalizationState; a word that
// reaches the end of the chunk is saved into mCurrentWord and mTextItems;
// finally the collected state for [start, offset) is handed to aSink.
// Returns NS_ERROR_OUT_OF_MEMORY when an array append fails.
// NOTE(review): the embedded numbering shows gaps (closing braces, the loop
// header, several guards, increments and returns are not visible) -- confirm
// control flow against the canonical file.
205 nsresult
nsLineBreaker::AppendText(nsAtom
* aHyphenationLanguage
,
206 const char16_t
* aText
, uint32_t aLength
,
207 uint32_t aFlags
, nsILineBreakSink
* aSink
) {
208 NS_ASSERTION(aLength
> 0, "Appending empty text...");
212 // Continue the current word
213 if (mCurrentWord
.Length() > 0) {
214 NS_ASSERTION(!mAfterBreakableSpace
&& !mBreakHere
,
215 "These should not be set");
// Copy non-space characters onto the pending word, tracking complex
// characters and the word's language as we go.
217 while (offset
< aLength
&& !IsSpace(aText
[offset
])) {
218 mCurrentWord
.AppendElement(aText
[offset
]);
219 if (!mCurrentWordContainsComplexChar
&& IsComplexChar(aText
[offset
])) {
220 mCurrentWordContainsComplexChar
= true;
222 UpdateCurrentWordLanguage(aHyphenationLanguage
);
// Record this chunk's contribution: sink offset 0, `offset` code units.
227 mTextItems
.AppendElement(TextItem(aSink
, 0, offset
, aFlags
));
230 if (offset
== aLength
) {
234 // We encountered whitespace, so we're done with this word
235 nsresult rv
= FlushCurrentWord();
// One break-state byte per code unit of this chunk.
241 AutoTArray
<uint8_t, 4000> breakState
;
243 if (!breakState
.AppendElements(aLength
, mozilla::fallible
)) {
244 return NS_ERROR_OUT_OF_MEMORY
;
// Capitalization state is only allocated when there is a sink and the
// caller asked for it.
248 bool noCapitalizationNeeded
= true;
249 nsTArray
<bool> capitalizationState
;
250 if (aSink
&& (aFlags
& BREAK_NEED_CAPITALIZATION
)) {
251 if (!capitalizationState
.AppendElements(aLength
, mozilla::fallible
)) {
252 return NS_ERROR_OUT_OF_MEMORY
;
254 memset(capitalizationState
.Elements(), false, aLength
* sizeof(bool));
255 noCapitalizationNeeded
= false;
258 uint32_t start
= offset
;
// True when there is no sink, or when all NO_BREAKS_NEEDED_FLAGS are set and
// no break is pending from earlier text.
259 bool noBreaksNeeded
=
260 !aSink
|| ((aFlags
& NO_BREAKS_NEEDED_FLAGS
) == NO_BREAKS_NEEDED_FLAGS
&&
261 !mBreakHere
&& !mAfterBreakableSpace
);
262 if (noBreaksNeeded
&& noCapitalizationNeeded
) {
263 // Skip to the space before the last word, since either the break data
264 // here is not needed, or no breaks are set in the sink and there cannot
265 // be any breaks in this chunk; and we don't need to do word-initial
266 // capitalization. All we need is the context for the next chunk (if any).
268 while (offset
> start
) {
270 if (IsSpace(aText
[offset
])) {
275 uint32_t wordStart
= offset
;
276 bool wordHasComplexChar
= false;
// A hyphenator is looked up only when auto-hyphenation is requested, breaks
// inside the text are not suppressed, and a language was supplied.
278 RefPtr
<nsHyphenator
> hyphenator
;
279 if ((aFlags
& BREAK_USE_AUTO_HYPHENATION
) &&
280 !(aFlags
& BREAK_SUPPRESS_INSIDE
) && aHyphenationLanguage
) {
282 nsHyphenationManager::Instance()->GetHyphenator(aHyphenationLanguage
);
// Per-character scan (the loop header is not visible in this listing).
286 char16_t ch
= aText
[offset
];
287 bool isSpace
= IsSpace(ch
);
288 bool isBreakableSpace
= isSpace
&& !(aFlags
& BREAK_SUPPRESS_INSIDE
);
// Break-before state for this character: NORMAL if a break is pending, if
// we just left a breakable space, or if the rules are BreakAll / Anywhere.
290 if (aSink
&& !noBreaksNeeded
) {
292 mBreakHere
|| (mAfterBreakableSpace
&& !isBreakableSpace
) ||
293 mWordBreak
== WordBreakRule::BreakAll
||
294 mLineBreak
== LineBreakRule::Anywhere
295 ? gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL
296 : gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NONE
;
299 mAfterBreakableSpace
= isBreakableSpace
;
// A space or newline ends the word [wordStart, offset).
301 if (isSpace
|| ch
== '\n') {
302 if (offset
> wordStart
&& aSink
) {
303 if (!(aFlags
& BREAK_SUPPRESS_INSIDE
)) {
304 if (mLineBreak
== LineBreakRule::Anywhere
) {
305 memset(breakState
.Elements() + wordStart
,
306 gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL
,
308 } else if (wordHasComplexChar
) {
309 // Save current start-of-word state because ComputeBreakPositions()
310 // will set it to false.
311 AutoRestore
<uint8_t> saveWordStartBreakState(breakState
[wordStart
]);
312 LineBreaker::ComputeBreakPositions(
313 aText
+ wordStart
, offset
- wordStart
, mWordBreak
, mLineBreak
,
314 mScriptIsChineseOrJapanese
, breakState
.Elements() + wordStart
);
// NOTE(review): the guard for this call is on a line missing from the listing.
317 FindHyphenationPoints(hyphenator
, aText
+ wordStart
, aText
+ offset
,
318 breakState
.Elements() + wordStart
);
// Capitalize the completed word unless it continues an earlier word.
321 if (!mWordContinuation
&& !noCapitalizationNeeded
) {
322 SetupCapitalization(aText
+ wordStart
, offset
- wordStart
,
323 capitalizationState
.Elements() + wordStart
);
326 wordHasComplexChar
= false;
327 mWordContinuation
= false;
329 if (offset
>= aLength
) {
334 if (!wordHasComplexChar
&& IsComplexChar(ch
)) {
335 wordHasComplexChar
= true;
// The chunk ended mid-word: stash the partial word and its TextItem so a
// later call can complete it.
338 if (offset
>= aLength
) {
340 mCurrentWordContainsComplexChar
= wordHasComplexChar
;
341 uint32_t len
= offset
- wordStart
;
342 char16_t
* elems
= mCurrentWord
.AppendElements(len
);
344 return NS_ERROR_OUT_OF_MEMORY
;
346 memcpy(elems
, aText
+ wordStart
, sizeof(char16_t
) * len
);
347 mTextItems
.AppendElement(TextItem(aSink
, wordStart
, len
, aFlags
));
348 // Ensure that the break-before for this word is written out
349 offset
= wordStart
+ 1;
350 UpdateCurrentWordLanguage(aHyphenationLanguage
);
// Hand the collected state for [start, offset) to the sink.
357 if (!noBreaksNeeded
) {
358 aSink
->SetBreaks(start
, offset
- start
, breakState
.Elements() + start
);
360 if (!noCapitalizationNeeded
) {
361 aSink
->SetCapitalization(start
, offset
- start
,
362 capitalizationState
.Elements() + start
);
// Asks aHyphenator for hyphenation points in [aTextStart, aTextLimit) and
// records them in aBreakState using FLAG_BREAK_TYPE_HYPHEN.
//
//   aHyphenator            - dereferenced unconditionally, so it must be
//                            non-null.
//   aTextStart, aTextLimit - bounds of the word.
//   aBreakState            - break-state array for the word.
//
// Nothing is written when Hyphenate() fails.
// NOTE(review): the loop body (numbered 376-377 in the original) is missing
// from this listing, so the exact test and the index written are not visible.
368 void nsLineBreaker::FindHyphenationPoints(nsHyphenator
* aHyphenator
,
369 const char16_t
* aTextStart
,
370 const char16_t
* aTextLimit
,
371 uint8_t* aBreakState
) {
// Non-owning view over the caller's text.
372 nsDependentSubstring
string(aTextStart
, aTextLimit
);
373 AutoTArray
<bool, 200> hyphens
;
374 if (NS_SUCCEEDED(aHyphenator
->Hyphenate(string
, hyphens
))) {
// The loop stops one short of the string length (i + 1 < Length()).
375 for (uint32_t i
= 0; i
+ 1 < string
.Length(); ++i
) {
378 gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_HYPHEN
;
// Appends a chunk of 8-bit text to the line breaker.
//
// Parameters mirror the char16_t overload. When capitalization or
// auto-hyphenation is requested, the text is converted with CopyASCIItoUTF16
// and passed to the char16_t overload. Otherwise the visible flow follows the
// same shape as that overload, using IsComplexASCIIChar and without any
// capitalization or hyphenation work.
// Returns NS_ERROR_OUT_OF_MEMORY when an array append fails.
// NOTE(review): the embedded numbering shows gaps (closing braces, the loop
// header, several guards, increments and returns are not visible) -- confirm
// control flow against the canonical file.
384 nsresult
nsLineBreaker::AppendText(nsAtom
* aHyphenationLanguage
,
385 const uint8_t* aText
, uint32_t aLength
,
386 uint32_t aFlags
, nsILineBreakSink
* aSink
) {
387 NS_ASSERTION(aLength
> 0, "Appending empty text...");
389 if (aFlags
& (BREAK_NEED_CAPITALIZATION
| BREAK_USE_AUTO_HYPHENATION
)) {
390 // Defer to the Unicode path if capitalization or hyphenation is required
392 const char* cp
= reinterpret_cast<const char*>(aText
);
393 CopyASCIItoUTF16(nsDependentCSubstring(cp
, cp
+ aLength
), str
);
394 return AppendText(aHyphenationLanguage
, str
.get(), aLength
, aFlags
, aSink
);
399 // Continue the current word
400 if (mCurrentWord
.Length() > 0) {
401 NS_ASSERTION(!mAfterBreakableSpace
&& !mBreakHere
,
402 "These should not be set");
// Copy non-space characters onto the pending word, tracking complex characters.
404 while (offset
< aLength
&& !IsSpace(aText
[offset
])) {
405 mCurrentWord
.AppendElement(aText
[offset
]);
406 if (!mCurrentWordContainsComplexChar
&&
407 IsComplexASCIIChar(aText
[offset
])) {
408 mCurrentWordContainsComplexChar
= true;
// Record this chunk's contribution: sink offset 0, `offset` characters.
414 mTextItems
.AppendElement(TextItem(aSink
, 0, offset
, aFlags
));
417 if (offset
== aLength
) {
418 // We did not encounter whitespace so the word hasn't finished yet.
422 // We encountered whitespace, so we're done with this word
423 nsresult rv
= FlushCurrentWord();
// One break-state byte per character of this chunk.
429 AutoTArray
<uint8_t, 4000> breakState
;
431 if (!breakState
.AppendElements(aLength
, mozilla::fallible
)) {
432 return NS_ERROR_OUT_OF_MEMORY
;
436 uint32_t start
= offset
;
// True when there is no sink, or when all NO_BREAKS_NEEDED_FLAGS are set and
// no break is pending from earlier text.
437 bool noBreaksNeeded
=
438 !aSink
|| ((aFlags
& NO_BREAKS_NEEDED_FLAGS
) == NO_BREAKS_NEEDED_FLAGS
&&
439 !mBreakHere
&& !mAfterBreakableSpace
);
440 if (noBreaksNeeded
) {
441 // Skip to the space before the last word, since either the break data
442 // here is not needed, or no breaks are set in the sink and there cannot
443 // be any breaks in this chunk; all we need is the context for the next
446 while (offset
> start
) {
448 if (IsSpace(aText
[offset
])) {
453 uint32_t wordStart
= offset
;
454 bool wordHasComplexChar
= false;
// Per-character scan (the loop header is not visible in this listing).
457 uint8_t ch
= aText
[offset
];
458 bool isSpace
= IsSpace(ch
);
459 bool isBreakableSpace
= isSpace
&& !(aFlags
& BREAK_SUPPRESS_INSIDE
);
462 // Consider word-break style. Since the break position of CJK scripts
463 // will be set by nsILineBreaker, we don't consider CJK at this point.
465 mBreakHere
|| (mAfterBreakableSpace
&& !isBreakableSpace
) ||
466 mWordBreak
== WordBreakRule::BreakAll
||
467 mLineBreak
== LineBreakRule::Anywhere
468 ? gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL
469 : gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NONE
;
472 mAfterBreakableSpace
= isBreakableSpace
;
// Finalize breaks inside the completed word [wordStart, offset).
475 if (offset
> wordStart
&& aSink
&& !(aFlags
& BREAK_SUPPRESS_INSIDE
)) {
476 if (mLineBreak
== LineBreakRule::Anywhere
) {
477 memset(breakState
.Elements() + wordStart
,
478 gfxTextRun::CompressedGlyph::FLAG_BREAK_TYPE_NORMAL
,
480 } else if (wordHasComplexChar
) {
481 // Save current start-of-word state because ComputeBreakPositions()
482 // will set it to false.
483 AutoRestore
<uint8_t> saveWordStartBreakState(breakState
[wordStart
]);
484 LineBreaker::ComputeBreakPositions(
485 aText
+ wordStart
, offset
- wordStart
, mWordBreak
, mLineBreak
,
486 mScriptIsChineseOrJapanese
, breakState
.Elements() + wordStart
);
490 wordHasComplexChar
= false;
491 mWordContinuation
= false;
493 if (offset
>= aLength
) {
498 if (!wordHasComplexChar
&& IsComplexASCIIChar(ch
)) {
499 wordHasComplexChar
= true;
// The chunk ended mid-word: stash the partial word and its TextItem so a
// later call can complete it.
502 if (offset
>= aLength
) {
504 mCurrentWordContainsComplexChar
= wordHasComplexChar
;
505 uint32_t len
= offset
- wordStart
;
506 char16_t
* elems
= mCurrentWord
.AppendElements(len
);
508 return NS_ERROR_OUT_OF_MEMORY
;
// Element-wise copy: each 8-bit source character is assigned to a char16_t slot.
511 for (i
= wordStart
; i
< offset
; ++i
) {
512 elems
[i
- wordStart
] = aText
[i
];
514 mTextItems
.AppendElement(TextItem(aSink
, wordStart
, len
, aFlags
));
515 // Ensure that the break-before for this word is written out
516 offset
= wordStart
+ 1;
// Hand the collected break state for [start, offset) to the sink.
522 if (!noBreaksNeeded
) {
523 aSink
->SetBreaks(start
, offset
- start
, breakState
.Elements() + start
);
// Tracks the language of the word being accumulated.
//
// If a language is already recorded and aHyphenationLanguage differs from it,
// the word is marked as mixed-language and mScriptIsChineseOrJapanese is
// cleared. If no language is recorded yet and aHyphenationLanguage is
// non-null, the language tag is parsed and mScriptIsChineseOrJapanese is set
// when the script is Hans, Hant, Jpan or Hrkt.
// NOTE(review): the embedded numbering shows gaps (532, 534-535, 537, 539-540,
// 542-543, 547), so the else/closing braces, the locale declaration and the
// bodies of the two error checks are not visible in this listing.
528 void nsLineBreaker::UpdateCurrentWordLanguage(nsAtom
* aHyphenationLanguage
) {
529 if (mCurrentWordLanguage
&& mCurrentWordLanguage
!= aHyphenationLanguage
) {
530 mCurrentWordContainsMixedLang
= true;
531 mScriptIsChineseOrJapanese
= false;
533 if (aHyphenationLanguage
&& !mCurrentWordLanguage
) {
// Parse the language atom's string form into a locale object.
536 LocaleParser::TryParse(nsAtomCString(aHyphenationLanguage
), loc
);
538 if (result
.isErr()) {
// When the tag has no script subtag, try to fill one in via likely subtags.
541 if (loc
.Script().Missing() && loc
.AddLikelySubtags().isErr()) {
544 mScriptIsChineseOrJapanese
=
545 loc
.Script().EqualTo("Hans") || loc
.Script().EqualTo("Hant") ||
546 loc
.Script().EqualTo("Jpan") || loc
.Script().EqualTo("Hrkt");
548 mCurrentWordLanguage
= aHyphenationLanguage
;
// Handles whitespace that is not part of any appended text: flushes the
// pending word, then updates the breakable-space state from aFlags.
//
//   aFlags - BREAK_* bit flags; only BREAK_SUPPRESS_INSIDE is consulted in the
//            visible lines.
//
// NOTE(review): the embedded numbering shows gaps (554-557, 560-561, 564-566),
// so the check of rv, the body of the `if` below and the return statement are
// not visible in this listing.
552 nsresult
nsLineBreaker::AppendInvisibleWhitespace(uint32_t aFlags
) {
553 nsresult rv
= FlushCurrentWord();
// The whitespace counts as breakable unless breaks inside are suppressed.
558 bool isBreakableSpace
= !(aFlags
& BREAK_SUPPRESS_INSIDE
);
559 if (mAfterBreakableSpace
&& !isBreakableSpace
) {
562 mAfterBreakableSpace
= isBreakableSpace
;
563 mWordContinuation
= false;
567 nsresult
nsLineBreaker::Reset(bool* aTrailingBreak
) {
568 nsresult rv
= FlushCurrentWord();
573 *aTrailingBreak
= mBreakHere
|| mAfterBreakableSpace
;
575 mAfterBreakableSpace
= false;