1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "mozInlineSpellWordUtil.h"
11 #include "mozilla/BinarySearch.h"
12 #include "mozilla/EditorBase.h"
13 #include "mozilla/HTMLEditor.h"
14 #include "mozilla/Logging.h"
15 #include "mozilla/dom/Element.h"
19 #include "nsComponentManagerUtils.h"
20 #include "nsUnicodeProperties.h"
21 #include "nsServiceManagerUtils.h"
22 #include "nsIContent.h"
23 #include "nsTextFragment.h"
25 #include "nsContentUtils.h"
28 using namespace mozilla
;
30 static LazyLogModule sInlineSpellWordUtilLog
{"InlineSpellWordUtil"};
// IsIgnorableCharacter
//
//    These characters are ones that we should ignore in input.

// 8-bit overload: only SOFT HYPHEN (0xAD) is ignorable.
inline bool IsIgnorableCharacter(char ch) {
  return (ch == static_cast<char>(0xAD));  // SOFT HYPHEN
}

// 16-bit overload: SOFT HYPHEN and its Mongolian counterpart.
inline bool IsIgnorableCharacter(char16_t ch) {
  return (ch == 0xAD ||   // SOFT HYPHEN
          ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN
}
// IsConditionalPunctuation
//
//    Some characters (like apostrophes) require characters on each side to be
//    part of a word, and are otherwise punctuation.

inline bool IsConditionalPunctuation(char ch) {
  return (ch == '\'' ||                    // APOSTROPHE
          ch == static_cast<char>(0xB7));  // MIDDLE DOT
}

inline bool IsConditionalPunctuation(char16_t ch) {
  return (ch == '\'' ||    // APOSTROPHE
          ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
          ch == 0x00B7);   // MIDDLE DOT
}

// Returns true for characters whose role as a word separator depends on the
// surrounding text: '@', ':', '.', '/', '-' and the conditional punctuation
// characters above.
static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
          IsConditionalPunctuation(ch));
}

static bool IsAmbiguousDOMWordSeprator(char ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  // Widen through uint8_t so that bytes >= 0x80 (e.g. MIDDLE DOT, 0xB7) are
  // not sign-extended to 0xFFxx where plain char is signed; that would make
  // this overload disagree with IsConditionalPunctuation(char).
  return IsAmbiguousDOMWordSeprator(
      static_cast<char16_t>(static_cast<uint8_t>(ch)));
}
// IsDOMWordSeparator
//
//    Determines if the given character should be considered as a DOM Word
//    separator. Basically, this is whitespace, although it could also have
//    certain punctuation that we know ALWAYS breaks words. This is important.
//    For example, we can't have any punctuation that could appear in a URL
//    or email address in this, because those need to always fit into a single
//    DOM word.

static bool IsDOMWordSeparator(char ch) {
  // simple spaces or no-break space
  return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' ||
          ch == static_cast<char>(0xA0));
}

static bool IsDOMWordSeparator(char16_t ch) {
  // simple spaces
  if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') return true;

  // complex spaces - check only if char isn't ASCII (uncommon)
  if (ch >= 0xA0 && (ch == 0x00A0 ||  // NO-BREAK SPACE
                     ch == 0x2002 ||  // EN SPACE
                     ch == 0x2003 ||  // EM SPACE
                     ch == 0x2009 ||  // THIN SPACE
                     ch == 0x3000))   // IDEOGRAPHIC SPACE
    return true;

  // otherwise not a space
  return false;
}
102 bool NodeOffset::operator==(
103 const mozilla::RangeBoundary
& aRangeBoundary
) const {
104 if (aRangeBoundary
.Container() != mNode
) {
108 const Maybe
<uint32_t> rangeBoundaryOffset
=
109 aRangeBoundary
.Offset(RangeBoundary::OffsetFilter::kValidOffsets
);
111 MOZ_ASSERT(mOffset
>= 0);
112 return rangeBoundaryOffset
&&
113 (*rangeBoundaryOffset
== static_cast<uint32_t>(mOffset
));
116 bool NodeOffsetRange::operator==(const nsRange
& aRange
) const {
117 return mBegin
== aRange
.StartRef() && mEnd
== aRange
.EndRef();
121 Maybe
<mozInlineSpellWordUtil
> mozInlineSpellWordUtil::Create(
122 const EditorBase
& aEditorBase
) {
123 dom::Document
* document
= aEditorBase
.GetDocument();
124 if (NS_WARN_IF(!document
)) {
128 const bool isContentEditableOrDesignMode
= aEditorBase
.IsHTMLEditor();
130 // Find the root node for the editor. For contenteditable the mRootNode could
131 // change to shadow root if the begin and end are inside the shadowDOM.
132 nsINode
* rootNode
= aEditorBase
.GetRoot();
133 if (NS_WARN_IF(!rootNode
)) {
137 mozInlineSpellWordUtil util
{*document
, isContentEditableOrDesignMode
,
139 return Some(std::move(util
));
142 static inline bool IsSpellCheckingTextNode(nsINode
* aNode
) {
143 nsIContent
* parent
= aNode
->GetParent();
145 parent
->IsAnyOfHTMLElements(nsGkAtoms::script
, nsGkAtoms::style
))
147 return aNode
->IsText();
150 typedef void (*OnLeaveNodeFunPtr
)(nsINode
* aNode
, void* aClosure
);
// NOTE(review): this listing is extraction-damaged. The embedded source line
// numbers skip 167-170, 172-173, 176-177 and 180 onward, so statements between
// the ones shown (and the closing braces) are missing. Restore the body from
// the upstream file before building.
//
// Visible behaviour: returns aNode's first child if it has one; returns
// nullptr when aNode is aRoot; otherwise returns the next sibling if present.
// Beyond that it calls aOnLeaveNode(aNode, aClosure), looks at the parent and
// gives up (nullptr) when the parent is aRoot or null.
152 // Find the next node in the DOM tree in preorder.
153 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
154 // why we can't just use GetNextNode here, sadly.
155 static nsINode
* FindNextNode(nsINode
* aNode
, const nsINode
* aRoot
,
156 OnLeaveNodeFunPtr aOnLeaveNode
, void* aClosure
) {
157 MOZ_ASSERT(aNode
, "Null starting node?");
159 nsINode
* next
= aNode
->GetFirstChild();
160 if (next
) return next
;
162 // Don't look at siblings or otherwise outside of aRoot
163 if (aNode
== aRoot
) return nullptr;
165 next
= aNode
->GetNextSibling();
166 if (next
) return next
;
171 aOnLeaveNode(aNode
, aClosure
);
174 next
= aNode
->GetParent();
175 if (next
== aRoot
|| !next
) return nullptr;
178 next
= aNode
->GetNextSibling();
179 if (next
) return next
;
// NOTE(review): this listing is extraction-damaged. The declaration of
// checkNode, the branch that handles a non-null child, and the final return
// are not present (embedded line numbers skip 190-191, 194-197, 202-203 and
// 206 onward). Restore from the upstream file before building.
183 // aNode is not a text node. Find the first text node starting at aNode/aOffset
184 // in a preorder DOM traversal.
185 static nsINode
* FindNextTextNode(nsINode
* aNode
, int32_t aOffset
,
186 const nsINode
* aRoot
) {
187 MOZ_ASSERT(aNode
, "Null starting node?");
188 MOZ_ASSERT(!IsSpellCheckingTextNode(aNode
),
189 "FindNextTextNode should start with a non-text node");
192 // Need to start at the aOffset'th child
193 nsIContent
* child
= aNode
->GetChildAt_Deprecated(aOffset
);
198 // aOffset was beyond the end of the child list.
199 // Go to the next node after the last descendant of aNode in
200 // a preorder DOM traversal.
201 checkNode
= aNode
->GetNextNonChildNode(aRoot
);
// Skip forward until a spell-checkable text node is found or the traversal
// runs out of nodes.
204 while (checkNode
&& !IsSpellCheckingTextNode(checkNode
)) {
205 checkNode
= checkNode
->GetNextNode(aRoot
);
// NOTE(review): this listing is extraction-damaged. The aEndNode parameter
// (source line 229), the statements that follow each FindNextTextNode call,
// the rv check after EnsureWords and all return statements at the end are not
// present. Restore from the upstream file before building.
210 // mozInlineSpellWordUtil::SetPositionAndEnd
212 // We have two ranges "hard" and "soft". The hard boundary is simply
213 // the scope of the root node. The soft boundary is that which is set
214 // by the caller of this class by calling this function. If this function is
215 // not called, the soft boundary is the same as the hard boundary.
217 // When we reach the soft boundary (mSoftText.GetEnd()), we keep
218 // going until we reach the end of a word. This allows the caller to set the
219 // end of the range to anything, and we will always check whole multiples of
220 // words. When we reach the hard boundary we stop no matter what.
222 // There is no beginning soft boundary. This is because we only go to the
223 // previous node once, when finding the previous word boundary in
224 // SetPosition(). You might think of the soft boundary as being this initial
227 nsresult
mozInlineSpellWordUtil::SetPositionAndEnd(nsINode
* aPositionNode
,
228 int32_t aPositionOffset
,
230 int32_t aEndOffset
) {
231 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
,
232 ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__
, aPositionNode
,
233 aPositionOffset
, aEndNode
, aEndOffset
));
235 MOZ_ASSERT(aPositionNode
, "Null begin node?");
236 MOZ_ASSERT(aEndNode
, "Null end node?");
238 MOZ_ASSERT(mRootNode
, "Not initialized");
240 // Find an appropriate root if we are dealing with contenteditable nodes which
241 // are in the shadow DOM.
242 if (mIsContentEditableOrDesignMode
) {
243 nsINode
* rootNode
= aPositionNode
->SubtreeRoot();
// Both end points must live in the same subtree.
244 if (rootNode
!= aEndNode
->SubtreeRoot()) {
245 return NS_ERROR_FAILURE
;
248 if (mozilla::dom::ShadowRoot::FromNode(rootNode
)) {
249 mRootNode
= rootNode
;
// Any previously built soft text no longer applies to the new position.
253 mSoftText
.Invalidate();
255 if (!IsSpellCheckingTextNode(aPositionNode
)) {
256 // Start at the start of the first text node after aNode/aOffset.
257 aPositionNode
= FindNextTextNode(aPositionNode
, aPositionOffset
, mRootNode
);
260 NodeOffset softBegin
= NodeOffset(aPositionNode
, aPositionOffset
);
262 if (!IsSpellCheckingTextNode(aEndNode
)) {
263 // End at the start of the first text node after aEndNode/aEndOffset.
264 aEndNode
= FindNextTextNode(aEndNode
, aEndOffset
, mRootNode
);
267 NodeOffset softEnd
= NodeOffset(aEndNode
, aEndOffset
);
269 nsresult rv
= EnsureWords(std::move(softBegin
), std::move(softEnd
));
274 int32_t textOffset
= MapDOMPositionToSoftTextOffset(mSoftText
.GetBegin());
275 if (textOffset
< 0) {
// Position the word iterator on the real word at the soft begin.
279 mNextWordIndex
= FindRealWordContaining(textOffset
, HINT_END
, true);
// Builds the soft text and the real-word list for [aSoftBegin, aSoftEnd]
// unless the soft text is already valid.
// NOTE(review): this listing is extraction-damaged. The last argument of
// AdjustBeginAndBuildText, source lines 288-289, the closing braces and the
// final return are not present. Restore from the upstream file before
// building.
283 nsresult
mozInlineSpellWordUtil::EnsureWords(NodeOffset aSoftBegin
,
284 NodeOffset aSoftEnd
) {
// Nothing to do when the cached soft text is still valid.
285 if (mSoftText
.mIsValid
) return NS_OK
;
286 mSoftText
.AdjustBeginAndBuildText(std::move(aSoftBegin
), std::move(aSoftEnd
),
290 Result
<RealWords
, nsresult
> realWords
= BuildRealWords();
291 if (realWords
.isErr()) {
292 return realWords
.unwrapErr();
// Only mark the soft text valid once the word list has been stored.
295 mRealWords
= realWords
.unwrap();
296 mSoftText
.mIsValid
= true;
300 nsresult
mozInlineSpellWordUtil::MakeRangeForWord(const RealWord
& aWord
,
301 nsRange
** aRange
) const {
303 MapSoftTextOffsetToDOMPosition(aWord
.mSoftTextOffset
, HINT_BEGIN
);
304 NodeOffset end
= MapSoftTextOffsetToDOMPosition(aWord
.EndOffset(), HINT_END
);
305 return MakeRange(begin
, end
, aRange
);
307 void mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord(
308 const RealWord
& aWord
, NodeOffsetRange
* aNodeOffsetRange
) {
310 MapSoftTextOffsetToDOMPosition(aWord
.mSoftTextOffset
, HINT_BEGIN
);
311 NodeOffset end
= MapSoftTextOffsetToDOMPosition(aWord
.EndOffset(), HINT_END
);
312 *aNodeOffsetRange
= NodeOffsetRange(begin
, end
);
// NOTE(review): this listing is extraction-damaged. The remaining parameters
// (source lines 318-319; aWordOffset and aRange are used below), the rv check
// after EnsureWords and several closing braces are not present. Restore from
// the upstream file before building.
315 // mozInlineSpellWordUtil::GetRangeForWord
317 nsresult
mozInlineSpellWordUtil::GetRangeForWord(nsINode
* aWordNode
,
320 // Set our soft end and start
321 NodeOffset
pt(aWordNode
, aWordOffset
);
// Rebuild the soft text unless it is valid and already collapsed at pt.
323 if (!mSoftText
.mIsValid
|| pt
!= mSoftText
.GetBegin() ||
324 pt
!= mSoftText
.GetEnd()) {
325 mSoftText
.Invalidate();
326 NodeOffset softBegin
= pt
;
327 NodeOffset softEnd
= pt
;
328 nsresult rv
= EnsureWords(std::move(softBegin
), std::move(softEnd
));
// When pt cannot be mapped, or no real word contains it, the result is a
// collapsed range at pt.
334 int32_t offset
= MapDOMPositionToSoftTextOffset(pt
);
335 if (offset
< 0) return MakeRange(pt
, pt
, aRange
);
336 int32_t wordIndex
= FindRealWordContaining(offset
, HINT_BEGIN
, false);
337 if (wordIndex
< 0) return MakeRange(pt
, pt
, aRange
);
338 return MakeRangeForWord(mRealWords
[wordIndex
], aRange
);
// Walks aLen characters of aInput starting at aPos, skipping ignorable
// characters and special-casing U+2019.
// NOTE(review): this listing is extraction-damaged. Source lines 344, 347,
// 350 and 353 onward -- including whatever is done with aOutput and with the
// curly apostrophe -- are not present. Restore from the upstream file before
// building.
341 // This is to fix characters that the spellchecker may not like
342 static void NormalizeWord(const nsAString
& aInput
, int32_t aPos
, int32_t aLen
,
343 nsAString
& aOutput
) {
345 for (int32_t i
= 0; i
< aLen
; i
++) {
346 char16_t ch
= aInput
.CharAt(i
+ aPos
);
348 // remove ignorable characters from the word
349 if (IsIgnorableCharacter(ch
)) continue;
351 // the spellchecker doesn't handle curly apostrophes in all languages
352 if (ch
== 0x2019) { // RIGHT SINGLE QUOTATION MARK
// NOTE(review): this listing is extraction-damaged. Source lines 371,
// 373-375, 378, 382 and 386 onward (including every return statement and any
// update of mNextWordIndex) are not present. Restore from the upstream file
// before building.
360 // mozInlineSpellWordUtil::GetNextWord
362 // FIXME-optimization: we shouldn't have to generate a range every single
363 // time. It would be better if the inline spellchecker didn't require a
364 // range unless the word was misspelled. This may or may not be possible.
366 bool mozInlineSpellWordUtil::GetNextWord(Word
& aWord
) {
367 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
,
368 ("%s: mNextWordIndex=%d", __FUNCTION__
, mNextWordIndex
));
// No (more) words: the index is negative or past the end of mRealWords.
370 if (mNextWordIndex
< 0 || mNextWordIndex
>= int32_t(mRealWords
.Length())) {
372 aWord
.mSkipChecking
= true;
376 const RealWord
& realWord
= mRealWords
[mNextWordIndex
];
377 MakeNodeOffsetRangeForWord(realWord
, &aWord
.mNodeOffsetRange
);
// Words that are not checkable are flagged to be skipped; the text is
// normalized into aWord.mText.
379 aWord
.mSkipChecking
= !realWord
.mCheckableWord
;
380 ::NormalizeWord(mSoftText
.GetValue(), realWord
.mSoftTextOffset
,
381 realWord
.mLength
, aWord
.mText
);
383 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
,
384 ("%s: returning: %s (skip=%d)", __FUNCTION__
,
385 NS_ConvertUTF16toUTF8(aWord
.mText
).get(), aWord
.mSkipChecking
));
// NOTE(review): this listing is extraction-damaged. The condition guarding
// the NS_ERROR_NOT_INITIALIZED return (source line 397), the declaration of
// `error` (around line 401) and the final return are not present. Restore
// from the upstream file before building.
390 // mozInlineSpellWordUtil::MakeRange
392 // Convenience function for creating a range over the current document.
394 nsresult
mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin
, NodeOffset aEnd
,
395 nsRange
** aRange
) const {
396 NS_ENSURE_ARG_POINTER(aBegin
.mNode
);
398 return NS_ERROR_NOT_INITIALIZED
;
402 RefPtr
<nsRange
> range
= nsRange::Create(aBegin
.mNode
, aBegin
.mOffset
,
403 aEnd
.mNode
, aEnd
.mOffset
, error
);
// Propagate a range-creation failure to the caller.
404 if (NS_WARN_IF(error
.Failed())) {
405 return error
.StealNSResult();
// Hand the reference over to the out-parameter.
408 range
.forget(aRange
);
413 already_AddRefed
<nsRange
> mozInlineSpellWordUtil::MakeRange(
414 const NodeOffsetRange
& aRange
) {
415 IgnoredErrorResult ignoredError
;
416 RefPtr
<nsRange
> range
=
417 nsRange::Create(aRange
.Begin().Node(), aRange
.Begin().Offset(),
418 aRange
.End().Node(), aRange
.End().Offset(), ignoredError
);
419 NS_WARNING_ASSERTION(!ignoredError
.Failed(), "Creating a range failed");
420 return range
.forget();
423 /*********** Word Splitting ************/
// classifies a given character in the DOM word
// NOTE(review): the enum head and first enumerator were lost from the
// listing; CHAR_CLASS_WORD is restored first because the rest of the file
// uses it -- confirm the order against upstream.
enum CharClass {
  CHAR_CLASS_WORD,
  CHAR_CLASS_SEPARATOR,
  CHAR_CLASS_END_OF_INPUT
};
432 // Encapsulates DOM-word to real-word splitting
434 struct MOZ_STACK_CLASS WordSplitState
{
435 const T
& mDOMWordText
;
436 int32_t mDOMWordOffset
;
437 CharClass mCurCharClass
;
439 explicit WordSplitState(const T
& aString
)
440 : mDOMWordText(aString
),
442 mCurCharClass(CHAR_CLASS_END_OF_INPUT
) {}
444 CharClass
ClassifyCharacter(int32_t aIndex
, bool aRecurse
) const;
446 void AdvanceThroughSeparators();
447 void AdvanceThroughWord();
449 // Finds special words like email addresses and URLs that may start at the
450 // current position, and returns their length, or 0 if not found. This allows
451 // arbitrary word breaking rules to be used for these special entities, as
452 // long as they can not contain whitespace.
453 bool IsSpecialWord() const;
455 // Similar to IsSpecialWord except that this takes a split word as
456 // input. This checks for things that do not require special word-breaking
458 bool ShouldSkipWord(int32_t aStart
, int32_t aLength
) const;
460 // Finds the last sequence of DOM word separators before aBeforeOffset and
461 // returns the offset to its first element.
462 Maybe
<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
463 int32_t aBeforeOffset
) const;
465 char16_t
GetUnicharAt(int32_t aIndex
) const;
468 // WordSplitState::ClassifyCharacter
470 CharClass WordSplitState
<T
>::ClassifyCharacter(int32_t aIndex
,
471 bool aRecurse
) const {
472 MOZ_ASSERT(aIndex
>= 0 && aIndex
<= int32_t(mDOMWordText
.Length()),
473 "Index out of range");
474 if (aIndex
== int32_t(mDOMWordText
.Length())) return CHAR_CLASS_SEPARATOR
;
476 // this will classify the character, we want to treat "ignorable" characters
477 // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
478 nsUGenCategory charCategory
=
479 mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex
));
480 if (charCategory
== nsUGenCategory::kLetter
||
481 IsIgnorableCharacter(mDOMWordText
[aIndex
]) ||
482 mDOMWordText
[aIndex
] == 0x200C /* ZWNJ */ ||
483 mDOMWordText
[aIndex
] == 0x200D /* ZWJ */)
484 return CHAR_CLASS_WORD
;
486 // If conditional punctuation is surrounded immediately on both sides by word
487 // characters it also counts as a word character.
488 if (IsConditionalPunctuation(mDOMWordText
[aIndex
])) {
490 // not allowed to look around, this punctuation counts like a separator
491 return CHAR_CLASS_SEPARATOR
;
494 // check the left-hand character
495 if (aIndex
== 0) return CHAR_CLASS_SEPARATOR
;
496 if (ClassifyCharacter(aIndex
- 1, false) != CHAR_CLASS_WORD
)
497 return CHAR_CLASS_SEPARATOR
;
498 // If the previous charatcer is a word-char, make sure that it's not a
499 // special dot character.
500 if (mDOMWordText
[aIndex
- 1] == '.') return CHAR_CLASS_SEPARATOR
;
502 // now we know left char is a word-char, check the right-hand character
503 if (aIndex
== int32_t(mDOMWordText
.Length() - 1)) {
504 return CHAR_CLASS_SEPARATOR
;
507 if (ClassifyCharacter(aIndex
+ 1, false) != CHAR_CLASS_WORD
)
508 return CHAR_CLASS_SEPARATOR
;
509 // If the next charatcer is a word-char, make sure that it's not a
510 // special dot character.
511 if (mDOMWordText
[aIndex
+ 1] == '.') return CHAR_CLASS_SEPARATOR
;
513 // char on either side is a word, this counts as a word
514 return CHAR_CLASS_WORD
;
517 // The dot character, if appearing at the end of a word, should
518 // be considered part of that word. Example: "etc.", or
520 if (aIndex
> 0 && mDOMWordText
[aIndex
] == '.' &&
521 mDOMWordText
[aIndex
- 1] != '.' &&
522 ClassifyCharacter(aIndex
- 1, false) != CHAR_CLASS_WORD
) {
523 return CHAR_CLASS_WORD
;
526 // all other punctuation
527 if (charCategory
== nsUGenCategory::kSeparator
||
528 charCategory
== nsUGenCategory::kOther
||
529 charCategory
== nsUGenCategory::kPunctuation
||
530 charCategory
== nsUGenCategory::kSymbol
) {
531 // Don't break on hyphens, as hunspell handles them on its own.
532 if (aIndex
> 0 && mDOMWordText
[aIndex
] == '-' &&
533 mDOMWordText
[aIndex
- 1] != '-' &&
534 ClassifyCharacter(aIndex
- 1, false) == CHAR_CLASS_WORD
) {
535 // A hyphen is only meaningful as a separator inside a word
536 // if the previous and next characters are a word character.
537 if (aIndex
== int32_t(mDOMWordText
.Length()) - 1)
538 return CHAR_CLASS_SEPARATOR
;
539 if (mDOMWordText
[aIndex
+ 1] != '.' &&
540 ClassifyCharacter(aIndex
+ 1, false) == CHAR_CLASS_WORD
)
541 return CHAR_CLASS_WORD
;
543 return CHAR_CLASS_SEPARATOR
;
546 // any other character counts as a word
547 return CHAR_CLASS_WORD
;
550 // WordSplitState::Advance
552 void WordSplitState
<T
>::Advance() {
553 MOZ_ASSERT(mDOMWordOffset
>= 0, "Negative word index");
554 MOZ_ASSERT(mDOMWordOffset
< (int32_t)mDOMWordText
.Length(),
555 "Length beyond end");
558 if (mDOMWordOffset
>= (int32_t)mDOMWordText
.Length())
559 mCurCharClass
= CHAR_CLASS_END_OF_INPUT
;
561 mCurCharClass
= ClassifyCharacter(mDOMWordOffset
, true);
564 // WordSplitState::AdvanceThroughSeparators
566 void WordSplitState
<T
>::AdvanceThroughSeparators() {
567 while (mCurCharClass
== CHAR_CLASS_SEPARATOR
) Advance();
570 // WordSplitState::AdvanceThroughWord
572 void WordSplitState
<T
>::AdvanceThroughWord() {
573 while (mCurCharClass
== CHAR_CLASS_WORD
) Advance();
// NOTE(review): this listing is extraction-damaged. The template header, the
// statements inside each matched branch (source lines 598, 601, 607, 625),
// the declaration of `protocol` (line 617) and the final return are not
// present. Restore from the upstream file before building.
576 // WordSplitState::IsSpecialWord
578 bool WordSplitState
<T
>::IsSpecialWord() const {
579 // Search for email addresses. We simply define these as any sequence of
580 // characters with an '@' character in the middle. The DOM word is already
581 // split on whitespace, so we know that everything to the end is the address
582 int32_t firstColon
= -1;
583 for (int32_t i
= mDOMWordOffset
; i
< int32_t(mDOMWordText
.Length()); i
++) {
584 if (mDOMWordText
[i
] == '@') {
585 // only accept this if there are unambiguous word characters (don't bother
586 // recursing to disambiguate apostrophes) on each side. This prevents
587 // classifying, e.g. "@home" as an email address
589 // Use this condition to only accept words with '@' in the middle of
590 // them. It works, but the inline spellchecker doesn't like this. The problem
591 // is that you type "fhsgfh@" that's a misspelled word followed by a
592 // symbol, but when you type another letter "fhsgfh@g" that first word
593 // need to be unmarked misspelled. It doesn't do this. it only checks the
594 // current position for potentially removing a spelling range.
595 if (i
> 0 && ClassifyCharacter(i
- 1, false) == CHAR_CLASS_WORD
&&
596 i
< (int32_t)mDOMWordText
.Length() - 1 &&
597 ClassifyCharacter(i
+ 1, false) == CHAR_CLASS_WORD
) {
// Only the first ':' in the word is considered.
600 } else if (mDOMWordText
[i
] == ':' && firstColon
< 0) {
603 // If the first colon is followed by a slash, consider it a URL
604 // This will catch things like asdf://foo.com
605 if (firstColon
< (int32_t)mDOMWordText
.Length() - 1 &&
606 mDOMWordText
[firstColon
+ 1] == '/') {
612 // Check the text before the first colon against some known protocols. It
613 // is impossible to check against all protocols, especially since you can
614 // plug in new protocols. We also don't want to waste time here checking
615 // against a lot of obscure protocols.
616 if (firstColon
> mDOMWordOffset
) {
618 Substring(mDOMWordText
, mDOMWordOffset
, firstColon
- mDOMWordOffset
));
619 if (protocol
.EqualsIgnoreCase("http") ||
620 protocol
.EqualsIgnoreCase("https") ||
621 protocol
.EqualsIgnoreCase("news") ||
622 protocol
.EqualsIgnoreCase("file") ||
623 protocol
.EqualsIgnoreCase("javascript") ||
624 protocol
.EqualsIgnoreCase("data") || protocol
.EqualsIgnoreCase("ftp")) {
629 // not anything special
633 // WordSplitState::ShouldSkipWord
635 bool WordSplitState
<T
>::ShouldSkipWord(int32_t aStart
, int32_t aLength
) const {
636 int32_t last
= aStart
+ aLength
;
638 // check to see if the word contains a digit
639 for (int32_t i
= aStart
; i
< last
; i
++) {
640 if (mozilla::unicode::GetGenCategory(GetUnicharAt(i
)) ==
641 nsUGenCategory::kNumber
) {
// Scans backwards from aBeforeOffset - 1 for a character that is a DOM word
// separator, or that is unambiguous and classifies as CHAR_CLASS_SEPARATOR,
// then keeps scanning left over further such characters.
// NOTE(review): this listing is extraction-damaged. The template header, the
// body of the inner `if` (source lines 662 onward) and every return statement
// are not present. Restore from the upstream file before building.
651 Maybe
<int32_t> WordSplitState
<T
>::FindOffsetOfLastDOMWordSeparatorSequence(
652 const int32_t aBeforeOffset
) const {
653 for (int32_t i
= aBeforeOffset
- 1; i
>= 0; --i
) {
654 if (IsDOMWordSeparator(mDOMWordText
[i
]) ||
655 (!IsAmbiguousDOMWordSeprator(mDOMWordText
[i
]) &&
656 ClassifyCharacter(i
, true) == CHAR_CLASS_SEPARATOR
)) {
657 // Be greedy, find as many separators as we can
658 for (int32_t j
= i
- 1; j
>= 0; --j
) {
659 if (IsDOMWordSeparator(mDOMWordText
[j
]) ||
660 (!IsAmbiguousDOMWordSeprator(mDOMWordText
[j
]) &&
661 ClassifyCharacter(j
, true) == CHAR_CLASS_SEPARATOR
)) {
674 char16_t WordSplitState
<nsDependentSubstring
>::GetUnicharAt(
675 int32_t aIndex
) const {
676 return mDOMWordText
[aIndex
];
680 char16_t WordSplitState
<nsDependentCSubstring
>::GetUnicharAt(
681 int32_t aIndex
) const {
682 return static_cast<char16_t
>(static_cast<uint8_t>(mDOMWordText
[aIndex
]));
685 static inline bool IsBRElement(nsINode
* aNode
) {
686 return aNode
->IsHTMLElement(nsGkAtoms::br
);
690 * Given a TextNode, finds the last sequence of DOM word separators before
691 * aBeforeOffset and returns the offset to its first element.
693 * @param aContent the TextNode to check.
694 * @param aBeforeOffset the offset in the TextNode before which we will search
695 * for the DOM separator. You can pass INT32_MAX to search the entire
696 * length of the string.
698 static Maybe
<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
699 nsIContent
* aContent
, int32_t aBeforeOffset
) {
700 const nsTextFragment
* textFragment
= aContent
->GetText();
701 MOZ_ASSERT(textFragment
, "Where is our text?");
702 int32_t end
= std::min(aBeforeOffset
, int32_t(textFragment
->GetLength()));
704 if (textFragment
->Is2b()) {
705 nsDependentSubstring
targetText(textFragment
->Get2b(), end
);
706 WordSplitState
<nsDependentSubstring
> state(targetText
);
707 return state
.FindOffsetOfLastDOMWordSeparatorSequence(end
);
710 nsDependentCSubstring
targetText(textFragment
->Get1b(), end
);
711 WordSplitState
<nsDependentCSubstring
> state(targetText
);
712 return state
.FindOffsetOfLastDOMWordSeparatorSequence(end
);
716 * Check if there's a DOM word separator before aBeforeOffset in this node.
717 * Always returns true if it's a BR element.
718 * aSeparatorOffset is set to the index of the first character in the last
719 * separator if any is found (0 for BR elements).
721 * This function does not modify aSeparatorOffset when it returns false.
723 static bool ContainsDOMWordSeparator(nsINode
* aNode
, int32_t aBeforeOffset
,
724 int32_t* aSeparatorOffset
) {
725 if (IsBRElement(aNode
)) {
726 *aSeparatorOffset
= 0;
730 if (!IsSpellCheckingTextNode(aNode
)) return false;
732 const Maybe
<int32_t> separatorOffset
=
733 FindOffsetOfLastDOMWordSeparatorSequence(aNode
->AsContent(),
735 if (separatorOffset
) {
736 *aSeparatorOffset
= *separatorOffset
;
743 static bool IsBreakElement(nsINode
* aNode
) {
744 if (!aNode
->IsElement()) {
748 dom::Element
* element
= aNode
->AsElement();
749 if (element
->IsHTMLElement(nsGkAtoms::br
)) {
753 // If we don't have a frame, we don't consider ourselves a break
754 // element. In particular, words can span us.
755 nsIFrame
* frame
= element
->GetPrimaryFrame();
760 auto* disp
= frame
->StyleDisplay();
761 // Anything that's not an inline element is a break element.
762 // XXXbz should replaced inlines be break elements, though?
763 // Also should inline-block and such be break elements?
765 // FIXME(emilio): We should teach the spell checker to deal with generated
766 // content (it doesn't at all), then remove the IsListItem() check, as there
767 // could be no marker, etc...
768 return !disp
->IsInlineFlow() || disp
->IsListItem();
// Closure for CheckLeavingBreakElement: records whether a break element was
// left.
struct CheckLeavingBreakElementClosure {
  bool mLeftBreakElement;
};
775 static void CheckLeavingBreakElement(nsINode
* aNode
, void* aClosure
) {
776 CheckLeavingBreakElementClosure
* cl
=
777 static_cast<CheckLeavingBreakElementClosure
*>(aClosure
);
778 if (!cl
->mLeftBreakElement
&& IsBreakElement(aNode
)) {
779 cl
->mLeftBreakElement
= true;
// Runs the file-local ::NormalizeWord over the whole of aWord.
// NOTE(review): this listing is extraction-damaged. The declaration of
// `result` (source line 784) and whatever follows the call (line 786 onward)
// are not present. Restore from the upstream file before building.
783 void mozInlineSpellWordUtil::NormalizeWord(nsAString
& aWord
) {
785 ::NormalizeWord(aWord
, 0, aWord
.Length(), result
);
// Stores aBegin/aEnd as the soft range, walks backwards to a suitable start
// point, then walks forward appending text-node contents to mValue and
// recording a DOMTextMapping for each contribution.
// NOTE(review): this listing is extraction-damaged. Many source lines are
// missing (loop headers, break/return statements, several arguments and all
// closing braces -- see the gaps in the embedded numbering, e.g. 803,
// 820, 828-829, 842-844, 850-851, 854-855, 862-863, 866-867, 869-872,
// 879-880, 882, 886, 889-893, 902-903, 906-909, 911-914, 919, 921-925).
// Restore from the upstream file before building.
789 void mozInlineSpellWordUtil::SoftText::AdjustBeginAndBuildText(
790 NodeOffset aBegin
, NodeOffset aEnd
, const nsINode
* aRootNode
) {
791 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
, ("%s", __FUNCTION__
));
793 mBegin
= std::move(aBegin
);
794 mEnd
= std::move(aEnd
);
796 // First we have to work backwards from mBegin to find a text node
797 // containing a DOM word separator, a non-inline-element
798 // boundary, or the hard start node. That's where we'll start building the
800 nsINode
* node
= mBegin
.mNode
;
801 int32_t firstOffsetInNode
= 0;
802 int32_t checkBeforeOffset
= mBegin
.mOffset
;
804 if (ContainsDOMWordSeparator(node
, checkBeforeOffset
, &firstOffsetInNode
)) {
805 if (node
== mBegin
.mNode
) {
806 // If we find a word separator on the first node, look at the preceding
807 // word on the text node as well.
808 if (firstOffsetInNode
> 0) {
809 // Try to find the previous word boundary in the current node. If
810 // we can't find one, start checking previous sibling nodes (if any
811 // adjacent ones exist) to see if we can find any text nodes with
812 // DOM word separators. We bail out as soon as we see a node that is
813 // not a text node, or we run out of previous sibling nodes. In the
814 // event that we simply cannot find any preceding word separator, the
815 // offset is set to 0, and the soft text beginning node is set to the
816 // "most previous" text node before the original starting node, or
817 // kept at the original starting node if no previous text nodes exist.
818 int32_t newOffset
= 0;
819 if (!ContainsDOMWordSeparator(node
, firstOffsetInNode
- 1,
821 nsIContent
* prevNode
= node
->GetPreviousSibling();
822 while (prevNode
&& IsSpellCheckingTextNode(prevNode
)) {
823 mBegin
.mNode
= prevNode
;
824 const Maybe
<int32_t> separatorOffset
=
825 FindOffsetOfLastDOMWordSeparatorSequence(prevNode
, INT32_MAX
);
826 if (separatorOffset
) {
827 newOffset
= *separatorOffset
;
830 prevNode
= prevNode
->GetPreviousSibling();
833 firstOffsetInNode
= newOffset
;
835 firstOffsetInNode
= 0;
838 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
,
839 ("%s: adjusting mBegin.mOffset from %i to %i.", __FUNCTION__
,
840 mBegin
.mOffset
, firstOffsetInNode
));
841 mBegin
.mOffset
= firstOffsetInNode
;
// For nodes other than the first, the whole node is searched.
845 checkBeforeOffset
= INT32_MAX
;
846 if (IsBreakElement(node
)) {
847 // Since GetPrevNode follows tree *preorder*, we're about to traverse up
848 // out of 'node'. Since node induces breaks (e.g., it's a block), don't
849 // bother trying to look outside it, just stop now.
852 // GetPreviousContent below expects aRootNode to be an ancestor of node.
853 if (!node
->IsInclusiveDescendantOf(aRootNode
)) {
856 node
= node
->GetPrevNode(aRootNode
);
859 // Now build up the string moving forward through the DOM until we reach
860 // the soft end and *then* see a DOM word separator, a non-inline-element
861 // boundary, or the hard end node.
864 bool seenSoftEnd
= false;
865 // Leave this outside the loop so large heap string allocations can be reused
868 if (node
== mEnd
.mNode
) {
873 if (IsSpellCheckingTextNode(node
)) {
874 nsIContent
* content
= static_cast<nsIContent
*>(node
);
875 MOZ_ASSERT(content
, "Where is our content?");
876 const nsTextFragment
* textFragment
= content
->GetText();
877 MOZ_ASSERT(textFragment
, "Where is our text?");
878 uint32_t lastOffsetInNode
= textFragment
->GetLength();
881 // check whether we can stop after this
883 node
== mEnd
.mNode
? AssertedCast
<uint32_t>(mEnd
.mOffset
) : 0;
884 i
< textFragment
->GetLength(); ++i
) {
885 if (IsDOMWordSeparator(textFragment
->CharAt(i
))) {
887 // stop at the first separator after the soft end point
888 lastOffsetInNode
= i
;
// Only non-empty contributions are mapped and appended.
894 if (firstOffsetInNode
>= 0 &&
895 static_cast<uint32_t>(firstOffsetInNode
) < lastOffsetInNode
) {
896 const uint32_t len
= lastOffsetInNode
- firstOffsetInNode
;
897 mDOMMapping
.AppendElement(DOMTextMapping(
898 NodeOffset(node
, firstOffsetInNode
), mValue
.Length(), len
));
900 const bool ok
= textFragment
->AppendTo(
901 mValue
, static_cast<uint32_t>(firstOffsetInNode
), len
,
904 // probably out of memory, remove from mDOMMapping
905 mDOMMapping
.RemoveLastElement();
910 firstOffsetInNode
= 0;
915 CheckLeavingBreakElementClosure closure
= {false};
916 node
= FindNextNode(node
, aRootNode
, CheckLeavingBreakElement
, &closure
);
917 if (closure
.mLeftBreakElement
|| (node
&& IsBreakElement(node
))) {
918 // We left, or are entering, a break element (e.g., block). Maybe we can
920 if (seenSoftEnd
) break;
926 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
,
927 ("%s: got DOM string: %s", __FUNCTION__
,
928 NS_ConvertUTF16toUTF8(mValue
).get()));
// NOTE(review): this listing is extraction-damaged. The declaration of
// `realWords`, the handling of each `rv`, the code that updates `wordStart`
// and the final return are not present (source lines 936, 938, 943-953,
// 957 onward). Restore from the upstream file before building.
931 auto mozInlineSpellWordUtil::BuildRealWords() const
932 -> Result
<RealWords
, nsresult
> {
933 // This is pretty simple. We just have to walk mSoftText.GetValue(),
934 // tokenizing it into "real words". We do an outer traversal of words
935 // delimited by IsDOMWordSeparator, calling SplitDOMWordAndAppendTo on each of
937 int32_t wordStart
= -1;
939 for (int32_t i
= 0; i
< int32_t(mSoftText
.GetValue().Length()); ++i
) {
940 if (IsDOMWordSeparator(mSoftText
.GetValue().CharAt(i
))) {
// A separator ends the DOM word that started at wordStart, if any.
941 if (wordStart
>= 0) {
942 nsresult rv
= SplitDOMWordAndAppendTo(wordStart
, i
, realWords
);
// Flush a DOM word that runs to the end of the soft text.
954 if (wordStart
>= 0) {
955 nsresult rv
= SplitDOMWordAndAppendTo(
956 wordStart
, mSoftText
.GetValue().Length(), realWords
);
/*********** DOM/realwords<->mSoftText.GetValue() mapping functions ************/
// Maps a DOM (node, offset) position to an offset in the soft text by
// scanning the DOM mappings for one on the same node that covers the offset.
// NOTE(review): this listing is extraction-damaged. The return statements for
// the invalid and not-found cases and the closing braces are not present
// (source lines 972-974, 985 onward). Callers in this file treat a negative
// result as "not found". Restore from the upstream file before building.
968 int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(
969 const NodeOffset
& aNodeOffset
) const {
970 if (!mSoftText
.mIsValid
) {
971 NS_ERROR("Soft text must be valid if we're to map into it");
975 for (int32_t i
= 0; i
< int32_t(mSoftText
.GetDOMMapping().Length()); ++i
) {
976 const DOMTextMapping
& map
= mSoftText
.GetDOMMapping()[i
];
977 if (map
.mNodeOffset
.mNode
== aNodeOffset
.mNode
) {
978 // Allow offsets at either end of the string, in particular, allow the
979 // offset that's at the end of the contributed string
980 int32_t offsetInContributedString
=
981 aNodeOffset
.mOffset
- map
.mNodeOffset
.mOffset
;
982 if (offsetInContributedString
>= 0 &&
983 offsetInContributedString
<= map
.mLength
)
984 return map
.mSoftTextOffset
+ offsetInContributedString
;
// Comparator that steers a binary search to the first element whose
// mSoftTextOffset is larger than the offset given at construction.
template <class T>
class FirstLargerOffset {
  int32_t mSoftTextOffset;

 public:
  explicit FirstLargerOffset(int32_t aSoftTextOffset)
      : mSoftTextOffset(aSoftTextOffset) {}
  int operator()(const T& t) const {
    // We want the first larger offset, so never return 0 (which would
    // short-circuit evaluation before finding the last such offset).
    return mSoftTextOffset < t.mSoftTextOffset ? -1 : 1;
  }
};
// Looks up, via BinarySearchIf with FirstLargerOffset, the position in
// aContainer relative to aSoftTextOffset and reports it through aIndex.
// NOTE(review): this listing is extraction-damaged. The template header, the
// return for the empty container, the branch that steps *aIndex back and all
// return statements are not present (source lines 1011-1013, 1016,
// 1019-1020, 1023 onward). Restore from the upstream file before building.
1008 bool FindLastNongreaterOffset(const nsTArray
<T
>& aContainer
,
1009 int32_t aSoftTextOffset
, size_t* aIndex
) {
1010 if (aContainer
.Length() == 0) {
1014 BinarySearchIf(aContainer
, 0, aContainer
.Length(),
1015 FirstLargerOffset
<T
>(aSoftTextOffset
), aIndex
);
1017 // There was at least one mapping with offset <= aSoftTextOffset. Step back
1018 // to find the last element with |mSoftTextOffset <= aSoftTextOffset|.
1021 // Every mapping had offset greater than aSoftTextOffset.
1022 MOZ_ASSERT(aContainer
[*aIndex
].mSoftTextOffset
> aSoftTextOffset
);
// Maps an offset in the soft text back to a DOM position (node plus offset).
// Returns NodeOffset(nullptr, -1) when the soft text is not valid or when the
// offset does not fall inside the mapping that was located. With HINT_END, an
// offset that coincides with the end of the preceding mapping resolves to the
// end of that preceding mapping.
1029 NodeOffset
mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(
1030 int32_t aSoftTextOffset
, DOMMapHint aHint
) const {
1031 MOZ_ASSERT(mSoftText
.mIsValid
,
1032 "Soft text must be valid if we're to map out of it");
// Release builds do not stop on the assertion above, so bail out explicitly.
1033 if (!mSoftText
.mIsValid
) return NodeOffset(nullptr, -1);
1035 // Find the last mapping, if any, such that mSoftTextOffset <= aSoftTextOffset
1037 bool found
= FindLastNongreaterOffset(mSoftText
.GetDOMMapping(),
1038 aSoftTextOffset
, &index
);
// NOTE(review): presumably reached only when |found| is false -- confirm.
1040 return NodeOffset(nullptr, -1);
1043 // 'index' is now the last mapping, if any, such that
1044 // mSoftTextOffset <= aSoftTextOffset.
1045 // If we're doing HINT_END, then we may want to return the end of the
1046 // previous mapping instead of the start of this mapping
1047 if (aHint
== HINT_END
&& index
> 0) {
1048 const DOMTextMapping
& map
= mSoftText
.GetDOMMapping()[index
- 1];
// Only applies when the previous mapping ends exactly at the requested offset.
1049 if (map
.mSoftTextOffset
+ map
.mLength
== aSoftTextOffset
)
1050 return NodeOffset(map
.mNodeOffset
.mNode
,
1051 map
.mNodeOffset
.mOffset
+ map
.mLength
);
// NOTE(review): "HINT_START" below presumably refers to the begin hint;
// confirm against the DOMMapHint enum.
1054 // We allow ourselves to return the end of this mapping even if we're
1055 // doing HINT_START. This will only happen if there is no mapping which this
1056 // point is the start of. I'm not 100% sure this is OK...
1057 const DOMTextMapping
& map
= mSoftText
.GetDOMMapping()[index
];
// Distance of the requested offset from the start of this mapping.
1058 int32_t offset
= aSoftTextOffset
- map
.mSoftTextOffset
;
// Accept offsets anywhere within the mapping, including its very end.
1059 if (offset
>= 0 && offset
<= map
.mLength
)
1060 return NodeOffset(map
.mNodeOffset
.mNode
, map
.mNodeOffset
.mOffset
+ offset
);
// The offset lies beyond the located mapping: nothing maps to it.
1062 return NodeOffset(nullptr, -1);
// Writes a short textual name for aHint into aResult, replacing its previous
// contents. The visible outputs are the literals "begin" and "end".
// NOTE(review): presumably "begin" corresponds to the begin hint and "end" to
// HINT_END -- confirm against the DOMMapHint enum.
1066 void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint
,
1067 nsACString
& aResult
) {
1070 aResult
.AssignLiteral("begin");
1073 aResult
.AssignLiteral("end");
// Finds the index in mRealWords of the word containing aSoftTextOffset, where
// a word's end offset counts as being inside the word. Returns -1 when the
// soft text is not valid. With HINT_END, the word that ends exactly at the
// offset is considered before the word that starts there. When aSearchForward
// is true and no word contains the offset, the first word starting after the
// offset may be returned instead.
1078 int32_t mozInlineSpellWordUtil::FindRealWordContaining(
1079 int32_t aSoftTextOffset
, DOMMapHint aHint
, bool aSearchForward
) const {
// Only build the hint string when debug logging is actually enabled.
1080 if (MOZ_LOG_TEST(sInlineSpellWordUtilLog
, LogLevel::Debug
)) {
1082 mozInlineSpellWordUtil::ToString(aHint
, hint
);
1085 sInlineSpellWordUtilLog
, LogLevel::Debug
,
1086 ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__
,
1087 aSoftTextOffset
, hint
.get(), static_cast<int32_t>(aSearchForward
)));
1090 MOZ_ASSERT(mSoftText
.mIsValid
,
1091 "Soft text must be valid if we're to map out of it");
// Release builds do not stop on the assertion above, so bail out explicitly.
1092 if (!mSoftText
.mIsValid
) return -1;
1094 // Find the last word, if any, such that mRealWords[index].mSoftTextOffset
1095 // <= aSoftTextOffset
1097 bool found
= FindLastNongreaterOffset(mRealWords
, aSoftTextOffset
, &index
);
1102 // 'index' is now the last word, if any, such that
1103 // mSoftTextOffset <= aSoftTextOffset.
1104 // If we're doing HINT_END, then we may want to return the end of the
1105 // previous word instead of the start of this word
1106 if (aHint
== HINT_END
&& index
> 0) {
1107 const RealWord
& word
= mRealWords
[index
- 1];
// Only applies when the previous word ends exactly at the requested offset.
1108 if (word
.EndOffset() == aSoftTextOffset
) {
1113 // We allow ourselves to return the end of this word even if we're
1114 // doing HINT_BEGIN. This will only happen if there is no word which this
1115 // point is the start of. I'm not 100% sure this is OK...
1116 const RealWord
& word
= mRealWords
[index
];
// Distance of the requested offset from the start of this word.
1117 int32_t offset
= aSoftTextOffset
- word
.mSoftTextOffset
;
// Accept offsets anywhere within the word, including its very end.
1118 if (offset
>= 0 && offset
<= static_cast<int32_t>(word
.mLength
)) return index
;
// No word contains the offset; optionally fall forward to the next word.
1120 if (aSearchForward
) {
1121 if (mRealWords
[0].mSoftTextOffset
> aSoftTextOffset
) {
1122 // All words have mSoftTextOffset > aSoftTextOffset
1125 // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.
1126 // Word index+1, if it exists, will be the first with
1127 // mSoftTextOffset > aSoftTextOffset.
1128 if (index
+ 1 < mRealWords
.Length()) return index
+ 1;
1134 // mozInlineSpellWordUtil::SplitDOMWordAndAppendTo
1136 nsresult
mozInlineSpellWordUtil::SplitDOMWordAndAppendTo(
1137 int32_t aStart
, int32_t aEnd
, nsTArray
<RealWord
>& aRealWords
) const {
1138 nsDependentSubstring
targetText(mSoftText
.GetValue(), aStart
, aEnd
- aStart
);
1139 WordSplitState
<nsDependentSubstring
> state(targetText
);
1140 state
.mCurCharClass
= state
.ClassifyCharacter(0, true);
1142 state
.AdvanceThroughSeparators();
1143 if (state
.mCurCharClass
!= CHAR_CLASS_END_OF_INPUT
&& state
.IsSpecialWord()) {
1144 int32_t specialWordLength
=
1145 state
.mDOMWordText
.Length() - state
.mDOMWordOffset
;
1146 if (!aRealWords
.AppendElement(
1147 RealWord(aStart
+ state
.mDOMWordOffset
, specialWordLength
, false),
1149 return NS_ERROR_OUT_OF_MEMORY
;
1155 while (state
.mCurCharClass
!= CHAR_CLASS_END_OF_INPUT
) {
1156 state
.AdvanceThroughSeparators();
1157 if (state
.mCurCharClass
== CHAR_CLASS_END_OF_INPUT
) break;
1159 // save the beginning of the word
1160 int32_t wordOffset
= state
.mDOMWordOffset
;
1162 // find the end of the word
1163 state
.AdvanceThroughWord();
1164 int32_t wordLen
= state
.mDOMWordOffset
- wordOffset
;
1165 if (!aRealWords
.AppendElement(
1166 RealWord(aStart
+ wordOffset
, wordLen
,
1167 !state
.ShouldSkipWord(wordOffset
, wordLen
)),
1169 return NS_ERROR_OUT_OF_MEMORY
;