1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "mozInlineSpellWordUtil.h"
11 #include "mozilla/BinarySearch.h"
12 #include "mozilla/HTMLEditor.h"
13 #include "mozilla/Logging.h"
14 #include "mozilla/TextEditor.h"
15 #include "mozilla/dom/Element.h"
19 #include "nsComponentManagerUtils.h"
20 #include "nsUnicodeProperties.h"
21 #include "nsServiceManagerUtils.h"
22 #include "nsIContent.h"
23 #include "nsTextFragment.h"
25 #include "nsContentUtils.h"
28 using namespace mozilla
;
30 static LazyLogModule sInlineSpellWordUtilLog
{"InlineSpellWordUtil"};
// IsIgnorableCharacter
//
//    These characters are ones that we should ignore in input.

/**
 * 8-bit overload.
 *
 * @param ch the character to test.
 * @returns true if ch is the byte 0xAD (SOFT HYPHEN), false otherwise.
 */
inline bool IsIgnorableCharacter(char ch) {
  // Compare against a char-typed constant so the test works whether plain
  // char is signed or unsigned.
  return ch == static_cast<char>(0xAD);  // SOFT HYPHEN
}
/**
 * 16-bit overload.
 *
 * @param ch the UTF-16 code unit to test.
 * @returns true if ch is U+00AD or U+1806, false otherwise.
 */
inline bool IsIgnorableCharacter(char16_t ch) {
  return ch == 0xAD ||   // SOFT HYPHEN
         ch == 0x1806;   // MONGOLIAN TODO SOFT HYPHEN
}
// IsConditionalPunctuation
//
//    Some characters (like apostrophes) require characters on each side to be
//    part of a word, and are otherwise punctuation.

/**
 * 8-bit overload.
 *
 * @param ch the character to test.
 * @returns true if ch is an apostrophe or the byte 0xB7 (MIDDLE DOT).
 */
inline bool IsConditionalPunctuation(char ch) {
  return ch == '\'' ||                   // APOSTROPHE
         ch == static_cast<char>(0xB7);  // MIDDLE DOT
}

/**
 * 16-bit overload.
 *
 * @param ch the UTF-16 code unit to test.
 * @returns true if ch is U+0027, U+2019 or U+00B7.
 */
inline bool IsConditionalPunctuation(char16_t ch) {
  return ch == '\'' ||    // APOSTROPHE
         ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
         ch == 0x00B7;    // MIDDLE DOT
}

/**
 * @param ch the UTF-16 code unit to test.
 * @returns true if ch is one of '@', ':', '.', '/', '-' or is conditional
 *          punctuation.
 */
static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  return ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
         IsConditionalPunctuation(ch);
}

/**
 * 8-bit overload; gives the same answer as the 16-bit overload for the
 * code point with the same numeric value (0..255).
 *
 * @param ch the character to test.
 */
static bool IsAmbiguousDOMWordSeprator(char ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  // Widen through unsigned char: where plain char is signed, a direct cast
  // to char16_t sign-extends bytes >= 0x80 (0xB7 would become 0xFFB7) and
  // MIDDLE DOT would no longer be recognised.
  return IsAmbiguousDOMWordSeprator(
      static_cast<char16_t>(static_cast<unsigned char>(ch)));
}
// Determines if the given character should be considered as a DOM Word
// separator. Basically, this is whitespace, although it could also have
// certain punctuation that we know ALWAYS breaks words. This is important.
// For example, we can't have any punctuation that could appear in a URL
// or email address in this, because those need to always fit into a single
// DOM word.

/**
 * 8-bit overload.
 *
 * @param ch the character to test.
 * @returns true if ch is space, tab, line feed, carriage return or the byte
 *          0xA0 (no-break space); false otherwise.
 */
static bool IsDOMWordSeparator(char ch) {
  // simple spaces or no-break space
  return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' ||
         ch == static_cast<char>(0xA0);
}
/**
 * 16-bit overload.
 *
 * @param ch the UTF-16 code unit to test.
 * @returns true if ch is space, tab, line feed, carriage return, or one of
 *          U+00A0, U+2002, U+2003, U+2009, U+3000; false otherwise.
 */
static bool IsDOMWordSeparator(char16_t ch) {
  // simple spaces
  if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') return true;

  // complex spaces - check only if char isn't ASCII (uncommon)
  if (ch >= 0xA0) {
    switch (ch) {
      case 0x00A0:  // NO-BREAK SPACE
      case 0x2002:  // EN SPACE
      case 0x2003:  // EM SPACE
      case 0x2009:  // THIN SPACE
      case 0x3000:  // IDEOGRAPHIC SPACE
        return true;
      default:
        break;
    }
  }

  // otherwise not a space
  return false;
}
103 Maybe
<mozInlineSpellWordUtil
> mozInlineSpellWordUtil::Create(
104 const TextEditor
& aTextEditor
) {
105 mozInlineSpellWordUtil util
;
106 util
.mDocument
= aTextEditor
.GetDocument();
107 if (NS_WARN_IF(!util
.mDocument
)) {
111 util
.mIsContentEditableOrDesignMode
= !!aTextEditor
.AsHTMLEditor();
113 // Find the root node for the editor. For contenteditable the mRootNode could
114 // change to shadow root if the begin and end are inside the shadowDOM.
115 util
.mRootNode
= aTextEditor
.GetRoot();
116 if (NS_WARN_IF(!util
.mRootNode
)) {
119 return Some(std::move(util
));
122 static inline bool IsSpellCheckingTextNode(nsINode
* aNode
) {
123 nsIContent
* parent
= aNode
->GetParent();
125 parent
->IsAnyOfHTMLElements(nsGkAtoms::script
, nsGkAtoms::style
))
127 return aNode
->IsText();
130 typedef void (*OnLeaveNodeFunPtr
)(nsINode
* aNode
, void* aClosure
);
132 // Find the next node in the DOM tree in preorder.
133 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
134 // why we can't just use GetNextNode here, sadly.
135 static nsINode
* FindNextNode(nsINode
* aNode
, nsINode
* aRoot
,
136 OnLeaveNodeFunPtr aOnLeaveNode
, void* aClosure
) {
137 MOZ_ASSERT(aNode
, "Null starting node?");
139 nsINode
* next
= aNode
->GetFirstChild();
140 if (next
) return next
;
142 // Don't look at siblings or otherwise outside of aRoot
143 if (aNode
== aRoot
) return nullptr;
145 next
= aNode
->GetNextSibling();
146 if (next
) return next
;
151 aOnLeaveNode(aNode
, aClosure
);
154 next
= aNode
->GetParent();
155 if (next
== aRoot
|| !next
) return nullptr;
158 next
= aNode
->GetNextSibling();
159 if (next
) return next
;
163 // aNode is not a text node. Find the first text node starting at aNode/aOffset
164 // in a preorder DOM traversal.
165 static nsINode
* FindNextTextNode(nsINode
* aNode
, int32_t aOffset
,
167 MOZ_ASSERT(aNode
, "Null starting node?");
168 NS_ASSERTION(!IsSpellCheckingTextNode(aNode
),
169 "FindNextTextNode should start with a non-text node");
172 // Need to start at the aOffset'th child
173 nsIContent
* child
= aNode
->GetChildAt_Deprecated(aOffset
);
178 // aOffset was beyond the end of the child list.
179 // goto next node after the last descendant of aNode in
180 // a preorder DOM traversal.
181 checkNode
= aNode
->GetNextNonChildNode(aRoot
);
184 while (checkNode
&& !IsSpellCheckingTextNode(checkNode
)) {
185 checkNode
= checkNode
->GetNextNode(aRoot
);
190 // mozInlineSpellWordUtil::SetPositionAndEnd
192 // We have two ranges "hard" and "soft". The hard boundary is simply
193 // the scope of the root node. The soft boundary is that which is set
194 // by the caller of this class by calling this function. If this function is
195 // not called, the soft boundary is the same as the hard boundary.
197 // When we reach the soft boundary (mSoftEnd), we keep
198 // going until we reach the end of a word. This allows the caller to set the
199 // end of the range to anything, and we will always check whole multiples of
200 // words. When we reach the hard boundary we stop no matter what.
202 // There is no beginning soft boundary. This is because we only go to the
203 // previous node once, when finding the previous word boundary in
204 // SetPosition(). You might think of the soft boundary as being this initial
207 nsresult
mozInlineSpellWordUtil::SetPositionAndEnd(nsINode
* aPositionNode
,
208 int32_t aPositionOffset
,
210 int32_t aEndOffset
) {
211 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
,
212 ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__
, aPositionNode
,
213 aPositionOffset
, aEndNode
, aEndOffset
));
215 MOZ_ASSERT(aPositionNode
, "Null begin node?");
216 MOZ_ASSERT(aEndNode
, "Null end node?");
218 NS_ASSERTION(mRootNode
, "Not initialized");
  // Find an appropriate root if we are dealing with contenteditable nodes which
221 // are in the shadow DOM.
222 if (mIsContentEditableOrDesignMode
) {
223 nsINode
* rootNode
= aPositionNode
->SubtreeRoot();
224 if (rootNode
!= aEndNode
->SubtreeRoot()) {
225 return NS_ERROR_FAILURE
;
228 if (mozilla::dom::ShadowRoot::FromNode(rootNode
)) {
229 mRootNode
= rootNode
;
235 if (!IsSpellCheckingTextNode(aPositionNode
)) {
236 // Start at the start of the first text node after aNode/aOffset.
237 aPositionNode
= FindNextTextNode(aPositionNode
, aPositionOffset
, mRootNode
);
240 mSoftBegin
= NodeOffset(aPositionNode
, aPositionOffset
);
242 if (!IsSpellCheckingTextNode(aEndNode
)) {
243 // End at the start of the first text node after aEndNode/aEndOffset.
244 aEndNode
= FindNextTextNode(aEndNode
, aEndOffset
, mRootNode
);
247 mSoftEnd
= NodeOffset(aEndNode
, aEndOffset
);
249 nsresult rv
= EnsureWords();
254 int32_t textOffset
= MapDOMPositionToSoftTextOffset(mSoftBegin
);
255 if (textOffset
< 0) {
259 mNextWordIndex
= FindRealWordContaining(textOffset
, HINT_END
, true);
263 nsresult
mozInlineSpellWordUtil::EnsureWords() {
264 if (mSoftTextValid
) return NS_OK
;
266 nsresult rv
= BuildRealWords();
271 mSoftTextValid
= true;
275 nsresult
mozInlineSpellWordUtil::MakeRangeForWord(const RealWord
& aWord
,
278 MapSoftTextOffsetToDOMPosition(aWord
.mSoftTextOffset
, HINT_BEGIN
);
279 NodeOffset end
= MapSoftTextOffsetToDOMPosition(aWord
.EndOffset(), HINT_END
);
280 return MakeRange(begin
, end
, aRange
);
282 void mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord(
283 const RealWord
& aWord
, NodeOffsetRange
* aNodeOffsetRange
) {
285 MapSoftTextOffsetToDOMPosition(aWord
.mSoftTextOffset
, HINT_BEGIN
);
286 NodeOffset end
= MapSoftTextOffsetToDOMPosition(aWord
.EndOffset(), HINT_END
);
287 *aNodeOffsetRange
= NodeOffsetRange(begin
, end
);
290 // mozInlineSpellWordUtil::GetRangeForWord
292 nsresult
mozInlineSpellWordUtil::GetRangeForWord(nsINode
* aWordNode
,
295 // Set our soft end and start
296 NodeOffset
pt(aWordNode
, aWordOffset
);
298 if (!mSoftTextValid
|| pt
!= mSoftBegin
|| pt
!= mSoftEnd
) {
300 mSoftBegin
= mSoftEnd
= pt
;
301 nsresult rv
= EnsureWords();
307 int32_t offset
= MapDOMPositionToSoftTextOffset(pt
);
308 if (offset
< 0) return MakeRange(pt
, pt
, aRange
);
309 int32_t wordIndex
= FindRealWordContaining(offset
, HINT_BEGIN
, false);
310 if (wordIndex
< 0) return MakeRange(pt
, pt
, aRange
);
311 return MakeRangeForWord(mRealWords
[wordIndex
], aRange
);
314 // This is to fix characters that the spellchecker may not like
315 static void NormalizeWord(const nsAString
& aInput
, int32_t aPos
, int32_t aLen
,
316 nsAString
& aOutput
) {
318 for (int32_t i
= 0; i
< aLen
; i
++) {
319 char16_t ch
= aInput
.CharAt(i
+ aPos
);
321 // remove ignorable characters from the word
322 if (IsIgnorableCharacter(ch
)) continue;
324 // the spellchecker doesn't handle curly apostrophes in all languages
325 if (ch
== 0x2019) { // RIGHT SINGLE QUOTATION MARK
333 // mozInlineSpellWordUtil::GetNextWord
335 // FIXME-optimization: we shouldn't have to generate a range every single
336 // time. It would be better if the inline spellchecker didn't require a
337 // range unless the word was misspelled. This may or may not be possible.
339 bool mozInlineSpellWordUtil::GetNextWord(nsAString
& aText
,
340 NodeOffsetRange
* aNodeOffsetRange
,
341 bool* aSkipChecking
) {
342 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
,
343 ("%s: mNextWordIndex=%d", __FUNCTION__
, mNextWordIndex
));
345 if (mNextWordIndex
< 0 || mNextWordIndex
>= int32_t(mRealWords
.Length())) {
347 *aSkipChecking
= true;
351 const RealWord
& word
= mRealWords
[mNextWordIndex
];
352 MakeNodeOffsetRangeForWord(word
, aNodeOffsetRange
);
354 *aSkipChecking
= !word
.mCheckableWord
;
355 ::NormalizeWord(mSoftText
, word
.mSoftTextOffset
, word
.mLength
, aText
);
357 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
,
358 ("%s: returning: %s (skip=%d)", __FUNCTION__
,
359 NS_ConvertUTF16toUTF8(aText
).get(), *aSkipChecking
));
364 // mozInlineSpellWordUtil::MakeRange
366 // Convenience function for creating a range over the current document.
368 nsresult
mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin
, NodeOffset aEnd
,
369 nsRange
** aRange
) const {
370 NS_ENSURE_ARG_POINTER(aBegin
.mNode
);
372 return NS_ERROR_NOT_INITIALIZED
;
376 RefPtr
<nsRange
> range
= nsRange::Create(aBegin
.mNode
, aBegin
.mOffset
,
377 aEnd
.mNode
, aEnd
.mOffset
, error
);
378 if (NS_WARN_IF(error
.Failed())) {
379 return error
.StealNSResult();
382 range
.forget(aRange
);
387 already_AddRefed
<nsRange
> mozInlineSpellWordUtil::MakeRange(
388 const NodeOffsetRange
& aRange
) {
389 IgnoredErrorResult ignoredError
;
390 RefPtr
<nsRange
> range
=
391 nsRange::Create(aRange
.Begin().Node(), aRange
.Begin().Offset(),
392 aRange
.End().Node(), aRange
.End().Offset(), ignoredError
);
393 NS_WARNING_ASSERTION(!ignoredError
.Failed(), "Creating a range failed");
394 return range
.forget();
397 /*********** Word Splitting ************/
// classifies a given character in the DOM word
enum CharClass {
  CHAR_CLASS_WORD,         // character belongs to a word
  CHAR_CLASS_SEPARATOR,    // character separates words
  CHAR_CLASS_END_OF_INPUT  // no character: past the end of the text
};
406 // Encapsulates DOM-word to real-word splitting
408 struct MOZ_STACK_CLASS WordSplitState
{
409 const T
& mDOMWordText
;
410 int32_t mDOMWordOffset
;
411 CharClass mCurCharClass
;
413 explicit WordSplitState(const T
& aString
)
414 : mDOMWordText(aString
),
416 mCurCharClass(CHAR_CLASS_END_OF_INPUT
) {}
418 CharClass
ClassifyCharacter(int32_t aIndex
, bool aRecurse
) const;
420 void AdvanceThroughSeparators();
421 void AdvanceThroughWord();
  // Reports whether a special word like an email address or URL starts at
  // the current position: returns true if one is found, false otherwise.
  // This allows arbitrary word breaking rules to be used for these special
  // entities, as long as they can not contain whitespace.
427 bool IsSpecialWord() const;
429 // Similar to IsSpecialWord except that this takes a split word as
430 // input. This checks for things that do not require special word-breaking
432 bool ShouldSkipWord(int32_t aStart
, int32_t aLength
) const;
  // Checks to see if there's a DOM word separator before aOffset within
  // mDOMWordText. This function does not modify aSeparatorOffset when it
  // returns false.
436 bool GetDOMWordSeparatorOffset(int32_t aOffset
,
437 int32_t* aSeparatorOffset
) const;
439 char16_t
GetUnicharAt(int32_t aIndex
) const;
442 // WordSplitState::ClassifyCharacter
444 CharClass WordSplitState
<T
>::ClassifyCharacter(int32_t aIndex
,
445 bool aRecurse
) const {
446 NS_ASSERTION(aIndex
>= 0 && aIndex
<= int32_t(mDOMWordText
.Length()),
447 "Index out of range");
448 if (aIndex
== int32_t(mDOMWordText
.Length())) return CHAR_CLASS_SEPARATOR
;
450 // this will classify the character, we want to treat "ignorable" characters
451 // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
452 nsUGenCategory charCategory
=
453 mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex
));
454 if (charCategory
== nsUGenCategory::kLetter
||
455 IsIgnorableCharacter(mDOMWordText
[aIndex
]) ||
456 mDOMWordText
[aIndex
] == 0x200C /* ZWNJ */ ||
457 mDOMWordText
[aIndex
] == 0x200D /* ZWJ */)
458 return CHAR_CLASS_WORD
;
460 // If conditional punctuation is surrounded immediately on both sides by word
461 // characters it also counts as a word character.
462 if (IsConditionalPunctuation(mDOMWordText
[aIndex
])) {
464 // not allowed to look around, this punctuation counts like a separator
465 return CHAR_CLASS_SEPARATOR
;
468 // check the left-hand character
469 if (aIndex
== 0) return CHAR_CLASS_SEPARATOR
;
470 if (ClassifyCharacter(aIndex
- 1, false) != CHAR_CLASS_WORD
)
471 return CHAR_CLASS_SEPARATOR
;
    // If the previous character is a word-char, make sure that it's not a
473 // special dot character.
474 if (mDOMWordText
[aIndex
- 1] == '.') return CHAR_CLASS_SEPARATOR
;
476 // now we know left char is a word-char, check the right-hand character
477 if (aIndex
== int32_t(mDOMWordText
.Length() - 1)) {
478 return CHAR_CLASS_SEPARATOR
;
481 if (ClassifyCharacter(aIndex
+ 1, false) != CHAR_CLASS_WORD
)
482 return CHAR_CLASS_SEPARATOR
;
    // If the next character is a word-char, make sure that it's not a
484 // special dot character.
485 if (mDOMWordText
[aIndex
+ 1] == '.') return CHAR_CLASS_SEPARATOR
;
487 // char on either side is a word, this counts as a word
488 return CHAR_CLASS_WORD
;
491 // The dot character, if appearing at the end of a word, should
492 // be considered part of that word. Example: "etc.", or
494 if (aIndex
> 0 && mDOMWordText
[aIndex
] == '.' &&
495 mDOMWordText
[aIndex
- 1] != '.' &&
496 ClassifyCharacter(aIndex
- 1, false) != CHAR_CLASS_WORD
) {
497 return CHAR_CLASS_WORD
;
500 // all other punctuation
501 if (charCategory
== nsUGenCategory::kSeparator
||
502 charCategory
== nsUGenCategory::kOther
||
503 charCategory
== nsUGenCategory::kPunctuation
||
504 charCategory
== nsUGenCategory::kSymbol
) {
505 // Don't break on hyphens, as hunspell handles them on its own.
506 if (aIndex
> 0 && mDOMWordText
[aIndex
] == '-' &&
507 mDOMWordText
[aIndex
- 1] != '-' &&
508 ClassifyCharacter(aIndex
- 1, false) == CHAR_CLASS_WORD
) {
509 // A hyphen is only meaningful as a separator inside a word
510 // if the previous and next characters are a word character.
511 if (aIndex
== int32_t(mDOMWordText
.Length()) - 1)
512 return CHAR_CLASS_SEPARATOR
;
513 if (mDOMWordText
[aIndex
+ 1] != '.' &&
514 ClassifyCharacter(aIndex
+ 1, false) == CHAR_CLASS_WORD
)
515 return CHAR_CLASS_WORD
;
517 return CHAR_CLASS_SEPARATOR
;
520 // any other character counts as a word
521 return CHAR_CLASS_WORD
;
524 // WordSplitState::Advance
526 void WordSplitState
<T
>::Advance() {
527 NS_ASSERTION(mDOMWordOffset
>= 0, "Negative word index");
528 NS_ASSERTION(mDOMWordOffset
< (int32_t)mDOMWordText
.Length(),
529 "Length beyond end");
532 if (mDOMWordOffset
>= (int32_t)mDOMWordText
.Length())
533 mCurCharClass
= CHAR_CLASS_END_OF_INPUT
;
535 mCurCharClass
= ClassifyCharacter(mDOMWordOffset
, true);
538 // WordSplitState::AdvanceThroughSeparators
540 void WordSplitState
<T
>::AdvanceThroughSeparators() {
541 while (mCurCharClass
== CHAR_CLASS_SEPARATOR
) Advance();
544 // WordSplitState::AdvanceThroughWord
546 void WordSplitState
<T
>::AdvanceThroughWord() {
547 while (mCurCharClass
== CHAR_CLASS_WORD
) Advance();
550 // WordSplitState::IsSpecialWord
552 bool WordSplitState
<T
>::IsSpecialWord() const {
553 // Search for email addresses. We simply define these as any sequence of
554 // characters with an '@' character in the middle. The DOM word is already
  // split on whitespace, so we know that everything to the end is the address
556 int32_t firstColon
= -1;
557 for (int32_t i
= mDOMWordOffset
; i
< int32_t(mDOMWordText
.Length()); i
++) {
558 if (mDOMWordText
[i
] == '@') {
559 // only accept this if there are unambiguous word characters (don't bother
560 // recursing to disambiguate apostrophes) on each side. This prevents
561 // classifying, e.g. "@home" as an email address
563 // Use this condition to only accept words with '@' in the middle of
      // them. It works, but the inline spellchecker doesn't like this. The problem
565 // is that you type "fhsgfh@" that's a misspelled word followed by a
566 // symbol, but when you type another letter "fhsgfh@g" that first word
567 // need to be unmarked misspelled. It doesn't do this. it only checks the
568 // current position for potentially removing a spelling range.
569 if (i
> 0 && ClassifyCharacter(i
- 1, false) == CHAR_CLASS_WORD
&&
570 i
< (int32_t)mDOMWordText
.Length() - 1 &&
571 ClassifyCharacter(i
+ 1, false) == CHAR_CLASS_WORD
) {
574 } else if (mDOMWordText
[i
] == ':' && firstColon
< 0) {
577 // If the first colon is followed by a slash, consider it a URL
578 // This will catch things like asdf://foo.com
579 if (firstColon
< (int32_t)mDOMWordText
.Length() - 1 &&
580 mDOMWordText
[firstColon
+ 1] == '/') {
586 // Check the text before the first colon against some known protocols. It
587 // is impossible to check against all protocols, especially since you can
588 // plug in new protocols. We also don't want to waste time here checking
589 // against a lot of obscure protocols.
590 if (firstColon
> mDOMWordOffset
) {
592 Substring(mDOMWordText
, mDOMWordOffset
, firstColon
- mDOMWordOffset
));
593 if (protocol
.EqualsIgnoreCase("http") ||
594 protocol
.EqualsIgnoreCase("https") ||
595 protocol
.EqualsIgnoreCase("news") ||
596 protocol
.EqualsIgnoreCase("file") ||
597 protocol
.EqualsIgnoreCase("javascript") ||
598 protocol
.EqualsIgnoreCase("data") || protocol
.EqualsIgnoreCase("ftp")) {
603 // not anything special
607 // WordSplitState::ShouldSkipWord
609 bool WordSplitState
<T
>::ShouldSkipWord(int32_t aStart
, int32_t aLength
) const {
610 int32_t last
= aStart
+ aLength
;
612 // check to see if the word contains a digit
613 for (int32_t i
= aStart
; i
< last
; i
++) {
614 if (mozilla::unicode::GetGenCategory(GetUnicharAt(i
)) ==
615 nsUGenCategory::kNumber
) {
625 bool WordSplitState
<T
>::GetDOMWordSeparatorOffset(
626 int32_t aOffset
, int32_t* aSeparatorOffset
) const {
627 for (int32_t i
= aOffset
- 1; i
>= 0; --i
) {
628 if (IsDOMWordSeparator(mDOMWordText
[i
]) ||
629 (!IsAmbiguousDOMWordSeprator(mDOMWordText
[i
]) &&
630 ClassifyCharacter(i
, true) == CHAR_CLASS_SEPARATOR
)) {
631 // Be greedy, find as many separators as we can
632 for (int32_t j
= i
- 1; j
>= 0; --j
) {
633 if (IsDOMWordSeparator(mDOMWordText
[j
]) ||
634 (!IsAmbiguousDOMWordSeprator(mDOMWordText
[j
]) &&
635 ClassifyCharacter(j
, true) == CHAR_CLASS_SEPARATOR
)) {
641 *aSeparatorOffset
= i
;
649 char16_t WordSplitState
<nsDependentSubstring
>::GetUnicharAt(
650 int32_t aIndex
) const {
651 return mDOMWordText
[aIndex
];
655 char16_t WordSplitState
<nsDependentCSubstring
>::GetUnicharAt(
656 int32_t aIndex
) const {
657 return static_cast<char16_t
>(static_cast<uint8_t>(mDOMWordText
[aIndex
]));
660 static inline bool IsBRElement(nsINode
* aNode
) {
661 return aNode
->IsHTMLElement(nsGkAtoms::br
);
665 * Given a TextNode, checks to see if there's a DOM word separator before
666 * aBeforeOffset within it. This function does not modify aSeparatorOffset when
669 * @param aContent the TextNode to check.
670 * @param aBeforeOffset the offset in the TextNode before which we will search
671 * for the DOM separator. You can pass INT32_MAX to search the entire
672 * length of the string.
673 * @param aSeparatorOffset will be set to the offset of the first separator it
674 * encounters. Will not be written to if no separator is found.
675 * @returns True if it found a separator.
677 static bool TextNodeContainsDOMWordSeparator(nsIContent
* aContent
,
678 int32_t aBeforeOffset
,
679 int32_t* aSeparatorOffset
) {
680 const nsTextFragment
* textFragment
= aContent
->GetText();
681 NS_ASSERTION(textFragment
, "Where is our text?");
682 int32_t end
= std::min(aBeforeOffset
, int32_t(textFragment
->GetLength()));
684 if (textFragment
->Is2b()) {
685 nsDependentSubstring
targetText(textFragment
->Get2b(), end
);
686 WordSplitState
<nsDependentSubstring
> state(targetText
);
687 return state
.GetDOMWordSeparatorOffset(end
, aSeparatorOffset
);
690 nsDependentCSubstring
targetText(textFragment
->Get1b(), end
);
691 WordSplitState
<nsDependentCSubstring
> state(targetText
);
692 return state
.GetDOMWordSeparatorOffset(end
, aSeparatorOffset
);
696 * Check if there's a DOM word separator before aBeforeOffset in this node.
697 * Always returns true if it's a BR element.
698 * aSeparatorOffset is set to the index of the first character in the last
699 * separator if any is found (0 for BR elements).
701 * This function does not modify aSeparatorOffset when it returns false.
703 static bool ContainsDOMWordSeparator(nsINode
* aNode
, int32_t aBeforeOffset
,
704 int32_t* aSeparatorOffset
) {
705 if (IsBRElement(aNode
)) {
706 *aSeparatorOffset
= 0;
710 if (!IsSpellCheckingTextNode(aNode
)) return false;
712 return TextNodeContainsDOMWordSeparator(aNode
->AsContent(), aBeforeOffset
,
716 static bool IsBreakElement(nsINode
* aNode
) {
717 if (!aNode
->IsElement()) {
721 dom::Element
* element
= aNode
->AsElement();
722 if (element
->IsHTMLElement(nsGkAtoms::br
)) {
726 // If we don't have a frame, we don't consider ourselves a break
727 // element. In particular, words can span us.
728 nsIFrame
* frame
= element
->GetPrimaryFrame();
733 auto* disp
= frame
->StyleDisplay();
734 // Anything that's not an inline element is a break element.
735 // XXXbz should replaced inlines be break elements, though?
736 // Also should inline-block and such be break elements?
738 // FIXME(emilio): We should teach the spell checker to deal with generated
739 // content (it doesn't at all), then remove the IsListItem() check, as there
740 // could be no marker, etc...
741 return !disp
->IsInlineFlow() || disp
->IsListItem();
// Closure passed to CheckLeavingBreakElement; records whether a break
// element has been left.
struct CheckLeavingBreakElementClosure {
  bool mLeftBreakElement;
};
748 static void CheckLeavingBreakElement(nsINode
* aNode
, void* aClosure
) {
749 CheckLeavingBreakElementClosure
* cl
=
750 static_cast<CheckLeavingBreakElementClosure
*>(aClosure
);
751 if (!cl
->mLeftBreakElement
&& IsBreakElement(aNode
)) {
752 cl
->mLeftBreakElement
= true;
756 void mozInlineSpellWordUtil::NormalizeWord(nsAString
& aWord
) {
758 ::NormalizeWord(aWord
, 0, aWord
.Length(), result
);
762 void mozInlineSpellWordUtil::BuildSoftText() {
763 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
, ("%s", __FUNCTION__
));
  // First we have to work backwards from mSoftBegin to find a text node
766 // containing a DOM word separator, a non-inline-element
767 // boundary, or the hard start node. That's where we'll start building the
769 nsINode
* node
= mSoftBegin
.mNode
;
770 int32_t firstOffsetInNode
= 0;
771 int32_t checkBeforeOffset
= mSoftBegin
.mOffset
;
773 if (ContainsDOMWordSeparator(node
, checkBeforeOffset
, &firstOffsetInNode
)) {
774 if (node
== mSoftBegin
.mNode
) {
775 // If we find a word separator on the first node, look at the preceding
776 // word on the text node as well.
777 int32_t newOffset
= 0;
778 if (firstOffsetInNode
> 0) {
779 // Try to find the previous word boundary in the current node. If
780 // we can't find one, start checking previous sibling nodes (if any
781 // adjacent ones exist) to see if we can find any text nodes with
782 // DOM word separators. We bail out as soon as we see a node that is
783 // not a text node, or we run out of previous sibling nodes. In the
784 // event that we simply cannot find any preceding word separator, the
785 // offset is set to 0, and the soft text beginning node is set to the
786 // "most previous" text node before the original starting node, or
787 // kept at the original starting node if no previous text nodes exist.
788 if (!ContainsDOMWordSeparator(node
, firstOffsetInNode
- 1,
790 nsIContent
* prevNode
= node
->GetPreviousSibling();
791 while (prevNode
&& IsSpellCheckingTextNode(prevNode
)) {
792 mSoftBegin
.mNode
= prevNode
;
793 if (TextNodeContainsDOMWordSeparator(prevNode
, INT32_MAX
,
797 prevNode
= prevNode
->GetPreviousSibling();
801 firstOffsetInNode
= newOffset
;
802 mSoftBegin
.mOffset
= newOffset
;
806 checkBeforeOffset
= INT32_MAX
;
807 if (IsBreakElement(node
)) {
808 // Since GetPreviousContent follows tree *preorder*, we're about to
809 // traverse up out of 'node'. Since node induces breaks (e.g., it's a
810 // block), don't bother trying to look outside it, just stop now.
813 // GetPreviousContent below expects mRootNode to be an ancestor of node.
814 if (!node
->IsInclusiveDescendantOf(mRootNode
)) {
817 node
= node
->GetPreviousContent(mRootNode
);
820 // Now build up the string moving forward through the DOM until we reach
821 // the soft end and *then* see a DOM word separator, a non-inline-element
822 // boundary, or the hard end node.
823 mSoftText
.Truncate();
824 mSoftTextDOMMapping
.Clear();
825 bool seenSoftEnd
= false;
826 // Leave this outside the loop so large heap string allocations can be reused
829 if (node
== mSoftEnd
.mNode
) {
834 if (IsSpellCheckingTextNode(node
)) {
835 nsIContent
* content
= static_cast<nsIContent
*>(node
);
836 NS_ASSERTION(content
, "Where is our content?");
837 const nsTextFragment
* textFragment
= content
->GetText();
838 NS_ASSERTION(textFragment
, "Where is our text?");
839 int32_t lastOffsetInNode
= textFragment
->GetLength();
842 // check whether we can stop after this
843 for (int32_t i
= node
== mSoftEnd
.mNode
? mSoftEnd
.mOffset
: 0;
844 i
< int32_t(textFragment
->GetLength()); ++i
) {
845 if (IsDOMWordSeparator(textFragment
->CharAt(i
))) {
847 // stop at the first separator after the soft end point
848 lastOffsetInNode
= i
;
854 if (firstOffsetInNode
< lastOffsetInNode
) {
855 int32_t len
= lastOffsetInNode
- firstOffsetInNode
;
856 mSoftTextDOMMapping
.AppendElement(DOMTextMapping(
857 NodeOffset(node
, firstOffsetInNode
), mSoftText
.Length(), len
));
859 bool ok
= textFragment
->AppendTo(mSoftText
, firstOffsetInNode
, len
,
862 // probably out of memory, remove from mSoftTextDOMMapping
863 mSoftTextDOMMapping
.RemoveLastElement();
868 firstOffsetInNode
= 0;
873 CheckLeavingBreakElementClosure closure
= {false};
874 node
= FindNextNode(node
, mRootNode
, CheckLeavingBreakElement
, &closure
);
875 if (closure
.mLeftBreakElement
|| (node
&& IsBreakElement(node
))) {
876 // We left, or are entering, a break element (e.g., block). Maybe we can
878 if (seenSoftEnd
) break;
880 mSoftText
.Append(' ');
884 MOZ_LOG(sInlineSpellWordUtilLog
, LogLevel::Debug
,
885 ("%s: got DOM string: %s", __FUNCTION__
,
886 NS_ConvertUTF16toUTF8(mSoftText
).get()));
889 nsresult
mozInlineSpellWordUtil::BuildRealWords() {
890 // This is pretty simple. We just have to walk mSoftText, tokenizing it
891 // into "real words".
892 // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
893 // SplitDOMWord on each of those DOM words
894 int32_t wordStart
= -1;
896 for (int32_t i
= 0; i
< int32_t(mSoftText
.Length()); ++i
) {
897 if (IsDOMWordSeparator(mSoftText
.CharAt(i
))) {
898 if (wordStart
>= 0) {
899 nsresult rv
= SplitDOMWord(wordStart
, i
);
911 if (wordStart
>= 0) {
912 nsresult rv
= SplitDOMWord(wordStart
, mSoftText
.Length());
921 /*********** DOM/realwords<->mSoftText mapping functions ************/
923 int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(
924 NodeOffset aNodeOffset
) {
925 if (!mSoftTextValid
) {
926 NS_ERROR("Soft text must be valid if we're to map into it");
930 for (int32_t i
= 0; i
< int32_t(mSoftTextDOMMapping
.Length()); ++i
) {
931 const DOMTextMapping
& map
= mSoftTextDOMMapping
[i
];
932 if (map
.mNodeOffset
.mNode
== aNodeOffset
.mNode
) {
933 // Allow offsets at either end of the string, in particular, allow the
934 // offset that's at the end of the contributed string
935 int32_t offsetInContributedString
=
936 aNodeOffset
.mOffset
- map
.mNodeOffset
.mOffset
;
937 if (offsetInContributedString
>= 0 &&
938 offsetInContributedString
<= map
.mLength
)
939 return map
.mSoftTextOffset
+ offsetInContributedString
;
/**
 * Comparator functor for BinarySearchIf: compares a fixed soft-text offset
 * against an element's mSoftTextOffset.
 *
 * T must expose an mSoftTextOffset member comparable with int32_t.
 */
template <class T>
class FirstLargerOffset {
  int32_t mSoftTextOffset;

 public:
  explicit FirstLargerOffset(int32_t aSoftTextOffset)
      : mSoftTextOffset(aSoftTextOffset) {}

  /**
   * @returns -1 if the stored offset is strictly less than t.mSoftTextOffset,
   *          1 otherwise.
   */
  int operator()(const T& t) const {
    // We want the first larger offset, so never return 0 (which would
    // short-circuit evaluation before finding the last such offset).
    return mSoftTextOffset < t.mSoftTextOffset ? -1 : 1;
  }
};
963 bool FindLastNongreaterOffset(const nsTArray
<T
>& aContainer
,
964 int32_t aSoftTextOffset
, size_t* aIndex
) {
965 if (aContainer
.Length() == 0) {
969 BinarySearchIf(aContainer
, 0, aContainer
.Length(),
970 FirstLargerOffset
<T
>(aSoftTextOffset
), aIndex
);
972 // There was at least one mapping with offset <= aSoftTextOffset. Step back
973 // to find the last element with |mSoftTextOffset <= aSoftTextOffset|.
976 // Every mapping had offset greater than aSoftTextOffset.
977 MOZ_ASSERT(aContainer
[*aIndex
].mSoftTextOffset
> aSoftTextOffset
);
984 NodeOffset
mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(
985 int32_t aSoftTextOffset
, DOMMapHint aHint
) {
986 NS_ASSERTION(mSoftTextValid
,
987 "Soft text must be valid if we're to map out of it");
988 if (!mSoftTextValid
) return NodeOffset(nullptr, -1);
990 // Find the last mapping, if any, such that mSoftTextOffset <= aSoftTextOffset
993 FindLastNongreaterOffset(mSoftTextDOMMapping
, aSoftTextOffset
, &index
);
995 return NodeOffset(nullptr, -1);
998 // 'index' is now the last mapping, if any, such that
999 // mSoftTextOffset <= aSoftTextOffset.
  // If we're doing HINT_END, then we may want to return the end of
  // the previous mapping instead of the start of this mapping
1002 if (aHint
== HINT_END
&& index
> 0) {
1003 const DOMTextMapping
& map
= mSoftTextDOMMapping
[index
- 1];
1004 if (map
.mSoftTextOffset
+ map
.mLength
== aSoftTextOffset
)
1005 return NodeOffset(map
.mNodeOffset
.mNode
,
1006 map
.mNodeOffset
.mOffset
+ map
.mLength
);
  // We allow ourselves to return the end of this mapping even if we're
  // doing HINT_BEGIN. This will only happen if there is no mapping which this
  // point is the start of. I'm not 100% sure this is OK...
1012 const DOMTextMapping
& map
= mSoftTextDOMMapping
[index
];
1013 int32_t offset
= aSoftTextOffset
- map
.mSoftTextOffset
;
1014 if (offset
>= 0 && offset
<= map
.mLength
)
1015 return NodeOffset(map
.mNodeOffset
.mNode
, map
.mNodeOffset
.mOffset
+ offset
);
1017 return NodeOffset(nullptr, -1);
1021 void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint
,
1022 nsACString
& aResult
) {
1025 aResult
.AssignLiteral("begin");
1028 aResult
.AssignLiteral("end");
1033 int32_t mozInlineSpellWordUtil::FindRealWordContaining(
1034 int32_t aSoftTextOffset
, DOMMapHint aHint
, bool aSearchForward
) const {
1035 if (MOZ_LOG_TEST(sInlineSpellWordUtilLog
, LogLevel::Debug
)) {
1037 mozInlineSpellWordUtil::ToString(aHint
, hint
);
1040 sInlineSpellWordUtilLog
, LogLevel::Debug
,
1041 ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__
,
1042 aSoftTextOffset
, hint
.get(), static_cast<int32_t>(aSearchForward
)));
1045 NS_ASSERTION(mSoftTextValid
,
1046 "Soft text must be valid if we're to map out of it");
1047 if (!mSoftTextValid
) return -1;
1049 // Find the last word, if any, such that mSoftTextOffset <= aSoftTextOffset
1051 bool found
= FindLastNongreaterOffset(mRealWords
, aSoftTextOffset
, &index
);
1056 // 'index' is now the last word, if any, such that
1057 // mSoftTextOffset <= aSoftTextOffset.
  // If we're doing HINT_END, then we may want to return the end of
  // the previous word instead of the start of this word
1060 if (aHint
== HINT_END
&& index
> 0) {
1061 const RealWord
& word
= mRealWords
[index
- 1];
1062 if (word
.mSoftTextOffset
+ word
.mLength
== aSoftTextOffset
)
  // We allow ourselves to return the end of this word even if we're
  // doing HINT_BEGIN. This will only happen if there is no word which this
  // point is the start of. I'm not 100% sure this is OK...
1069 const RealWord
& word
= mRealWords
[index
];
1070 int32_t offset
= aSoftTextOffset
- word
.mSoftTextOffset
;
1071 if (offset
>= 0 && offset
<= static_cast<int32_t>(word
.mLength
)) return index
;
1073 if (aSearchForward
) {
1074 if (mRealWords
[0].mSoftTextOffset
> aSoftTextOffset
) {
1075 // All words have mSoftTextOffset > aSoftTextOffset
1078 // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.
1079 // Word index+1, if it exists, will be the first with
1080 // mSoftTextOffset > aSoftTextOffset.
1081 if (index
+ 1 < mRealWords
.Length()) return index
+ 1;
1087 // mozInlineSpellWordUtil::SplitDOMWord
1089 nsresult
mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart
, int32_t aEnd
) {
1090 nsDependentSubstring
targetText(mSoftText
, aStart
, aEnd
- aStart
);
1091 WordSplitState
<nsDependentSubstring
> state(targetText
);
1092 state
.mCurCharClass
= state
.ClassifyCharacter(0, true);
1094 state
.AdvanceThroughSeparators();
1095 if (state
.mCurCharClass
!= CHAR_CLASS_END_OF_INPUT
&& state
.IsSpecialWord()) {
1096 int32_t specialWordLength
=
1097 state
.mDOMWordText
.Length() - state
.mDOMWordOffset
;
1098 if (!mRealWords
.AppendElement(
1099 RealWord(aStart
+ state
.mDOMWordOffset
, specialWordLength
, false),
1101 return NS_ERROR_OUT_OF_MEMORY
;
1107 while (state
.mCurCharClass
!= CHAR_CLASS_END_OF_INPUT
) {
1108 state
.AdvanceThroughSeparators();
1109 if (state
.mCurCharClass
== CHAR_CLASS_END_OF_INPUT
) break;
1111 // save the beginning of the word
1112 int32_t wordOffset
= state
.mDOMWordOffset
;
1114 // find the end of the word
1115 state
.AdvanceThroughWord();
1116 int32_t wordLen
= state
.mDOMWordOffset
- wordOffset
;
1117 if (!mRealWords
.AppendElement(
1118 RealWord(aStart
+ wordOffset
, wordLen
,
1119 !state
.ShouldSkipWord(wordOffset
, wordLen
)),
1121 return NS_ERROR_OUT_OF_MEMORY
;