Bug 1888590 - Mark some subtests on trusted-types-event-handlers.html as failing...
[gecko.git] / extensions / spellcheck / src / mozInlineSpellWordUtil.cpp
blob17a661ade3825f5e62b70008ebbbad2ca64fa3a2
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "mozInlineSpellWordUtil.h"
8 #include <algorithm>
9 #include <utility>
11 #include "mozilla/BinarySearch.h"
12 #include "mozilla/EditorBase.h"
13 #include "mozilla/HTMLEditor.h"
14 #include "mozilla/Logging.h"
15 #include "mozilla/dom/Element.h"
17 #include "nsDebug.h"
18 #include "nsAtom.h"
19 #include "nsComponentManagerUtils.h"
20 #include "nsUnicodeProperties.h"
21 #include "nsServiceManagerUtils.h"
22 #include "nsIContent.h"
23 #include "nsTextFragment.h"
24 #include "nsRange.h"
25 #include "nsContentUtils.h"
26 #include "nsIFrame.h"
28 using namespace mozilla;
30 static LazyLogModule sInlineSpellWordUtilLog{"InlineSpellWordUtil"};
// IsIgnorableCharacter
//
//    Reports whether a character is one of the soft hyphens that should be
//    ignored in input.

// 8-bit text: only U+00AD SOFT HYPHEN is representable.
inline bool IsIgnorableCharacter(char ch) {
  const char kSoftHyphen = static_cast<char>(0xAD);
  return ch == kSoftHyphen;
}

// 16-bit text: also recognizes the Mongolian soft hyphen.
inline bool IsIgnorableCharacter(char16_t ch) {
  switch (ch) {
    case 0x00AD:  // SOFT HYPHEN
    case 0x1806:  // MONGOLIAN TODO SOFT HYPHEN
      return true;
    default:
      return false;
  }
}
// IsConditionalPunctuation
//
//    Some characters (like apostrophes) require characters on each side to be
//    part of a word, and are otherwise punctuation.

// 8-bit text overload.
inline bool IsConditionalPunctuation(char ch) {
  return (ch == '\'' ||                   // APOSTROPHE
          ch == static_cast<char>(0xB7));  // MIDDLE DOT
}

// 16-bit text overload; additionally accepts the curly apostrophe.
inline bool IsConditionalPunctuation(char16_t ch) {
  return (ch == '\'' ||    // APOSTROPHE
          ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
          ch == 0x00B7);   // MIDDLE DOT
}

// Reports whether |ch| belongs to the class of characters that may be
// CHAR_CLASS_SEPARATOR, but only depending on context.
static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
  return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
          IsConditionalPunctuation(ch));
}

// 8-bit overload of the above.
static bool IsAmbiguousDOMWordSeprator(char ch) {
  // Widen through unsigned char: where plain char is signed, a direct
  // char -> char16_t cast sign-extends bytes >= 0x80 (0xB7 would become
  // 0xFFB7), so MIDDLE DOT would never be recognized by the 16-bit overload.
  return IsAmbiguousDOMWordSeprator(
      static_cast<char16_t>(static_cast<unsigned char>(ch)));
}
// IsDOMWordSeparator
//
//    Decides whether a character separates DOM words. That essentially means
//    whitespace; punctuation would only belong here if it ALWAYS broke words.
//    Nothing that may occur inside a URL or an email address can be listed,
//    because those must always fit into a single DOM word.

// 8-bit text overload: ASCII whitespace plus the Latin-1 no-break space.
static bool IsDOMWordSeparator(char ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return ch == static_cast<char>(0xA0);  // NO-BREAK SPACE
  }
}

// 16-bit text overload.
static bool IsDOMWordSeparator(char16_t ch) {
  // Common case: below U+00A0 only the simple ASCII spaces count.
  if (ch < 0xA0) {
    return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
  }

  // Uncommon case: the complex (non-ASCII) spaces.
  switch (ch) {
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2002:  // EN SPACE
    case 0x2003:  // EM SPACE
    case 0x2009:  // THIN SPACE
    case 0x3000:  // IDEOGRAPHIC SPACE
      return true;
    default:
      return false;
  }
}
102 bool NodeOffset::operator==(
103 const mozilla::RangeBoundary& aRangeBoundary) const {
104 if (aRangeBoundary.Container() != mNode) {
105 return false;
108 const Maybe<uint32_t> rangeBoundaryOffset =
109 aRangeBoundary.Offset(RangeBoundary::OffsetFilter::kValidOffsets);
111 MOZ_ASSERT(mOffset >= 0);
112 return rangeBoundaryOffset &&
113 (*rangeBoundaryOffset == static_cast<uint32_t>(mOffset));
116 bool NodeOffsetRange::operator==(const nsRange& aRange) const {
117 return mBegin == aRange.StartRef() && mEnd == aRange.EndRef();
120 // static
121 Maybe<mozInlineSpellWordUtil> mozInlineSpellWordUtil::Create(
122 const EditorBase& aEditorBase) {
123 dom::Document* document = aEditorBase.GetDocument();
124 if (NS_WARN_IF(!document)) {
125 return Nothing();
128 const bool isContentEditableOrDesignMode = aEditorBase.IsHTMLEditor();
130 // Find the root node for the editor. For contenteditable the mRootNode could
131 // change to shadow root if the begin and end are inside the shadowDOM.
132 nsINode* rootNode = aEditorBase.GetRoot();
133 if (NS_WARN_IF(!rootNode)) {
134 return Nothing();
137 mozInlineSpellWordUtil util{*document, isContentEditableOrDesignMode,
138 *rootNode};
139 return Some(std::move(util));
142 static inline bool IsSpellCheckingTextNode(nsINode* aNode) {
143 nsIContent* parent = aNode->GetParent();
144 if (parent &&
145 parent->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style))
146 return false;
147 return aNode->IsText();
150 typedef void (*OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
152 // Find the next node in the DOM tree in preorder.
153 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
154 // why we can't just use GetNextNode here, sadly.
155 static nsINode* FindNextNode(nsINode* aNode, const nsINode* aRoot,
156 OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) {
157 MOZ_ASSERT(aNode, "Null starting node?");
159 nsINode* next = aNode->GetFirstChild();
160 if (next) return next;
162 // Don't look at siblings or otherwise outside of aRoot
163 if (aNode == aRoot) return nullptr;
165 next = aNode->GetNextSibling();
166 if (next) return next;
168 // Go up
169 for (;;) {
170 if (aOnLeaveNode) {
171 aOnLeaveNode(aNode, aClosure);
174 next = aNode->GetParent();
175 if (next == aRoot || !next) return nullptr;
176 aNode = next;
178 next = aNode->GetNextSibling();
179 if (next) return next;
183 // aNode is not a text node. Find the first text node starting at aNode/aOffset
184 // in a preorder DOM traversal.
185 static nsINode* FindNextTextNode(nsINode* aNode, int32_t aOffset,
186 const nsINode* aRoot) {
187 MOZ_ASSERT(aNode, "Null starting node?");
188 MOZ_ASSERT(!IsSpellCheckingTextNode(aNode),
189 "FindNextTextNode should start with a non-text node");
191 nsINode* checkNode;
192 // Need to start at the aOffset'th child
193 nsIContent* child = aNode->GetChildAt_Deprecated(aOffset);
195 if (child) {
196 checkNode = child;
197 } else {
198 // aOffset was beyond the end of the child list.
199 // goto next node after the last descendant of aNode in
200 // a preorder DOM traversal.
201 checkNode = aNode->GetNextNonChildNode(aRoot);
204 while (checkNode && !IsSpellCheckingTextNode(checkNode)) {
205 checkNode = checkNode->GetNextNode(aRoot);
207 return checkNode;
210 // mozInlineSpellWordUtil::SetPositionAndEnd
212 // We have two ranges "hard" and "soft". The hard boundary is simply
213 // the scope of the root node. The soft boundary is that which is set
214 // by the caller of this class by calling this function. If this function is
215 // not called, the soft boundary is the same as the hard boundary.
217 // When we reach the soft boundary (mSoftText.GetEnd()), we keep
218 // going until we reach the end of a word. This allows the caller to set the
219 // end of the range to anything, and we will always check whole multiples of
220 // words. When we reach the hard boundary we stop no matter what.
222 // There is no beginning soft boundary. This is because we only go to the
223 // previous node once, when finding the previous word boundary in
224 // SetPosition(). You might think of the soft boundary as being this initial
225 // position.
227 nsresult mozInlineSpellWordUtil::SetPositionAndEnd(nsINode* aPositionNode,
228 int32_t aPositionOffset,
229 nsINode* aEndNode,
230 int32_t aEndOffset) {
231 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
232 ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__, aPositionNode,
233 aPositionOffset, aEndNode, aEndOffset));
235 MOZ_ASSERT(aPositionNode, "Null begin node?");
236 MOZ_ASSERT(aEndNode, "Null end node?");
238 MOZ_ASSERT(mRootNode, "Not initialized");
240 // Find a appropriate root if we are dealing with contenteditable nodes which
241 // are in the shadow DOM.
242 if (mIsContentEditableOrDesignMode) {
243 nsINode* rootNode = aPositionNode->SubtreeRoot();
244 if (rootNode != aEndNode->SubtreeRoot()) {
245 return NS_ERROR_FAILURE;
248 if (mozilla::dom::ShadowRoot::FromNode(rootNode)) {
249 mRootNode = rootNode;
253 mSoftText.Invalidate();
255 if (!IsSpellCheckingTextNode(aPositionNode)) {
256 // Start at the start of the first text node after aNode/aOffset.
257 aPositionNode = FindNextTextNode(aPositionNode, aPositionOffset, mRootNode);
258 aPositionOffset = 0;
260 NodeOffset softBegin = NodeOffset(aPositionNode, aPositionOffset);
262 if (!IsSpellCheckingTextNode(aEndNode)) {
263 // End at the start of the first text node after aEndNode/aEndOffset.
264 aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
265 aEndOffset = 0;
267 NodeOffset softEnd = NodeOffset(aEndNode, aEndOffset);
269 nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
270 if (NS_FAILED(rv)) {
271 return rv;
274 int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftText.GetBegin());
275 if (textOffset < 0) {
276 return NS_OK;
279 mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
280 return NS_OK;
283 nsresult mozInlineSpellWordUtil::EnsureWords(NodeOffset aSoftBegin,
284 NodeOffset aSoftEnd) {
285 if (mSoftText.mIsValid) return NS_OK;
286 mSoftText.AdjustBeginAndBuildText(std::move(aSoftBegin), std::move(aSoftEnd),
287 mRootNode);
289 mRealWords.Clear();
290 Result<RealWords, nsresult> realWords = BuildRealWords();
291 if (realWords.isErr()) {
292 return realWords.unwrapErr();
295 mRealWords = realWords.unwrap();
296 mSoftText.mIsValid = true;
297 return NS_OK;
300 nsresult mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord,
301 nsRange** aRange) const {
302 NodeOffset begin =
303 MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
304 NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
305 return MakeRange(begin, end, aRange);
307 void mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord(
308 const RealWord& aWord, NodeOffsetRange* aNodeOffsetRange) {
309 NodeOffset begin =
310 MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
311 NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
312 *aNodeOffsetRange = NodeOffsetRange(begin, end);
315 // mozInlineSpellWordUtil::GetRangeForWord
317 nsresult mozInlineSpellWordUtil::GetRangeForWord(nsINode* aWordNode,
318 int32_t aWordOffset,
319 nsRange** aRange) {
320 // Set our soft end and start
321 NodeOffset pt(aWordNode, aWordOffset);
323 if (!mSoftText.mIsValid || pt != mSoftText.GetBegin() ||
324 pt != mSoftText.GetEnd()) {
325 mSoftText.Invalidate();
326 NodeOffset softBegin = pt;
327 NodeOffset softEnd = pt;
328 nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
329 if (NS_FAILED(rv)) {
330 return rv;
334 int32_t offset = MapDOMPositionToSoftTextOffset(pt);
335 if (offset < 0) return MakeRange(pt, pt, aRange);
336 int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
337 if (wordIndex < 0) return MakeRange(pt, pt, aRange);
338 return MakeRangeForWord(mRealWords[wordIndex], aRange);
341 // This is to fix characters that the spellchecker may not like
342 static void NormalizeWord(const nsAString& aInput, int32_t aPos, int32_t aLen,
343 nsAString& aOutput) {
344 aOutput.Truncate();
345 for (int32_t i = 0; i < aLen; i++) {
346 char16_t ch = aInput.CharAt(i + aPos);
348 // remove ignorable characters from the word
349 if (IsIgnorableCharacter(ch)) continue;
351 // the spellchecker doesn't handle curly apostrophes in all languages
352 if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
353 ch = '\'';
356 aOutput.Append(ch);
360 // mozInlineSpellWordUtil::GetNextWord
362 // FIXME-optimization: we shouldn't have to generate a range every single
363 // time. It would be better if the inline spellchecker didn't require a
364 // range unless the word was misspelled. This may or may not be possible.
366 bool mozInlineSpellWordUtil::GetNextWord(Word& aWord) {
367 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
368 ("%s: mNextWordIndex=%d", __FUNCTION__, mNextWordIndex));
370 if (mNextWordIndex < 0 || mNextWordIndex >= int32_t(mRealWords.Length())) {
371 mNextWordIndex = -1;
372 aWord.mSkipChecking = true;
373 return false;
376 const RealWord& realWord = mRealWords[mNextWordIndex];
377 MakeNodeOffsetRangeForWord(realWord, &aWord.mNodeOffsetRange);
378 ++mNextWordIndex;
379 aWord.mSkipChecking = !realWord.mCheckableWord;
380 ::NormalizeWord(mSoftText.GetValue(), realWord.mSoftTextOffset,
381 realWord.mLength, aWord.mText);
383 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
384 ("%s: returning: %s (skip=%d)", __FUNCTION__,
385 NS_ConvertUTF16toUTF8(aWord.mText).get(), aWord.mSkipChecking));
387 return true;
390 // mozInlineSpellWordUtil::MakeRange
392 // Convenience function for creating a range over the current document.
394 nsresult mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
395 nsRange** aRange) const {
396 NS_ENSURE_ARG_POINTER(aBegin.mNode);
397 if (!mDocument) {
398 return NS_ERROR_NOT_INITIALIZED;
401 ErrorResult error;
402 RefPtr<nsRange> range = nsRange::Create(aBegin.mNode, aBegin.mOffset,
403 aEnd.mNode, aEnd.mOffset, error);
404 if (NS_WARN_IF(error.Failed())) {
405 return error.StealNSResult();
407 MOZ_ASSERT(range);
408 range.forget(aRange);
409 return NS_OK;
412 // static
413 already_AddRefed<nsRange> mozInlineSpellWordUtil::MakeRange(
414 const NodeOffsetRange& aRange) {
415 IgnoredErrorResult ignoredError;
416 RefPtr<nsRange> range =
417 nsRange::Create(aRange.Begin().Node(), aRange.Begin().Offset(),
418 aRange.End().Node(), aRange.End().Offset(), ignoredError);
419 NS_WARNING_ASSERTION(!ignoredError.Failed(), "Creating a range failed");
420 return range.forget();
423 /*********** Word Splitting ************/
// Classification of a single character of a DOM word.
enum CharClass {
  // The character is part of a word.
  CHAR_CLASS_WORD = 0,
  // The character separates words.
  CHAR_CLASS_SEPARATOR = 1,
  // There is no character: the end of the input has been reached.
  CHAR_CLASS_END_OF_INPUT = 2
};
432 // Encapsulates DOM-word to real-word splitting
433 template <class T>
434 struct MOZ_STACK_CLASS WordSplitState {
435 const T& mDOMWordText;
436 int32_t mDOMWordOffset;
437 CharClass mCurCharClass;
439 explicit WordSplitState(const T& aString)
440 : mDOMWordText(aString),
441 mDOMWordOffset(0),
442 mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
444 CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
445 void Advance();
446 void AdvanceThroughSeparators();
447 void AdvanceThroughWord();
449 // Finds special words like email addresses and URLs that may start at the
450 // current position, and returns their length, or 0 if not found. This allows
451 // arbitrary word breaking rules to be used for these special entities, as
452 // long as they can not contain whitespace.
453 bool IsSpecialWord() const;
455 // Similar to IsSpecialWord except that this takes a split word as
456 // input. This checks for things that do not require special word-breaking
457 // rules.
458 bool ShouldSkipWord(int32_t aStart, int32_t aLength) const;
460 // Finds the last sequence of DOM word separators before aBeforeOffset and
461 // returns the offset to its first element.
462 Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
463 int32_t aBeforeOffset) const;
465 char16_t GetUnicharAt(int32_t aIndex) const;
468 // WordSplitState::ClassifyCharacter
469 template <class T>
470 CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex,
471 bool aRecurse) const {
472 MOZ_ASSERT(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
473 "Index out of range");
474 if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR;
476 // this will classify the character, we want to treat "ignorable" characters
477 // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
478 nsUGenCategory charCategory =
479 mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex));
480 if (charCategory == nsUGenCategory::kLetter ||
481 IsIgnorableCharacter(mDOMWordText[aIndex]) ||
482 mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
483 mDOMWordText[aIndex] == 0x200D /* ZWJ */)
484 return CHAR_CLASS_WORD;
486 // If conditional punctuation is surrounded immediately on both sides by word
487 // characters it also counts as a word character.
488 if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
489 if (!aRecurse) {
490 // not allowed to look around, this punctuation counts like a separator
491 return CHAR_CLASS_SEPARATOR;
494 // check the left-hand character
495 if (aIndex == 0) return CHAR_CLASS_SEPARATOR;
496 if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
497 return CHAR_CLASS_SEPARATOR;
498 // If the previous charatcer is a word-char, make sure that it's not a
499 // special dot character.
500 if (mDOMWordText[aIndex - 1] == '.') return CHAR_CLASS_SEPARATOR;
502 // now we know left char is a word-char, check the right-hand character
503 if (aIndex == int32_t(mDOMWordText.Length() - 1)) {
504 return CHAR_CLASS_SEPARATOR;
507 if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
508 return CHAR_CLASS_SEPARATOR;
509 // If the next charatcer is a word-char, make sure that it's not a
510 // special dot character.
511 if (mDOMWordText[aIndex + 1] == '.') return CHAR_CLASS_SEPARATOR;
513 // char on either side is a word, this counts as a word
514 return CHAR_CLASS_WORD;
517 // The dot character, if appearing at the end of a word, should
518 // be considered part of that word. Example: "etc.", or
519 // abbreviations
520 if (aIndex > 0 && mDOMWordText[aIndex] == '.' &&
521 mDOMWordText[aIndex - 1] != '.' &&
522 ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
523 return CHAR_CLASS_WORD;
526 // all other punctuation
527 if (charCategory == nsUGenCategory::kSeparator ||
528 charCategory == nsUGenCategory::kOther ||
529 charCategory == nsUGenCategory::kPunctuation ||
530 charCategory == nsUGenCategory::kSymbol) {
531 // Don't break on hyphens, as hunspell handles them on its own.
532 if (aIndex > 0 && mDOMWordText[aIndex] == '-' &&
533 mDOMWordText[aIndex - 1] != '-' &&
534 ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
535 // A hyphen is only meaningful as a separator inside a word
536 // if the previous and next characters are a word character.
537 if (aIndex == int32_t(mDOMWordText.Length()) - 1)
538 return CHAR_CLASS_SEPARATOR;
539 if (mDOMWordText[aIndex + 1] != '.' &&
540 ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
541 return CHAR_CLASS_WORD;
543 return CHAR_CLASS_SEPARATOR;
546 // any other character counts as a word
547 return CHAR_CLASS_WORD;
550 // WordSplitState::Advance
551 template <class T>
552 void WordSplitState<T>::Advance() {
553 MOZ_ASSERT(mDOMWordOffset >= 0, "Negative word index");
554 MOZ_ASSERT(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
555 "Length beyond end");
557 mDOMWordOffset++;
558 if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
559 mCurCharClass = CHAR_CLASS_END_OF_INPUT;
560 else
561 mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
564 // WordSplitState::AdvanceThroughSeparators
565 template <class T>
566 void WordSplitState<T>::AdvanceThroughSeparators() {
567 while (mCurCharClass == CHAR_CLASS_SEPARATOR) Advance();
570 // WordSplitState::AdvanceThroughWord
571 template <class T>
572 void WordSplitState<T>::AdvanceThroughWord() {
573 while (mCurCharClass == CHAR_CLASS_WORD) Advance();
576 // WordSplitState::IsSpecialWord
577 template <class T>
578 bool WordSplitState<T>::IsSpecialWord() const {
579 // Search for email addresses. We simply define these as any sequence of
580 // characters with an '@' character in the middle. The DOM word is already
581 // split on whitepace, so we know that everything to the end is the address
582 int32_t firstColon = -1;
583 for (int32_t i = mDOMWordOffset; i < int32_t(mDOMWordText.Length()); i++) {
584 if (mDOMWordText[i] == '@') {
585 // only accept this if there are unambiguous word characters (don't bother
586 // recursing to disambiguate apostrophes) on each side. This prevents
587 // classifying, e.g. "@home" as an email address
589 // Use this condition to only accept words with '@' in the middle of
590 // them. It works, but the inlinespellcker doesn't like this. The problem
591 // is that you type "fhsgfh@" that's a misspelled word followed by a
592 // symbol, but when you type another letter "fhsgfh@g" that first word
593 // need to be unmarked misspelled. It doesn't do this. it only checks the
594 // current position for potentially removing a spelling range.
595 if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
596 i < (int32_t)mDOMWordText.Length() - 1 &&
597 ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
598 return true;
600 } else if (mDOMWordText[i] == ':' && firstColon < 0) {
601 firstColon = i;
603 // If the first colon is followed by a slash, consider it a URL
604 // This will catch things like asdf://foo.com
605 if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
606 mDOMWordText[firstColon + 1] == '/') {
607 return true;
612 // Check the text before the first colon against some known protocols. It
613 // is impossible to check against all protocols, especially since you can
614 // plug in new protocols. We also don't want to waste time here checking
615 // against a lot of obscure protocols.
616 if (firstColon > mDOMWordOffset) {
617 nsString protocol(
618 Substring(mDOMWordText, mDOMWordOffset, firstColon - mDOMWordOffset));
619 if (protocol.EqualsIgnoreCase("http") ||
620 protocol.EqualsIgnoreCase("https") ||
621 protocol.EqualsIgnoreCase("news") ||
622 protocol.EqualsIgnoreCase("file") ||
623 protocol.EqualsIgnoreCase("javascript") ||
624 protocol.EqualsIgnoreCase("data") || protocol.EqualsIgnoreCase("ftp")) {
625 return true;
629 // not anything special
630 return false;
633 // WordSplitState::ShouldSkipWord
634 template <class T>
635 bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const {
636 int32_t last = aStart + aLength;
638 // check to see if the word contains a digit
639 for (int32_t i = aStart; i < last; i++) {
640 if (mozilla::unicode::GetGenCategory(GetUnicharAt(i)) ==
641 nsUGenCategory::kNumber) {
642 return true;
646 // not special
647 return false;
650 template <class T>
651 Maybe<int32_t> WordSplitState<T>::FindOffsetOfLastDOMWordSeparatorSequence(
652 const int32_t aBeforeOffset) const {
653 for (int32_t i = aBeforeOffset - 1; i >= 0; --i) {
654 if (IsDOMWordSeparator(mDOMWordText[i]) ||
655 (!IsAmbiguousDOMWordSeprator(mDOMWordText[i]) &&
656 ClassifyCharacter(i, true) == CHAR_CLASS_SEPARATOR)) {
657 // Be greedy, find as many separators as we can
658 for (int32_t j = i - 1; j >= 0; --j) {
659 if (IsDOMWordSeparator(mDOMWordText[j]) ||
660 (!IsAmbiguousDOMWordSeprator(mDOMWordText[j]) &&
661 ClassifyCharacter(j, true) == CHAR_CLASS_SEPARATOR)) {
662 i = j;
663 } else {
664 break;
667 return Some(i);
670 return Nothing();
673 template <>
674 char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt(
675 int32_t aIndex) const {
676 return mDOMWordText[aIndex];
679 template <>
680 char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt(
681 int32_t aIndex) const {
682 return static_cast<char16_t>(static_cast<uint8_t>(mDOMWordText[aIndex]));
685 static inline bool IsBRElement(nsINode* aNode) {
686 return aNode->IsHTMLElement(nsGkAtoms::br);
690 * Given a TextNode, finds the last sequence of DOM word separators before
691 * aBeforeOffset and returns the offset to its first element.
693 * @param aContent the TextNode to check.
694 * @param aBeforeOffset the offset in the TextNode before which we will search
695 * for the DOM separator. You can pass INT32_MAX to search the entire
696 * length of the string.
698 static Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
699 nsIContent* aContent, int32_t aBeforeOffset) {
700 const nsTextFragment* textFragment = aContent->GetText();
701 MOZ_ASSERT(textFragment, "Where is our text?");
702 int32_t end = std::min(aBeforeOffset, int32_t(textFragment->GetLength()));
704 if (textFragment->Is2b()) {
705 nsDependentSubstring targetText(textFragment->Get2b(), end);
706 WordSplitState<nsDependentSubstring> state(targetText);
707 return state.FindOffsetOfLastDOMWordSeparatorSequence(end);
710 nsDependentCSubstring targetText(textFragment->Get1b(), end);
711 WordSplitState<nsDependentCSubstring> state(targetText);
712 return state.FindOffsetOfLastDOMWordSeparatorSequence(end);
716 * Check if there's a DOM word separator before aBeforeOffset in this node.
717 * Always returns true if it's a BR element.
718 * aSeparatorOffset is set to the index of the first character in the last
719 * separator if any is found (0 for BR elements).
721 * This function does not modify aSeparatorOffset when it returns false.
723 static bool ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
724 int32_t* aSeparatorOffset) {
725 if (IsBRElement(aNode)) {
726 *aSeparatorOffset = 0;
727 return true;
730 if (!IsSpellCheckingTextNode(aNode)) return false;
732 const Maybe<int32_t> separatorOffset =
733 FindOffsetOfLastDOMWordSeparatorSequence(aNode->AsContent(),
734 aBeforeOffset);
735 if (separatorOffset) {
736 *aSeparatorOffset = *separatorOffset;
737 return true;
740 return false;
743 static bool IsBreakElement(nsINode* aNode) {
744 if (!aNode->IsElement()) {
745 return false;
748 dom::Element* element = aNode->AsElement();
749 if (element->IsHTMLElement(nsGkAtoms::br)) {
750 return true;
753 // If we don't have a frame, we don't consider ourselves a break
754 // element. In particular, words can span us.
755 nsIFrame* frame = element->GetPrimaryFrame();
756 if (!frame) {
757 return false;
760 auto* disp = frame->StyleDisplay();
761 // Anything that's not an inline element is a break element.
762 // XXXbz should replaced inlines be break elements, though?
763 // Also should inline-block and such be break elements?
765 // FIXME(emilio): We should teach the spell checker to deal with generated
766 // content (it doesn't at all), then remove the IsListItem() check, as there
767 // could be no marker, etc...
768 return !disp->IsInlineFlow() || disp->IsListItem();
// Closure passed to CheckLeavingBreakElement through a void*.
struct CheckLeavingBreakElementClosure {
  // Set to true once a break element has been left; never reset here.
  bool mLeftBreakElement;
};
775 static void CheckLeavingBreakElement(nsINode* aNode, void* aClosure) {
776 CheckLeavingBreakElementClosure* cl =
777 static_cast<CheckLeavingBreakElementClosure*>(aClosure);
778 if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
779 cl->mLeftBreakElement = true;
783 void mozInlineSpellWordUtil::NormalizeWord(nsAString& aWord) {
784 nsAutoString result;
785 ::NormalizeWord(aWord, 0, aWord.Length(), result);
786 aWord = result;
789 void mozInlineSpellWordUtil::SoftText::AdjustBeginAndBuildText(
790 NodeOffset aBegin, NodeOffset aEnd, const nsINode* aRootNode) {
791 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, ("%s", __FUNCTION__));
793 mBegin = std::move(aBegin);
794 mEnd = std::move(aEnd);
796 // First we have to work backwards from mBegin to find a text node
797 // containing a DOM word separator, a non-inline-element
798 // boundary, or the hard start node. That's where we'll start building the
799 // soft string from.
800 nsINode* node = mBegin.mNode;
801 int32_t firstOffsetInNode = 0;
802 int32_t checkBeforeOffset = mBegin.mOffset;
803 while (node) {
804 if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
805 if (node == mBegin.mNode) {
806 // If we find a word separator on the first node, look at the preceding
807 // word on the text node as well.
808 if (firstOffsetInNode > 0) {
809 // Try to find the previous word boundary in the current node. If
810 // we can't find one, start checking previous sibling nodes (if any
811 // adjacent ones exist) to see if we can find any text nodes with
812 // DOM word separators. We bail out as soon as we see a node that is
813 // not a text node, or we run out of previous sibling nodes. In the
814 // event that we simply cannot find any preceding word separator, the
815 // offset is set to 0, and the soft text beginning node is set to the
816 // "most previous" text node before the original starting node, or
817 // kept at the original starting node if no previous text nodes exist.
818 int32_t newOffset = 0;
819 if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
820 &newOffset)) {
821 nsIContent* prevNode = node->GetPreviousSibling();
822 while (prevNode && IsSpellCheckingTextNode(prevNode)) {
823 mBegin.mNode = prevNode;
824 const Maybe<int32_t> separatorOffset =
825 FindOffsetOfLastDOMWordSeparatorSequence(prevNode, INT32_MAX);
826 if (separatorOffset) {
827 newOffset = *separatorOffset;
828 break;
830 prevNode = prevNode->GetPreviousSibling();
833 firstOffsetInNode = newOffset;
834 } else {
835 firstOffsetInNode = 0;
838 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
839 ("%s: adjusting mBegin.mOffset from %i to %i.", __FUNCTION__,
840 mBegin.mOffset, firstOffsetInNode));
841 mBegin.mOffset = firstOffsetInNode;
843 break;
845 checkBeforeOffset = INT32_MAX;
846 if (IsBreakElement(node)) {
847 // Since GerPrevNode follows tree *preorder*, we're about to traverse up
848 // out of 'node'. Since node induces breaks (e.g., it's a block), don't
849 // bother trying to look outside it, just stop now.
850 break;
852 // GetPreviousContent below expects aRootNode to be an ancestor of node.
853 if (!node->IsInclusiveDescendantOf(aRootNode)) {
854 break;
856 node = node->GetPrevNode(aRootNode);
859 // Now build up the string moving forward through the DOM until we reach
860 // the soft end and *then* see a DOM word separator, a non-inline-element
861 // boundary, or the hard end node.
862 mValue.Truncate();
863 mDOMMapping.Clear();
864 bool seenSoftEnd = false;
865 // Leave this outside the loop so large heap string allocations can be reused
866 // across iterations
867 while (node) {
868 if (node == mEnd.mNode) {
869 seenSoftEnd = true;
872 bool exit = false;
873 if (IsSpellCheckingTextNode(node)) {
874 nsIContent* content = static_cast<nsIContent*>(node);
875 MOZ_ASSERT(content, "Where is our content?");
876 const nsTextFragment* textFragment = content->GetText();
877 MOZ_ASSERT(textFragment, "Where is our text?");
878 uint32_t lastOffsetInNode = textFragment->GetLength();
880 if (seenSoftEnd) {
881 // check whether we can stop after this
882 for (uint32_t i =
883 node == mEnd.mNode ? AssertedCast<uint32_t>(mEnd.mOffset) : 0;
884 i < textFragment->GetLength(); ++i) {
885 if (IsDOMWordSeparator(textFragment->CharAt(i))) {
886 exit = true;
887 // stop at the first separator after the soft end point
888 lastOffsetInNode = i;
889 break;
894 if (firstOffsetInNode >= 0 &&
895 static_cast<uint32_t>(firstOffsetInNode) < lastOffsetInNode) {
896 const uint32_t len = lastOffsetInNode - firstOffsetInNode;
897 mDOMMapping.AppendElement(DOMTextMapping(
898 NodeOffset(node, firstOffsetInNode), mValue.Length(), len));
900 const bool ok = textFragment->AppendTo(
901 mValue, static_cast<uint32_t>(firstOffsetInNode), len,
902 mozilla::fallible);
903 if (!ok) {
904 // probably out of memory, remove from mDOMMapping
905 mDOMMapping.RemoveLastElement();
906 exit = true;
910 firstOffsetInNode = 0;
913 if (exit) break;
915 CheckLeavingBreakElementClosure closure = {false};
916 node = FindNextNode(node, aRootNode, CheckLeavingBreakElement, &closure);
917 if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
918 // We left, or are entering, a break element (e.g., block). Maybe we can
919 // stop now.
920 if (seenSoftEnd) break;
921 // Record the break
922 mValue.Append(' ');
926 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
927 ("%s: got DOM string: %s", __FUNCTION__,
928 NS_ConvertUTF16toUTF8(mValue).get()));
931 auto mozInlineSpellWordUtil::BuildRealWords() const
932 -> Result<RealWords, nsresult> {
933 // This is pretty simple. We just have to walk mSoftText.GetValue(),
934 // tokenizing it into "real words". We do an outer traversal of words
935 // delimited by IsDOMWordSeparator, calling SplitDOMWordAndAppendTo on each of
936 // those DOM words
937 int32_t wordStart = -1;
938 RealWords realWords;
939 for (int32_t i = 0; i < int32_t(mSoftText.GetValue().Length()); ++i) {
940 if (IsDOMWordSeparator(mSoftText.GetValue().CharAt(i))) {
941 if (wordStart >= 0) {
942 nsresult rv = SplitDOMWordAndAppendTo(wordStart, i, realWords);
943 if (NS_FAILED(rv)) {
944 return Err(rv);
946 wordStart = -1;
948 } else {
949 if (wordStart < 0) {
950 wordStart = i;
954 if (wordStart >= 0) {
955 nsresult rv = SplitDOMWordAndAppendTo(
956 wordStart, mSoftText.GetValue().Length(), realWords);
957 if (NS_FAILED(rv)) {
958 return Err(rv);
962 return realWords;
965 /*********** DOM/realwords<->mSoftText.GetValue() mapping functions
966 * ************/
968 int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(
969 const NodeOffset& aNodeOffset) const {
970 if (!mSoftText.mIsValid) {
971 NS_ERROR("Soft text must be valid if we're to map into it");
972 return -1;
975 for (int32_t i = 0; i < int32_t(mSoftText.GetDOMMapping().Length()); ++i) {
976 const DOMTextMapping& map = mSoftText.GetDOMMapping()[i];
977 if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
978 // Allow offsets at either end of the string, in particular, allow the
979 // offset that's at the end of the contributed string
980 int32_t offsetInContributedString =
981 aNodeOffset.mOffset - map.mNodeOffset.mOffset;
982 if (offsetInContributedString >= 0 &&
983 offsetInContributedString <= map.mLength)
984 return map.mSoftTextOffset + offsetInContributedString;
985 return -1;
988 return -1;
991 namespace {
993 template <class T>
994 class FirstLargerOffset {
995 int32_t mSoftTextOffset;
997 public:
998 explicit FirstLargerOffset(int32_t aSoftTextOffset)
999 : mSoftTextOffset(aSoftTextOffset) {}
1000 int operator()(const T& t) const {
1001 // We want the first larger offset, so never return 0 (which would
1002 // short-circuit evaluation before finding the last such offset).
1003 return mSoftTextOffset < t.mSoftTextOffset ? -1 : 1;
1007 template <class T>
1008 bool FindLastNongreaterOffset(const nsTArray<T>& aContainer,
1009 int32_t aSoftTextOffset, size_t* aIndex) {
1010 if (aContainer.Length() == 0) {
1011 return false;
1014 BinarySearchIf(aContainer, 0, aContainer.Length(),
1015 FirstLargerOffset<T>(aSoftTextOffset), aIndex);
1016 if (*aIndex > 0) {
1017 // There was at least one mapping with offset <= aSoftTextOffset. Step back
1018 // to find the last element with |mSoftTextOffset <= aSoftTextOffset|.
1019 *aIndex -= 1;
1020 } else {
1021 // Every mapping had offset greater than aSoftTextOffset.
1022 MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset);
1024 return true;
1027 } // namespace
1029 NodeOffset mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(
1030 int32_t aSoftTextOffset, DOMMapHint aHint) const {
1031 MOZ_ASSERT(mSoftText.mIsValid,
1032 "Soft text must be valid if we're to map out of it");
1033 if (!mSoftText.mIsValid) return NodeOffset(nullptr, -1);
1035 // Find the last mapping, if any, such that mSoftTextOffset <= aSoftTextOffset
1036 size_t index;
1037 bool found = FindLastNongreaterOffset(mSoftText.GetDOMMapping(),
1038 aSoftTextOffset, &index);
1039 if (!found) {
1040 return NodeOffset(nullptr, -1);
1043 // 'index' is now the last mapping, if any, such that
1044 // mSoftTextOffset <= aSoftTextOffset.
1045 // If we're doing HINT_END, then we may want to return the end of the
1046 // the previous mapping instead of the start of this mapping
1047 if (aHint == HINT_END && index > 0) {
1048 const DOMTextMapping& map = mSoftText.GetDOMMapping()[index - 1];
1049 if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
1050 return NodeOffset(map.mNodeOffset.mNode,
1051 map.mNodeOffset.mOffset + map.mLength);
1054 // We allow ourselves to return the end of this mapping even if we're
1055 // doing HINT_START. This will only happen if there is no mapping which this
1056 // point is the start of. I'm not 100% sure this is OK...
1057 const DOMTextMapping& map = mSoftText.GetDOMMapping()[index];
1058 int32_t offset = aSoftTextOffset - map.mSoftTextOffset;
1059 if (offset >= 0 && offset <= map.mLength)
1060 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
1062 return NodeOffset(nullptr, -1);
1065 // static
1066 void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint,
1067 nsACString& aResult) {
1068 switch (aHint) {
1069 case HINT_BEGIN:
1070 aResult.AssignLiteral("begin");
1071 break;
1072 case HINT_END:
1073 aResult.AssignLiteral("end");
1074 break;
1078 int32_t mozInlineSpellWordUtil::FindRealWordContaining(
1079 int32_t aSoftTextOffset, DOMMapHint aHint, bool aSearchForward) const {
1080 if (MOZ_LOG_TEST(sInlineSpellWordUtilLog, LogLevel::Debug)) {
1081 nsAutoCString hint;
1082 mozInlineSpellWordUtil::ToString(aHint, hint);
1084 MOZ_LOG(
1085 sInlineSpellWordUtilLog, LogLevel::Debug,
1086 ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__,
1087 aSoftTextOffset, hint.get(), static_cast<int32_t>(aSearchForward)));
1090 MOZ_ASSERT(mSoftText.mIsValid,
1091 "Soft text must be valid if we're to map out of it");
1092 if (!mSoftText.mIsValid) return -1;
1094 // Find the last word, if any, such that mRealWords[index].mSoftTextOffset
1095 // <= aSoftTextOffset
1096 size_t index;
1097 bool found = FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &index);
1098 if (!found) {
1099 return -1;
1102 // 'index' is now the last word, if any, such that
1103 // mSoftTextOffset <= aSoftTextOffset.
1104 // If we're doing HINT_END, then we may want to return the end of the
1105 // the previous word instead of the start of this word
1106 if (aHint == HINT_END && index > 0) {
1107 const RealWord& word = mRealWords[index - 1];
1108 if (word.EndOffset() == aSoftTextOffset) {
1109 return index - 1;
1113 // We allow ourselves to return the end of this word even if we're
1114 // doing HINT_BEGIN. This will only happen if there is no word which this
1115 // point is the start of. I'm not 100% sure this is OK...
1116 const RealWord& word = mRealWords[index];
1117 int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
1118 if (offset >= 0 && offset <= static_cast<int32_t>(word.mLength)) return index;
1120 if (aSearchForward) {
1121 if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
1122 // All words have mSoftTextOffset > aSoftTextOffset
1123 return 0;
1125 // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.
1126 // Word index+1, if it exists, will be the first with
1127 // mSoftTextOffset > aSoftTextOffset.
1128 if (index + 1 < mRealWords.Length()) return index + 1;
1131 return -1;
1134 // mozInlineSpellWordUtil::SplitDOMWordAndAppendTo
1136 nsresult mozInlineSpellWordUtil::SplitDOMWordAndAppendTo(
1137 int32_t aStart, int32_t aEnd, nsTArray<RealWord>& aRealWords) const {
1138 nsDependentSubstring targetText(mSoftText.GetValue(), aStart, aEnd - aStart);
1139 WordSplitState<nsDependentSubstring> state(targetText);
1140 state.mCurCharClass = state.ClassifyCharacter(0, true);
1142 state.AdvanceThroughSeparators();
1143 if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && state.IsSpecialWord()) {
1144 int32_t specialWordLength =
1145 state.mDOMWordText.Length() - state.mDOMWordOffset;
1146 if (!aRealWords.AppendElement(
1147 RealWord(aStart + state.mDOMWordOffset, specialWordLength, false),
1148 fallible)) {
1149 return NS_ERROR_OUT_OF_MEMORY;
1152 return NS_OK;
1155 while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
1156 state.AdvanceThroughSeparators();
1157 if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) break;
1159 // save the beginning of the word
1160 int32_t wordOffset = state.mDOMWordOffset;
1162 // find the end of the word
1163 state.AdvanceThroughWord();
1164 int32_t wordLen = state.mDOMWordOffset - wordOffset;
1165 if (!aRealWords.AppendElement(
1166 RealWord(aStart + wordOffset, wordLen,
1167 !state.ShouldSkipWord(wordOffset, wordLen)),
1168 fallible)) {
1169 return NS_ERROR_OUT_OF_MEMORY;
1173 return NS_OK;