Bug 1700051: part 20) Merge `mozInlineSpellStatus`'s constructor and its `Init` metho...
[gecko.git] / extensions / spellcheck / src / mozInlineSpellWordUtil.cpp
blob683f0292530c52ee241c88bc18b36add703c39e4
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "mozInlineSpellWordUtil.h"
8 #include <algorithm>
9 #include <utility>
11 #include "mozilla/BinarySearch.h"
12 #include "mozilla/HTMLEditor.h"
13 #include "mozilla/Logging.h"
14 #include "mozilla/TextEditor.h"
15 #include "mozilla/dom/Element.h"
17 #include "nsDebug.h"
18 #include "nsAtom.h"
19 #include "nsComponentManagerUtils.h"
20 #include "nsUnicodeProperties.h"
21 #include "nsServiceManagerUtils.h"
22 #include "nsIContent.h"
23 #include "nsTextFragment.h"
24 #include "nsRange.h"
25 #include "nsContentUtils.h"
26 #include "nsIFrame.h"
28 using namespace mozilla;
30 static LazyLogModule sInlineSpellWordUtilLog{"InlineSpellWordUtil"};
32 // IsIgnorableCharacter
34 // These characters are ones that we should ignore in input.
// Reports whether the 8-bit character |ch| is one we drop from input.
// Only the soft hyphen (byte 0xAD) qualifies.
inline bool IsIgnorableCharacter(char ch) {
  const char softHyphen = static_cast<char>(0xAD);  // SOFT HYPHEN
  return ch == softHyphen;
}
// Reports whether the UTF-16 code unit |ch| is one we drop from input.
inline bool IsIgnorableCharacter(char16_t ch) {
  switch (ch) {
    case 0x00AD:  // SOFT HYPHEN
    case 0x1806:  // MONGOLIAN TODO SOFT HYPHEN
      return true;
    default:
      return false;
  }
}
45 // IsConditionalPunctuation
47 // Some characters (like apostrophes) require characters on each side to be
48 // part of a word, and are otherwise punctuation.
// Reports whether the 8-bit character |ch| is conditional punctuation.
inline bool IsConditionalPunctuation(char ch) {
  return (ch == '\'' ||                    // APOSTROPHE
          ch == static_cast<char>(0xB7));  // MIDDLE DOT
}

// Reports whether the UTF-16 code unit |ch| is conditional punctuation.
inline bool IsConditionalPunctuation(char16_t ch) {
  return (ch == '\'' ||    // APOSTROPHE
          ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
          ch == 0x00B7);   // MIDDLE DOT
}

// Reports whether |ch| is a character whose role as a DOM word separator
// depends on context.
static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
          IsConditionalPunctuation(ch));
}

// 8-bit overload; forwards to the char16_t overload.
static bool IsAmbiguousDOMWordSeprator(char ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  //
  // Widen through uint8_t first. Where |char| is signed, converting a byte
  // >= 0x80 straight to char16_t sign-extends it (0xB7 becomes 0xFFB7), so
  // the middle dot would not be recognized by the char16_t overload.
  return IsAmbiguousDOMWordSeprator(
      static_cast<char16_t>(static_cast<uint8_t>(ch)));
}
71 // IsDOMWordSeparator
73 // Determines if the given character should be considered as a DOM Word
74 // separator. Basically, this is whitespace, although it could also have
75 // certain punctuation that we know ALWAYS breaks words. This is important.
76 // For example, we can't have any punctuation that could appear in a URL
77 // or email address in this, because those need to always fit into a single
78 // DOM word.
// 8-bit overload: true for space, tab, LF, CR and the no-break space byte
// (0xA0).
static bool IsDOMWordSeparator(char ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case static_cast<char>(0xA0):  // NO-BREAK SPACE
      return true;
    default:
      return false;
  }
}
// UTF-16 overload: true for the simple ASCII spaces and for a fixed set of
// non-ASCII space characters.
static bool IsDOMWordSeparator(char16_t ch) {
  // Simple spaces first.
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      break;
  }

  // Everything below 0xA0 has been handled; only the (uncommon) non-ASCII
  // spaces remain.
  if (ch < 0xA0) {
    return false;
  }

  switch (ch) {
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2002:  // EN SPACE
    case 0x2003:  // EM SPACE
    case 0x2009:  // THIN SPACE
    case 0x3000:  // IDEOGRAPHIC SPACE
      return true;
    default:
      // otherwise not a space
      return false;
  }
}
102 // static
103 Maybe<mozInlineSpellWordUtil> mozInlineSpellWordUtil::Create(
104 const TextEditor& aTextEditor) {
105 mozInlineSpellWordUtil util;
106 util.mDocument = aTextEditor.GetDocument();
107 if (NS_WARN_IF(!util.mDocument)) {
108 return Nothing();
111 util.mIsContentEditableOrDesignMode = !!aTextEditor.AsHTMLEditor();
113 // Find the root node for the editor. For contenteditable the mRootNode could
114 // change to shadow root if the begin and end are inside the shadowDOM.
115 util.mRootNode = aTextEditor.GetRoot();
116 if (NS_WARN_IF(!util.mRootNode)) {
117 return Nothing();
119 return Some(std::move(util));
122 static inline bool IsSpellCheckingTextNode(nsINode* aNode) {
123 nsIContent* parent = aNode->GetParent();
124 if (parent &&
125 parent->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style))
126 return false;
127 return aNode->IsText();
130 typedef void (*OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
132 // Find the next node in the DOM tree in preorder.
133 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
134 // why we can't just use GetNextNode here, sadly.
135 static nsINode* FindNextNode(nsINode* aNode, nsINode* aRoot,
136 OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) {
137 MOZ_ASSERT(aNode, "Null starting node?");
139 nsINode* next = aNode->GetFirstChild();
140 if (next) return next;
142 // Don't look at siblings or otherwise outside of aRoot
143 if (aNode == aRoot) return nullptr;
145 next = aNode->GetNextSibling();
146 if (next) return next;
148 // Go up
149 for (;;) {
150 if (aOnLeaveNode) {
151 aOnLeaveNode(aNode, aClosure);
154 next = aNode->GetParent();
155 if (next == aRoot || !next) return nullptr;
156 aNode = next;
158 next = aNode->GetNextSibling();
159 if (next) return next;
163 // aNode is not a text node. Find the first text node starting at aNode/aOffset
164 // in a preorder DOM traversal.
165 static nsINode* FindNextTextNode(nsINode* aNode, int32_t aOffset,
166 nsINode* aRoot) {
167 MOZ_ASSERT(aNode, "Null starting node?");
168 NS_ASSERTION(!IsSpellCheckingTextNode(aNode),
169 "FindNextTextNode should start with a non-text node");
171 nsINode* checkNode;
172 // Need to start at the aOffset'th child
173 nsIContent* child = aNode->GetChildAt_Deprecated(aOffset);
175 if (child) {
176 checkNode = child;
177 } else {
178 // aOffset was beyond the end of the child list.
179 // goto next node after the last descendant of aNode in
180 // a preorder DOM traversal.
181 checkNode = aNode->GetNextNonChildNode(aRoot);
184 while (checkNode && !IsSpellCheckingTextNode(checkNode)) {
185 checkNode = checkNode->GetNextNode(aRoot);
187 return checkNode;
190 // mozInlineSpellWordUtil::SetPositionAndEnd
192 // We have two ranges "hard" and "soft". The hard boundary is simply
193 // the scope of the root node. The soft boundary is that which is set
194 // by the caller of this class by calling this function. If this function is
195 // not called, the soft boundary is the same as the hard boundary.
197 // When we reach the soft boundary (mSoftEnd), we keep
198 // going until we reach the end of a word. This allows the caller to set the
199 // end of the range to anything, and we will always check whole multiples of
200 // words. When we reach the hard boundary we stop no matter what.
202 // There is no beginning soft boundary. This is because we only go to the
203 // previous node once, when finding the previous word boundary in
204 // SetPosition(). You might think of the soft boundary as being this initial
205 // position.
207 nsresult mozInlineSpellWordUtil::SetPositionAndEnd(nsINode* aPositionNode,
208 int32_t aPositionOffset,
209 nsINode* aEndNode,
210 int32_t aEndOffset) {
211 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
212 ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__, aPositionNode,
213 aPositionOffset, aEndNode, aEndOffset));
215 MOZ_ASSERT(aPositionNode, "Null begin node?");
216 MOZ_ASSERT(aEndNode, "Null end node?");
218 NS_ASSERTION(mRootNode, "Not initialized");
220 // Find a appropriate root if we are dealing with contenteditable nodes which
221 // are in the shadow DOM.
222 if (mIsContentEditableOrDesignMode) {
223 nsINode* rootNode = aPositionNode->SubtreeRoot();
224 if (rootNode != aEndNode->SubtreeRoot()) {
225 return NS_ERROR_FAILURE;
228 if (mozilla::dom::ShadowRoot::FromNode(rootNode)) {
229 mRootNode = rootNode;
233 InvalidateWords();
235 if (!IsSpellCheckingTextNode(aPositionNode)) {
236 // Start at the start of the first text node after aNode/aOffset.
237 aPositionNode = FindNextTextNode(aPositionNode, aPositionOffset, mRootNode);
238 aPositionOffset = 0;
240 mSoftBegin = NodeOffset(aPositionNode, aPositionOffset);
242 if (!IsSpellCheckingTextNode(aEndNode)) {
243 // End at the start of the first text node after aEndNode/aEndOffset.
244 aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
245 aEndOffset = 0;
247 mSoftEnd = NodeOffset(aEndNode, aEndOffset);
249 nsresult rv = EnsureWords();
250 if (NS_FAILED(rv)) {
251 return rv;
254 int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
255 if (textOffset < 0) {
256 return NS_OK;
259 mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
260 return NS_OK;
263 nsresult mozInlineSpellWordUtil::EnsureWords() {
264 if (mSoftTextValid) return NS_OK;
265 BuildSoftText();
266 nsresult rv = BuildRealWords();
267 if (NS_FAILED(rv)) {
268 mRealWords.Clear();
269 return rv;
271 mSoftTextValid = true;
272 return NS_OK;
275 nsresult mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord,
276 nsRange** aRange) {
277 NodeOffset begin =
278 MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
279 NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
280 return MakeRange(begin, end, aRange);
282 void mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord(
283 const RealWord& aWord, NodeOffsetRange* aNodeOffsetRange) {
284 NodeOffset begin =
285 MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
286 NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
287 *aNodeOffsetRange = NodeOffsetRange(begin, end);
290 // mozInlineSpellWordUtil::GetRangeForWord
292 nsresult mozInlineSpellWordUtil::GetRangeForWord(nsINode* aWordNode,
293 int32_t aWordOffset,
294 nsRange** aRange) {
295 // Set our soft end and start
296 NodeOffset pt(aWordNode, aWordOffset);
298 if (!mSoftTextValid || pt != mSoftBegin || pt != mSoftEnd) {
299 InvalidateWords();
300 mSoftBegin = mSoftEnd = pt;
301 nsresult rv = EnsureWords();
302 if (NS_FAILED(rv)) {
303 return rv;
307 int32_t offset = MapDOMPositionToSoftTextOffset(pt);
308 if (offset < 0) return MakeRange(pt, pt, aRange);
309 int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
310 if (wordIndex < 0) return MakeRange(pt, pt, aRange);
311 return MakeRangeForWord(mRealWords[wordIndex], aRange);
314 // This is to fix characters that the spellchecker may not like
315 static void NormalizeWord(const nsAString& aInput, int32_t aPos, int32_t aLen,
316 nsAString& aOutput) {
317 aOutput.Truncate();
318 for (int32_t i = 0; i < aLen; i++) {
319 char16_t ch = aInput.CharAt(i + aPos);
321 // remove ignorable characters from the word
322 if (IsIgnorableCharacter(ch)) continue;
324 // the spellchecker doesn't handle curly apostrophes in all languages
325 if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
326 ch = '\'';
329 aOutput.Append(ch);
333 // mozInlineSpellWordUtil::GetNextWord
335 // FIXME-optimization: we shouldn't have to generate a range every single
336 // time. It would be better if the inline spellchecker didn't require a
337 // range unless the word was misspelled. This may or may not be possible.
339 bool mozInlineSpellWordUtil::GetNextWord(nsAString& aText,
340 NodeOffsetRange* aNodeOffsetRange,
341 bool* aSkipChecking) {
342 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
343 ("%s: mNextWordIndex=%d", __FUNCTION__, mNextWordIndex));
345 if (mNextWordIndex < 0 || mNextWordIndex >= int32_t(mRealWords.Length())) {
346 mNextWordIndex = -1;
347 *aSkipChecking = true;
348 return false;
351 const RealWord& word = mRealWords[mNextWordIndex];
352 MakeNodeOffsetRangeForWord(word, aNodeOffsetRange);
353 ++mNextWordIndex;
354 *aSkipChecking = !word.mCheckableWord;
355 ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);
357 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
358 ("%s: returning: %s (skip=%d)", __FUNCTION__,
359 NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking));
361 return true;
364 // mozInlineSpellWordUtil::MakeRange
366 // Convenience function for creating a range over the current document.
368 nsresult mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
369 nsRange** aRange) const {
370 NS_ENSURE_ARG_POINTER(aBegin.mNode);
371 if (!mDocument) {
372 return NS_ERROR_NOT_INITIALIZED;
375 ErrorResult error;
376 RefPtr<nsRange> range = nsRange::Create(aBegin.mNode, aBegin.mOffset,
377 aEnd.mNode, aEnd.mOffset, error);
378 if (NS_WARN_IF(error.Failed())) {
379 return error.StealNSResult();
381 MOZ_ASSERT(range);
382 range.forget(aRange);
383 return NS_OK;
386 // static
387 already_AddRefed<nsRange> mozInlineSpellWordUtil::MakeRange(
388 const NodeOffsetRange& aRange) {
389 IgnoredErrorResult ignoredError;
390 RefPtr<nsRange> range =
391 nsRange::Create(aRange.Begin().Node(), aRange.Begin().Offset(),
392 aRange.End().Node(), aRange.End().Offset(), ignoredError);
393 NS_WARNING_ASSERTION(!ignoredError.Failed(), "Creating a range failed");
394 return range.forget();
397 /*********** Word Splitting ************/
399 // classifies a given character in the DOM word
400 enum CharClass {
// The character is part of a word.
401 CHAR_CLASS_WORD,
// The character separates words.
402 CHAR_CLASS_SEPARATOR,
// There is no character: the splitter's offset is at or past the end of the
// DOM word text. This is also the initial state of a WordSplitState.
403 CHAR_CLASS_END_OF_INPUT
406 // Encapsulates DOM-word to real-word splitting
// Cursor over one DOM word (|mDOMWordText|) used to split it into real
// words. T is the string type holding the DOM word; GetUnicharAt() is
// specialized for nsDependentSubstring and nsDependentCSubstring.
407 template <class T>
408 struct MOZ_STACK_CLASS WordSplitState {
// The DOM word being split. Held by reference, so the string must outlive
// this object.
409 const T& mDOMWordText;
// Current position within mDOMWordText.
410 int32_t mDOMWordOffset;
// Class of the character at mDOMWordOffset, as maintained by Advance().
// Starts out as CHAR_CLASS_END_OF_INPUT.
411 CharClass mCurCharClass;
413 explicit WordSplitState(const T& aString)
414 : mDOMWordText(aString),
415 mDOMWordOffset(0),
416 mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
// Classifies the character at aIndex. When aRecurse is false, conditional
// punctuation is reported as a separator without looking at its neighbours.
418 CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
// Moves one character forward and updates mCurCharClass.
419 void Advance();
// Advances while the current character is a separator.
420 void AdvanceThroughSeparators();
// Advances while the current character is a word character.
421 void AdvanceThroughWord();
423 // Reports whether a special word, like an email address or a URL, starts
424 // at the current position. This allows arbitrary word breaking rules to be
425 // used for these special entities, as long as they can not contain
426 // whitespace.
427 bool IsSpecialWord() const;
429 // Similar to IsSpecialWord except that this takes a split word as
430 // input. This checks for things that do not require special word-breaking
431 // rules.
432 bool ShouldSkipWord(int32_t aStart, int32_t aLength) const;
434 // Checks to see if there's a DOM word separator before aOffset within
435 // it. This function does not modify aSeparatorOffset when it returns false.
436 bool GetDOMWordSeparatorOffset(int32_t aOffset,
437 int32_t* aSeparatorOffset) const;
// Returns the character at aIndex as a char16_t.
439 char16_t GetUnicharAt(int32_t aIndex) const;
442 // WordSplitState::ClassifyCharacter
// Classifies the character at aIndex of the DOM word as a word character or
// a separator. aIndex equal to the text length is allowed and counts as a
// separator. aRecurse controls whether conditional punctuation may inspect
// its neighbours; the recursive calls below always pass false, so the
// recursion is at most one level deep.
443 template <class T>
444 CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex,
445 bool aRecurse) const {
446 NS_ASSERTION(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
447 "Index out of range");
448 if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR;
450 // this will classify the character, we want to treat "ignorable" characters
451 // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
452 nsUGenCategory charCategory =
453 mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex));
454 if (charCategory == nsUGenCategory::kLetter ||
455 IsIgnorableCharacter(mDOMWordText[aIndex]) ||
456 mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
457 mDOMWordText[aIndex] == 0x200D /* ZWJ */)
458 return CHAR_CLASS_WORD;
460 // If conditional punctuation is surrounded immediately on both sides by word
461 // characters it also counts as a word character.
462 if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
463 if (!aRecurse) {
464 // not allowed to look around, this punctuation counts like a separator
465 return CHAR_CLASS_SEPARATOR;
468 // check the left-hand character
469 if (aIndex == 0) return CHAR_CLASS_SEPARATOR;
470 if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
471 return CHAR_CLASS_SEPARATOR;
472 // If the previous character is a word-char, make sure that it's not a
473 // special dot character.
474 if (mDOMWordText[aIndex - 1] == '.') return CHAR_CLASS_SEPARATOR;
476 // now we know left char is a word-char, check the right-hand character
477 if (aIndex == int32_t(mDOMWordText.Length() - 1)) {
478 return CHAR_CLASS_SEPARATOR;
481 if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
482 return CHAR_CLASS_SEPARATOR;
483 // If the next character is a word-char, make sure that it's not a
484 // special dot character.
485 if (mDOMWordText[aIndex + 1] == '.') return CHAR_CLASS_SEPARATOR;
487 // char on either side is a word, this counts as a word
488 return CHAR_CLASS_WORD;
491 // The dot character, if appearing at the end of a word, should
492 // be considered part of that word. Example: "etc.", or
493 // abbreviations
// NOTE(review): as written, this branch is taken when the preceding
// character is NOT classified as a word character, which reads as the
// opposite of the comment above -- confirm the intended condition.
494 if (aIndex > 0 && mDOMWordText[aIndex] == '.' &&
495 mDOMWordText[aIndex - 1] != '.' &&
496 ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
497 return CHAR_CLASS_WORD;
500 // all other punctuation
501 if (charCategory == nsUGenCategory::kSeparator ||
502 charCategory == nsUGenCategory::kOther ||
503 charCategory == nsUGenCategory::kPunctuation ||
504 charCategory == nsUGenCategory::kSymbol) {
505 // Don't break on hyphens, as hunspell handles them on its own.
506 if (aIndex > 0 && mDOMWordText[aIndex] == '-' &&
507 mDOMWordText[aIndex - 1] != '-' &&
508 ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
509 // A hyphen is only kept inside a word if the previous and next
510 // characters are both word characters; a trailing hyphen is a separator.
511 if (aIndex == int32_t(mDOMWordText.Length()) - 1)
512 return CHAR_CLASS_SEPARATOR;
513 if (mDOMWordText[aIndex + 1] != '.' &&
514 ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
515 return CHAR_CLASS_WORD;
517 return CHAR_CLASS_SEPARATOR;
520 // any other character counts as a word
521 return CHAR_CLASS_WORD;
524 // WordSplitState::Advance
525 template <class T>
526 void WordSplitState<T>::Advance() {
527 NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
528 NS_ASSERTION(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
529 "Length beyond end");
531 mDOMWordOffset++;
532 if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
533 mCurCharClass = CHAR_CLASS_END_OF_INPUT;
534 else
535 mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
538 // WordSplitState::AdvanceThroughSeparators
539 template <class T>
540 void WordSplitState<T>::AdvanceThroughSeparators() {
541 while (mCurCharClass == CHAR_CLASS_SEPARATOR) Advance();
544 // WordSplitState::AdvanceThroughWord
545 template <class T>
546 void WordSplitState<T>::AdvanceThroughWord() {
547 while (mCurCharClass == CHAR_CLASS_WORD) Advance();
550 // WordSplitState::IsSpecialWord
// Reports whether the text from mDOMWordOffset to the end of the DOM word
// looks like an email address or a URL. Three things make it special: an '@'
// with word characters directly on both sides, a first ':' immediately
// followed by '/', or a first ':' preceded by one of a few known protocol
// names.
551 template <class T>
552 bool WordSplitState<T>::IsSpecialWord() const {
553 // Search for email addresses. We simply define these as any sequence of
554 // characters with an '@' character in the middle. The DOM word is already
555 // split on whitespace, so we know that everything to the end is the address
556 int32_t firstColon = -1;
557 for (int32_t i = mDOMWordOffset; i < int32_t(mDOMWordText.Length()); i++) {
558 if (mDOMWordText[i] == '@') {
559 // only accept this if there are unambiguous word characters (don't bother
560 // recursing to disambiguate apostrophes) on each side. This prevents
561 // classifying, e.g. "@home" as an email address
563 // Use this condition to only accept words with '@' in the middle of
564 // them. It works, but the inline spell checker doesn't like this. The
565 // problem is that when you type "fhsgfh@" that's a misspelled word followed
566 // by a symbol, but when you type another letter "fhsgfh@g" that first word
567 // needs to be unmarked as misspelled. It doesn't do this; it only checks the
568 // current position for potentially removing a spelling range.
569 if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
570 i < (int32_t)mDOMWordText.Length() - 1 &&
571 ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
572 return true;
574 } else if (mDOMWordText[i] == ':' && firstColon < 0) {
// Only the first colon is recorded; later colons are ignored.
575 firstColon = i;
577 // If the first colon is followed by a slash, consider it a URL
578 // This will catch things like asdf://foo.com
579 if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
580 mDOMWordText[firstColon + 1] == '/') {
581 return true;
586 // Check the text before the first colon against some known protocols. It
587 // is impossible to check against all protocols, especially since you can
588 // plug in new protocols. We also don't want to waste time here checking
589 // against a lot of obscure protocols.
590 if (firstColon > mDOMWordOffset) {
591 nsString protocol(
592 Substring(mDOMWordText, mDOMWordOffset, firstColon - mDOMWordOffset));
593 if (protocol.EqualsIgnoreCase("http") ||
594 protocol.EqualsIgnoreCase("https") ||
595 protocol.EqualsIgnoreCase("news") ||
596 protocol.EqualsIgnoreCase("file") ||
597 protocol.EqualsIgnoreCase("javascript") ||
598 protocol.EqualsIgnoreCase("data") || protocol.EqualsIgnoreCase("ftp")) {
599 return true;
603 // not anything special
604 return false;
607 // WordSplitState::ShouldSkipWord
608 template <class T>
609 bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const {
610 int32_t last = aStart + aLength;
612 // check to see if the word contains a digit
613 for (int32_t i = aStart; i < last; i++) {
614 if (mozilla::unicode::GetGenCategory(GetUnicharAt(i)) ==
615 nsUGenCategory::kNumber) {
616 return true;
620 // not special
621 return false;
624 template <class T>
625 bool WordSplitState<T>::GetDOMWordSeparatorOffset(
626 int32_t aOffset, int32_t* aSeparatorOffset) const {
627 for (int32_t i = aOffset - 1; i >= 0; --i) {
628 if (IsDOMWordSeparator(mDOMWordText[i]) ||
629 (!IsAmbiguousDOMWordSeprator(mDOMWordText[i]) &&
630 ClassifyCharacter(i, true) == CHAR_CLASS_SEPARATOR)) {
631 // Be greedy, find as many separators as we can
632 for (int32_t j = i - 1; j >= 0; --j) {
633 if (IsDOMWordSeparator(mDOMWordText[j]) ||
634 (!IsAmbiguousDOMWordSeprator(mDOMWordText[j]) &&
635 ClassifyCharacter(j, true) == CHAR_CLASS_SEPARATOR)) {
636 i = j;
637 } else {
638 break;
641 *aSeparatorOffset = i;
642 return true;
645 return false;
648 template <>
649 char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt(
650 int32_t aIndex) const {
651 return mDOMWordText[aIndex];
654 template <>
655 char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt(
656 int32_t aIndex) const {
657 return static_cast<char16_t>(static_cast<uint8_t>(mDOMWordText[aIndex]));
660 static inline bool IsBRElement(nsINode* aNode) {
661 return aNode->IsHTMLElement(nsGkAtoms::br);
665 * Given a TextNode, checks to see if there's a DOM word separator before
666 * aBeforeOffset within it. This function does not modify aSeparatorOffset when
667 * it returns false.
669 * @param aContent the TextNode to check.
670 * @param aBeforeOffset the offset in the TextNode before which we will search
671 * for the DOM separator. You can pass INT32_MAX to search the entire
672 * length of the string.
673 * @param aSeparatorOffset will be set to the offset of the first separator it
674 * encounters. Will not be written to if no separator is found.
675 * @returns True if it found a separator.
677 static bool TextNodeContainsDOMWordSeparator(nsIContent* aContent,
678 int32_t aBeforeOffset,
679 int32_t* aSeparatorOffset) {
680 const nsTextFragment* textFragment = aContent->GetText();
681 NS_ASSERTION(textFragment, "Where is our text?");
682 int32_t end = std::min(aBeforeOffset, int32_t(textFragment->GetLength()));
684 if (textFragment->Is2b()) {
685 nsDependentSubstring targetText(textFragment->Get2b(), end);
686 WordSplitState<nsDependentSubstring> state(targetText);
687 return state.GetDOMWordSeparatorOffset(end, aSeparatorOffset);
690 nsDependentCSubstring targetText(textFragment->Get1b(), end);
691 WordSplitState<nsDependentCSubstring> state(targetText);
692 return state.GetDOMWordSeparatorOffset(end, aSeparatorOffset);
696 * Check if there's a DOM word separator before aBeforeOffset in this node.
697 * Always returns true if it's a BR element.
698 * aSeparatorOffset is set to the index of the first character in the last
699 * separator if any is found (0 for BR elements).
701 * This function does not modify aSeparatorOffset when it returns false.
703 static bool ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
704 int32_t* aSeparatorOffset) {
705 if (IsBRElement(aNode)) {
706 *aSeparatorOffset = 0;
707 return true;
710 if (!IsSpellCheckingTextNode(aNode)) return false;
712 return TextNodeContainsDOMWordSeparator(aNode->AsContent(), aBeforeOffset,
713 aSeparatorOffset);
716 static bool IsBreakElement(nsINode* aNode) {
717 if (!aNode->IsElement()) {
718 return false;
721 dom::Element* element = aNode->AsElement();
722 if (element->IsHTMLElement(nsGkAtoms::br)) {
723 return true;
726 // If we don't have a frame, we don't consider ourselves a break
727 // element. In particular, words can span us.
728 nsIFrame* frame = element->GetPrimaryFrame();
729 if (!frame) {
730 return false;
733 auto* disp = frame->StyleDisplay();
734 // Anything that's not an inline element is a break element.
735 // XXXbz should replaced inlines be break elements, though?
736 // Also should inline-block and such be break elements?
738 // FIXME(emilio): We should teach the spell checker to deal with generated
739 // content (it doesn't at all), then remove the IsListItem() check, as there
740 // could be no marker, etc...
741 return !disp->IsInlineFlow() || disp->IsListItem();
744 struct CheckLeavingBreakElementClosure {
745 bool mLeftBreakElement;
748 static void CheckLeavingBreakElement(nsINode* aNode, void* aClosure) {
749 CheckLeavingBreakElementClosure* cl =
750 static_cast<CheckLeavingBreakElementClosure*>(aClosure);
751 if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
752 cl->mLeftBreakElement = true;
756 void mozInlineSpellWordUtil::NormalizeWord(nsAString& aWord) {
757 nsAutoString result;
758 ::NormalizeWord(aWord, 0, aWord.Length(), result);
759 aWord = result;
// Rebuilds mSoftText and mSoftTextDOMMapping from the DOM around
// [mSoftBegin, mSoftEnd]. Phase one walks backwards from mSoftBegin to pick
// the node/offset to start from (and may move mSoftBegin backwards); phase
// two walks forwards, appending text and recording, for each contributing
// text node, where its text landed in mSoftText.
762 void mozInlineSpellWordUtil::BuildSoftText() {
763 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, ("%s", __FUNCTION__));
765 // First we have to work backwards from mSoftBegin to find a text node
766 // containing a DOM word separator, a non-inline-element
767 // boundary, or the hard start node. That's where we'll start building the
768 // soft string from.
769 nsINode* node = mSoftBegin.mNode;
770 int32_t firstOffsetInNode = 0;
771 int32_t checkBeforeOffset = mSoftBegin.mOffset;
772 while (node) {
773 if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
774 if (node == mSoftBegin.mNode) {
775 // If we find a word separator on the first node, look at the preceding
776 // word on the text node as well.
777 int32_t newOffset = 0;
778 if (firstOffsetInNode > 0) {
779 // Try to find the previous word boundary in the current node. If
780 // we can't find one, start checking previous sibling nodes (if any
781 // adjacent ones exist) to see if we can find any text nodes with
782 // DOM word separators. We bail out as soon as we see a node that is
783 // not a text node, or we run out of previous sibling nodes. In the
784 // event that we simply cannot find any preceding word separator, the
785 // offset is set to 0, and the soft text beginning node is set to the
786 // "most previous" text node before the original starting node, or
787 // kept at the original starting node if no previous text nodes exist.
788 if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
789 &newOffset)) {
790 nsIContent* prevNode = node->GetPreviousSibling();
791 while (prevNode && IsSpellCheckingTextNode(prevNode)) {
792 mSoftBegin.mNode = prevNode;
793 if (TextNodeContainsDOMWordSeparator(prevNode, INT32_MAX,
794 &newOffset)) {
795 break;
797 prevNode = prevNode->GetPreviousSibling();
801 firstOffsetInNode = newOffset;
802 mSoftBegin.mOffset = newOffset;
804 break;
// For every node other than the starting one, search its whole text.
806 checkBeforeOffset = INT32_MAX;
807 if (IsBreakElement(node)) {
808 // Since GetPreviousContent follows tree *preorder*, we're about to
809 // traverse up out of 'node'. Since node induces breaks (e.g., it's a
810 // block), don't bother trying to look outside it, just stop now.
811 break;
813 // GetPreviousContent below expects mRootNode to be an ancestor of node.
814 if (!node->IsInclusiveDescendantOf(mRootNode)) {
815 break;
817 node = node->GetPreviousContent(mRootNode);
820 // Now build up the string moving forward through the DOM until we reach
821 // the soft end and *then* see a DOM word separator, a non-inline-element
822 // boundary, or the hard end node.
823 mSoftText.Truncate();
824 mSoftTextDOMMapping.Clear();
825 bool seenSoftEnd = false;
826 // NOTE(review): the next comment line refers to a declaration that no
827 // longer exists at this point; it looks stale -- confirm and remove.
828 while (node) {
829 if (node == mSoftEnd.mNode) {
830 seenSoftEnd = true;
833 bool exit = false;
834 if (IsSpellCheckingTextNode(node)) {
835 nsIContent* content = static_cast<nsIContent*>(node);
836 NS_ASSERTION(content, "Where is our content?");
837 const nsTextFragment* textFragment = content->GetText();
838 NS_ASSERTION(textFragment, "Where is our text?");
839 int32_t lastOffsetInNode = textFragment->GetLength();
841 if (seenSoftEnd) {
842 // check whether we can stop after this
843 for (int32_t i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
844 i < int32_t(textFragment->GetLength()); ++i) {
845 if (IsDOMWordSeparator(textFragment->CharAt(i))) {
846 exit = true;
847 // stop at the first separator after the soft end point
848 lastOffsetInNode = i;
849 break;
// Append this node's contribution, [firstOffsetInNode, lastOffsetInNode),
// and remember where it starts in mSoftText.
854 if (firstOffsetInNode < lastOffsetInNode) {
855 int32_t len = lastOffsetInNode - firstOffsetInNode;
856 mSoftTextDOMMapping.AppendElement(DOMTextMapping(
857 NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
859 bool ok = textFragment->AppendTo(mSoftText, firstOffsetInNode, len,
860 mozilla::fallible);
861 if (!ok) {
862 // probably out of memory, remove from mSoftTextDOMMapping
863 mSoftTextDOMMapping.RemoveLastElement();
864 exit = true;
// Only the first text node starts mid-node; later ones start at offset 0.
868 firstOffsetInNode = 0;
871 if (exit) break;
873 CheckLeavingBreakElementClosure closure = {false};
874 node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
875 if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
876 // We left, or are entering, a break element (e.g., block). Maybe we can
877 // stop now.
878 if (seenSoftEnd) break;
879 // Record the break
880 mSoftText.Append(' ');
884 MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
885 ("%s: got DOM string: %s", __FUNCTION__,
886 NS_ConvertUTF16toUTF8(mSoftText).get()));
889 nsresult mozInlineSpellWordUtil::BuildRealWords() {
890 // This is pretty simple. We just have to walk mSoftText, tokenizing it
891 // into "real words".
892 // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
893 // SplitDOMWord on each of those DOM words
894 int32_t wordStart = -1;
895 mRealWords.Clear();
896 for (int32_t i = 0; i < int32_t(mSoftText.Length()); ++i) {
897 if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
898 if (wordStart >= 0) {
899 nsresult rv = SplitDOMWord(wordStart, i);
900 if (NS_FAILED(rv)) {
901 return rv;
903 wordStart = -1;
905 } else {
906 if (wordStart < 0) {
907 wordStart = i;
911 if (wordStart >= 0) {
912 nsresult rv = SplitDOMWord(wordStart, mSoftText.Length());
913 if (NS_FAILED(rv)) {
914 return rv;
918 return NS_OK;
921 /*********** DOM/realwords<->mSoftText mapping functions ************/
923 int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(
924 NodeOffset aNodeOffset) {
925 if (!mSoftTextValid) {
926 NS_ERROR("Soft text must be valid if we're to map into it");
927 return -1;
930 for (int32_t i = 0; i < int32_t(mSoftTextDOMMapping.Length()); ++i) {
931 const DOMTextMapping& map = mSoftTextDOMMapping[i];
932 if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
933 // Allow offsets at either end of the string, in particular, allow the
934 // offset that's at the end of the contributed string
935 int32_t offsetInContributedString =
936 aNodeOffset.mOffset - map.mNodeOffset.mOffset;
937 if (offsetInContributedString >= 0 &&
938 offsetInContributedString <= map.mLength)
939 return map.mSoftTextOffset + offsetInContributedString;
940 return -1;
943 return -1;
946 namespace {
// Comparator functor for a binary search over elements that expose an
// mSoftTextOffset member. It reports -1 for an element whose offset is
// strictly larger than the target and 1 for every other element.
template <class T>
class FirstLargerOffset {
  int32_t mTargetOffset;

 public:
  explicit FirstLargerOffset(int32_t aSoftTextOffset)
      : mTargetOffset(aSoftTextOffset) {}

  int operator()(const T& t) const {
    // 0 is deliberately never returned: we are looking for the first larger
    // offset, and a 0 would short-circuit the search before the last
    // non-greater element has been passed.
    if (mTargetOffset < t.mSoftTextOffset) {
      return -1;
    }
    return 1;
  }
};
962 template <class T>
963 bool FindLastNongreaterOffset(const nsTArray<T>& aContainer,
964 int32_t aSoftTextOffset, size_t* aIndex) {
965 if (aContainer.Length() == 0) {
966 return false;
969 BinarySearchIf(aContainer, 0, aContainer.Length(),
970 FirstLargerOffset<T>(aSoftTextOffset), aIndex);
971 if (*aIndex > 0) {
972 // There was at least one mapping with offset <= aSoftTextOffset. Step back
973 // to find the last element with |mSoftTextOffset <= aSoftTextOffset|.
974 *aIndex -= 1;
975 } else {
976 // Every mapping had offset greater than aSoftTextOffset.
977 MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset);
979 return true;
982 } // namespace
984 NodeOffset mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(
985 int32_t aSoftTextOffset, DOMMapHint aHint) {
986 NS_ASSERTION(mSoftTextValid,
987 "Soft text must be valid if we're to map out of it");
988 if (!mSoftTextValid) return NodeOffset(nullptr, -1);
990 // Find the last mapping, if any, such that mSoftTextOffset <= aSoftTextOffset
991 size_t index;
992 bool found =
993 FindLastNongreaterOffset(mSoftTextDOMMapping, aSoftTextOffset, &index);
994 if (!found) {
995 return NodeOffset(nullptr, -1);
998 // 'index' is now the last mapping, if any, such that
999 // mSoftTextOffset <= aSoftTextOffset.
1000 // If we're doing HINT_END, then we may want to return the end of the
1001 // the previous mapping instead of the start of this mapping
1002 if (aHint == HINT_END && index > 0) {
1003 const DOMTextMapping& map = mSoftTextDOMMapping[index - 1];
1004 if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
1005 return NodeOffset(map.mNodeOffset.mNode,
1006 map.mNodeOffset.mOffset + map.mLength);
1009 // We allow ourselves to return the end of this mapping even if we're
1010 // doing HINT_START. This will only happen if there is no mapping which this
1011 // point is the start of. I'm not 100% sure this is OK...
1012 const DOMTextMapping& map = mSoftTextDOMMapping[index];
1013 int32_t offset = aSoftTextOffset - map.mSoftTextOffset;
1014 if (offset >= 0 && offset <= map.mLength)
1015 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
1017 return NodeOffset(nullptr, -1);
1020 // static
1021 void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint,
1022 nsACString& aResult) {
1023 switch (aHint) {
1024 case HINT_BEGIN:
1025 aResult.AssignLiteral("begin");
1026 break;
1027 case HINT_END:
1028 aResult.AssignLiteral("end");
1029 break;
1033 int32_t mozInlineSpellWordUtil::FindRealWordContaining(
1034 int32_t aSoftTextOffset, DOMMapHint aHint, bool aSearchForward) const {
1035 if (MOZ_LOG_TEST(sInlineSpellWordUtilLog, LogLevel::Debug)) {
1036 nsAutoCString hint;
1037 mozInlineSpellWordUtil::ToString(aHint, hint);
1039 MOZ_LOG(
1040 sInlineSpellWordUtilLog, LogLevel::Debug,
1041 ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__,
1042 aSoftTextOffset, hint.get(), static_cast<int32_t>(aSearchForward)));
1045 NS_ASSERTION(mSoftTextValid,
1046 "Soft text must be valid if we're to map out of it");
1047 if (!mSoftTextValid) return -1;
1049 // Find the last word, if any, such that mSoftTextOffset <= aSoftTextOffset
1050 size_t index;
1051 bool found = FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &index);
1052 if (!found) {
1053 return -1;
1056 // 'index' is now the last word, if any, such that
1057 // mSoftTextOffset <= aSoftTextOffset.
1058 // If we're doing HINT_END, then we may want to return the end of the
1059 // the previous word instead of the start of this word
1060 if (aHint == HINT_END && index > 0) {
1061 const RealWord& word = mRealWords[index - 1];
1062 if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
1063 return index - 1;
1066 // We allow ourselves to return the end of this word even if we're
1067 // doing HINT_START. This will only happen if there is no word which this
1068 // point is the start of. I'm not 100% sure this is OK...
1069 const RealWord& word = mRealWords[index];
1070 int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
1071 if (offset >= 0 && offset <= static_cast<int32_t>(word.mLength)) return index;
1073 if (aSearchForward) {
1074 if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
1075 // All words have mSoftTextOffset > aSoftTextOffset
1076 return 0;
1078 // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.
1079 // Word index+1, if it exists, will be the first with
1080 // mSoftTextOffset > aSoftTextOffset.
1081 if (index + 1 < mRealWords.Length()) return index + 1;
1084 return -1;
1087 // mozInlineSpellWordUtil::SplitDOMWord
1089 nsresult mozInlineSpellWordUtil::SplitDOMWord(int32_t aStart, int32_t aEnd) {
1090 nsDependentSubstring targetText(mSoftText, aStart, aEnd - aStart);
1091 WordSplitState<nsDependentSubstring> state(targetText);
1092 state.mCurCharClass = state.ClassifyCharacter(0, true);
1094 state.AdvanceThroughSeparators();
1095 if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && state.IsSpecialWord()) {
1096 int32_t specialWordLength =
1097 state.mDOMWordText.Length() - state.mDOMWordOffset;
1098 if (!mRealWords.AppendElement(
1099 RealWord(aStart + state.mDOMWordOffset, specialWordLength, false),
1100 fallible)) {
1101 return NS_ERROR_OUT_OF_MEMORY;
1104 return NS_OK;
1107 while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
1108 state.AdvanceThroughSeparators();
1109 if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) break;
1111 // save the beginning of the word
1112 int32_t wordOffset = state.mDOMWordOffset;
1114 // find the end of the word
1115 state.AdvanceThroughWord();
1116 int32_t wordLen = state.mDOMWordOffset - wordOffset;
1117 if (!mRealWords.AppendElement(
1118 RealWord(aStart + wordOffset, wordLen,
1119 !state.ShouldSkipWord(wordOffset, wordLen)),
1120 fallible)) {
1121 return NS_ERROR_OUT_OF_MEMORY;
1125 return NS_OK;