layout/generic/nsTextFrameUtils.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #include "nsTextFrameUtils.h"
   8
   9 #include "mozilla/dom/Text.h"
  10 #include "nsBidiUtils.h"
  11 #include "nsCharTraits.h"
  12 #include "nsIContent.h"
  13 #include "nsStyleStruct.h"
  14 #include "nsTextFragment.h"
  15 #include "nsUnicharUtils.h"
  16 #include "nsUnicodeProperties.h"
  17 #include <algorithm>
  18
  19 using namespace mozilla;
  20 using namespace mozilla::dom;
  21
  22 // static
  23 bool nsTextFrameUtils::IsSpaceCombiningSequenceTail(const char16_t* aChars,
  24                                                     int32_t aLength) {
  25   return aLength > 0 &&
  26          (mozilla::unicode::IsClusterExtenderExcludingJoiners(aChars[0]) ||
  27           (IsBidiControl(aChars[0]) &&
  28            IsSpaceCombiningSequenceTail(aChars + 1, aLength - 1)));
  29 }
  30
  31 static bool IsDiscardable(char16_t ch, nsTextFrameUtils::Flags* aFlags) {
  32   // Unlike IS_DISCARDABLE, we don't discard \r. \r will be ignored by
  33   // gfxTextRun and discarding it would force us to copy text in many cases of
  34   // preformatted text containing \r\n.
  35   if (ch == CH_SHY) {
  36     *aFlags |= nsTextFrameUtils::Flags::HasShy;
  37     return true;
  38   }
  39   return IsBidiControl(ch);
  40 }
  41
  42 static bool IsDiscardable(uint8_t ch, nsTextFrameUtils::Flags* aFlags) {
  43   if (ch == CH_SHY) {
  44     *aFlags |= nsTextFrameUtils::Flags::HasShy;
  45     return true;
  46   }
  47   return false;
  48 }
  49
  50 static bool IsSegmentBreak(char16_t aCh) { return aCh == '\n'; }
  51
  52 static bool IsSpaceOrTab(char16_t aCh) { return aCh == ' ' || aCh == '\t'; }
  53
  54 static bool IsSpaceOrTabOrSegmentBreak(char16_t aCh) {
  55   return IsSpaceOrTab(aCh) || IsSegmentBreak(aCh);
  56 }
  57
  58 template <typename CharT>
  59 /* static */
  60 bool nsTextFrameUtils::IsSkippableCharacterForTransformText(CharT aChar) {
  61   return aChar == ' ' || aChar == '\t' || aChar == '\n' || aChar == CH_SHY ||
  62          (aChar > 0xFF && IsBidiControl(aChar));
  63 }
  64
  65 #ifdef DEBUG
  66 template <typename CharT>
  67 static void AssertSkippedExpectedChars(const CharT* aText,
  68                                        const gfxSkipChars& aSkipChars,
  69                                        int32_t aSkipCharsOffset) {
  70   gfxSkipCharsIterator it(aSkipChars);
  71   it.AdvanceOriginal(aSkipCharsOffset);
  72   while (it.GetOriginalOffset() < it.GetOriginalEnd()) {
  73     CharT ch = aText[it.GetOriginalOffset() - aSkipCharsOffset];
  74     MOZ_ASSERT(!it.IsOriginalCharSkipped() ||
  75                    nsTextFrameUtils::IsSkippableCharacterForTransformText(ch),
  76                "skipped unexpected character; need to update "
  77                "IsSkippableCharacterForTransformText?");
  78     it.AdvanceOriginal(1);
  79   }
  80 }
  81 #endif
  82
  83 template <class CharT>
  84 static CharT* TransformWhiteSpaces(
  85     const CharT* aText, uint32_t aLength, uint32_t aBegin, uint32_t aEnd,
  86     bool aHasSegmentBreak, bool& aInWhitespace, CharT* aOutput,
  87     nsTextFrameUtils::Flags& aFlags,
  88     nsTextFrameUtils::CompressionMode aCompression, gfxSkipChars* aSkipChars) {
  89   MOZ_ASSERT(aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE ||
  90                  aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE_NEWLINE,
  91              "whitespaces should be skippable!!");
  92   // Get the context preceding/following this white space range.
  93   // For 8-bit text (sizeof CharT == 1), the checks here should get optimized
  94   // out, and isSegmentBreakSkippable should be initialized to be 'false'.
  95   bool isSegmentBreakSkippable =
  96       sizeof(CharT) > 1 &&
  97       ((aBegin > 0 && IS_ZERO_WIDTH_SPACE(aText[aBegin - 1])) ||
  98        (aEnd < aLength && IS_ZERO_WIDTH_SPACE(aText[aEnd])));
  99   if (sizeof(CharT) > 1 && !isSegmentBreakSkippable && aBegin > 0 &&
 100       aEnd < aLength) {
 101     uint32_t ucs4before;
 102     uint32_t ucs4after;
 103     if (aBegin > 1 &&
 104         NS_IS_SURROGATE_PAIR(aText[aBegin - 2], aText[aBegin - 1])) {
 105       ucs4before = SURROGATE_TO_UCS4(aText[aBegin - 2], aText[aBegin - 1]);
 106     } else {
 107       ucs4before = aText[aBegin - 1];
 108     }
 109     if (aEnd + 1 < aLength &&
 110         NS_IS_SURROGATE_PAIR(aText[aEnd], aText[aEnd + 1])) {
 111       ucs4after = SURROGATE_TO_UCS4(aText[aEnd], aText[aEnd + 1]);
 112     } else {
 113       ucs4after = aText[aEnd];
 114     }
 115     // Discard newlines between characters that have F, W, or H
 116     // EastAsianWidth property and neither side is Hangul.
 117     isSegmentBreakSkippable =
 118         IsSegmentBreakSkipChar(ucs4before) && IsSegmentBreakSkipChar(ucs4after);
 119   }
 120
 121   for (uint32_t i = aBegin; i < aEnd; ++i) {
 122     CharT ch = aText[i];
 123     bool keepChar = false;
 124     bool keepTransformedWhiteSpace = false;
 125     if (IsDiscardable(ch, &aFlags)) {
 126       aSkipChars->SkipChar();
 127       continue;
 128     }
 129     if (IsSpaceOrTab(ch)) {
 130       if (aHasSegmentBreak) {
 131         // If white-space is set to normal, nowrap, or pre-line, white space
 132         // characters are considered collapsible and all spaces and tabs
 133         // immediately preceding or following a segment break are removed.
 134         aSkipChars->SkipChar();
 135         continue;
 136       }
 137
 138       if (aInWhitespace) {
 139         aSkipChars->SkipChar();
 140         continue;
 141       } else {
 142         keepTransformedWhiteSpace = true;
 143       }
 144     } else {
 145       // Apply Segment Break Transformation Rules (CSS Text 3 - 4.1.2) for
 146       // segment break characters.
 147       if (aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE ||
 148           // XXX: According to CSS Text 3, a lone CR should not always be
 149           //      kept, but still go through the Segment Break Transformation
 150           //      Rules. However, this is what current modern browser engines
 151           //      (webkit/blink/edge) do. So, once we can get some clarity
 152           //      from the specification issue, we should either remove the
 153           //      lone CR condition here, or leave it here with this comment
 154           //      being rephrased.
 155           //      Please see https://github.com/w3c/csswg-drafts/issues/855.
 156           ch == '\r') {
 157         keepChar = true;
 158       } else {
 159         // aCompression == COMPRESS_WHITESPACE_NEWLINE
 160
 161         // Any collapsible segment break immediately following another
 162         // collapsible segment break is removed.  Then the remaining segment
 163         // break is either transformed into a space (U+0020) or removed
 164         // depending on the context before and after the break.
 165         if (isSegmentBreakSkippable || aInWhitespace) {
 166           aSkipChars->SkipChar();
 167           continue;
 168         }
 169         isSegmentBreakSkippable = true;
 170         keepTransformedWhiteSpace = true;
 171       }
 172     }
 173
 174     if (keepChar) {
 175       *aOutput++ = ch;
 176       aSkipChars->KeepChar();
 177       aInWhitespace = IsSpaceOrTab(ch);
 178     } else if (keepTransformedWhiteSpace) {
 179       *aOutput++ = ' ';
 180       aSkipChars->KeepChar();
 181       aInWhitespace = true;
 182     } else {
 183       MOZ_ASSERT_UNREACHABLE("Should've skipped the character!!");
 184     }
 185   }
 186   return aOutput;
 187 }
 188
 189 template <class CharT>
 190 CharT* nsTextFrameUtils::TransformText(const CharT* aText, uint32_t aLength,
 191                                        CharT* aOutput,
 192                                        CompressionMode aCompression,
 193                                        uint8_t* aIncomingFlags,
 194                                        gfxSkipChars* aSkipChars,
 195                                        Flags* aAnalysisFlags) {
 196   Flags flags = Flags();
 197 #ifdef DEBUG
 198   int32_t skipCharsOffset = aSkipChars->GetOriginalCharCount();
 199 #endif
 200
 201   bool lastCharArabic = false;
 202   if (aCompression == COMPRESS_NONE ||
 203       aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) {
 204     // Skip discardables.
 205     uint32_t i;
 206     for (i = 0; i < aLength; ++i) {
 207       CharT ch = aText[i];
 208       if (IsDiscardable(ch, &flags)) {
 209         aSkipChars->SkipChar();
 210       } else {
 211         aSkipChars->KeepChar();
 212         if (ch > ' ') {
 213           lastCharArabic = IS_ARABIC_CHAR(ch);
 214         } else if (aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) {
 215           if (ch == '\t' || ch == '\n') {
 216             ch = ' ';
 217           }
 218         } else {
 219           // aCompression == COMPRESS_NONE
 220           if (ch == '\t') {
 221             flags |= Flags::HasTab;
 222           } else if (ch == '\n') {
 223             flags |= Flags::HasNewline;
 224           }
 225         }
 226         *aOutput++ = ch;
 227       }
 228     }
 229     if (lastCharArabic) {
 230       *aIncomingFlags |= INCOMING_ARABICCHAR;
 231     } else {
 232       *aIncomingFlags &= ~INCOMING_ARABICCHAR;
 233     }
 234     *aIncomingFlags &= ~INCOMING_WHITESPACE;
 235   } else {
 236     bool inWhitespace = (*aIncomingFlags & INCOMING_WHITESPACE) != 0;
 237     uint32_t i;
 238     for (i = 0; i < aLength; ++i) {
 239       CharT ch = aText[i];
 240       // CSS Text 3 - 4.1. The White Space Processing Rules
 241       // White space processing in CSS affects only the document white space
 242       // characters: spaces (U+0020), tabs (U+0009), and segment breaks.
 243       // Since we need the context of segment breaks and their surrounding
 244       // white spaces to proceed the white space processing, a consecutive run
 245       // of spaces/tabs/segment breaks is collected in a first pass loop, then
 246       // we apply the collapsing and transformation rules to this run in a
 247       // second pass loop.
 248       if (IsSpaceOrTabOrSegmentBreak(ch)) {
 249         bool keepLastSpace = false;
 250         bool hasSegmentBreak = IsSegmentBreak(ch);
 251         uint32_t countTrailingDiscardables = 0;
 252         uint32_t j;
 253         for (j = i + 1; j < aLength && (IsSpaceOrTabOrSegmentBreak(aText[j]) ||
 254                                         IsDiscardable(aText[j], &flags));
 255              j++) {
 256           if (IsSegmentBreak(aText[j])) {
 257             hasSegmentBreak = true;
 258           }
 259         }
 260         // Exclude trailing discardables before checking space combining
 261         // sequence tail.
 262         for (; IsDiscardable(aText[j - 1], &flags); j--) {
 263           countTrailingDiscardables++;
 264         }
 265         // If the last white space is followed by a combining sequence tail,
 266         // exclude it from the range of TransformWhiteSpaces.
 267         if (sizeof(CharT) > 1 && aText[j - 1] == ' ' && j < aLength &&
 268             IsSpaceCombiningSequenceTail(&aText[j], aLength - j)) {
 269           keepLastSpace = true;
 270           j--;
 271         }
 272         if (j > i) {
 273           aOutput = TransformWhiteSpaces(aText, aLength, i, j, hasSegmentBreak,
 274                                          inWhitespace, aOutput, flags,
 275                                          aCompression, aSkipChars);
 276         }
 277         // We need to keep KeepChar()/SkipChar() in order, so process the
 278         // last white space first, then process the trailing discardables.
 279         if (keepLastSpace) {
 280           keepLastSpace = false;
 281           *aOutput++ = ' ';
 282           aSkipChars->KeepChar();
 283           lastCharArabic = false;
 284           j++;
 285         }
 286         for (; countTrailingDiscardables > 0; countTrailingDiscardables--) {
 287           aSkipChars->SkipChar();
 288           j++;
 289         }
 290         i = j - 1;
 291         continue;
 292       }
 293       // Process characters other than the document white space characters.
 294       if (IsDiscardable(ch, &flags)) {
 295         aSkipChars->SkipChar();
 296       } else {
 297         *aOutput++ = ch;
 298         aSkipChars->KeepChar();
 299       }
 300       lastCharArabic = IS_ARABIC_CHAR(ch);
 301       inWhitespace = false;
 302     }
 303
 304     if (lastCharArabic) {
 305       *aIncomingFlags |= INCOMING_ARABICCHAR;
 306     } else {
 307       *aIncomingFlags &= ~INCOMING_ARABICCHAR;
 308     }
 309     if (inWhitespace) {
 310       *aIncomingFlags |= INCOMING_WHITESPACE;
 311     } else {
 312       *aIncomingFlags &= ~INCOMING_WHITESPACE;
 313     }
 314   }
 315
 316   *aAnalysisFlags = flags;
 317
 318 #ifdef DEBUG
 319   AssertSkippedExpectedChars(aText, *aSkipChars, skipCharsOffset);
 320 #endif
 321   return aOutput;
 322 }
 323
/*
 * NOTE: The TransformText and IsSkippableCharacterForTransformText template
 * functions are part of the public API of nsTextFrameUtils, but their function
 * bodies are not available in the header. Calls to them may stop working
 * (fail to resolve the symbol at link time) once their call sites are moved to
 * a different translation unit (e.g. a different unified source file).
 * Explicitly instantiating these function templates with `uint8_t` and
 * `char16_t` protects us from that risk.
 */
 333 template uint8_t* nsTextFrameUtils::TransformText(
 334     const uint8_t* aText, uint32_t aLength, uint8_t* aOutput,
 335     CompressionMode aCompression, uint8_t* aIncomingFlags,
 336     gfxSkipChars* aSkipChars, Flags* aAnalysisFlags);
 337 template char16_t* nsTextFrameUtils::TransformText(
 338     const char16_t* aText, uint32_t aLength, char16_t* aOutput,
 339     CompressionMode aCompression, uint8_t* aIncomingFlags,
 340     gfxSkipChars* aSkipChars, Flags* aAnalysisFlags);
 341 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
 342     uint8_t aChar);
 343 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
 344     char16_t aChar);
 345
 346 template <typename CharT>
 347 static uint32_t DoComputeApproximateLengthWithWhitespaceCompression(
 348     const CharT* aChars, uint32_t aLength, const nsStyleText* aStyleText) {
 349   // This is an approximation so we don't really need anything
 350   // too fancy here.
 351   uint32_t len;
 352   if (aStyleText->WhiteSpaceIsSignificant()) {
 353     return aLength;
 354   }
 355   bool prevWS = true;  // more important to ignore blocks with
 356                        // only whitespace than get inline boundaries
 357                        // exactly right
 358   len = 0;
 359   for (uint32_t i = 0; i < aLength; ++i) {
 360     CharT c = aChars[i];
 361     if (c == ' ' || c == '\n' || c == '\t' || c == '\r') {
 362       if (!prevWS) {
 363         ++len;
 364       }
 365       prevWS = true;
 366     } else {
 367       ++len;
 368       prevWS = false;
 369     }
 370   }
 371   return len;
 372 }
 373
 374 uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression(
 375     Text* aText, const nsStyleText* aStyleText) {
 376   const nsTextFragment* frag = &aText->TextFragment();
 377   if (frag->Is2b()) {
 378     return DoComputeApproximateLengthWithWhitespaceCompression(
 379         frag->Get2b(), frag->GetLength(), aStyleText);
 380   }
 381   return DoComputeApproximateLengthWithWhitespaceCompression(
 382       frag->Get1b(), frag->GetLength(), aStyleText);
 383 }
 384
 385 uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression(
 386     const nsAString& aString, const nsStyleText* aStyleText) {
 387   return DoComputeApproximateLengthWithWhitespaceCompression(
 388       aString.BeginReading(), aString.Length(), aStyleText);
 389 }
 390
 391 bool nsSkipCharsRunIterator::NextRun() {
 392   do {
 393     if (mRunLength) {
 394       mIterator.AdvanceOriginal(mRunLength);
 395       NS_ASSERTION(mRunLength > 0,
 396                    "No characters in run (initial length too large?)");
 397       if (!mSkipped || mLengthIncludesSkipped) {
 398         mRemainingLength -= mRunLength;
 399       }
 400     }
 401     if (!mRemainingLength) {
 402       return false;
 403     }
 404     int32_t length;
 405     mSkipped = mIterator.IsOriginalCharSkipped(&length);
 406     mRunLength = std::min(length, mRemainingLength);
 407   } while (!mVisitSkipped && mSkipped);
 408
 409   return true;
 410 }