netwerk/streamconv/converters/mozTXTToHTMLConv.cpp

   1 /* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* This Source Code Form is subject to the terms of the Mozilla Public
   3  * License, v. 2.0. If a copy of the MPL was not distributed with this
   4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   5
   6 #include "mozilla/TextUtils.h"
   7 #include "mozTXTToHTMLConv.h"
   8 #include "mozilla/intl/Segmenter.h"
   9 #include "mozilla/Maybe.h"
  10 #include "nsIThreadRetargetableStreamListener.h"
  11 #include "nsNetUtil.h"
  12 #include "nsUnicharUtils.h"
  13 #include "nsUnicodeProperties.h"
  14 #include "nsCRT.h"
  15 #include "nsIExternalProtocolHandler.h"
  16 #include "nsIURI.h"
  17
  18 #include <algorithm>
  19
  20 #ifdef DEBUG_BenB_Perf
  21 #  include "prtime.h"
  22 #  include "prinrval.h"
  23 #endif
  24
  25 using mozilla::IsAscii;
  26 using mozilla::IsAsciiAlpha;
  27 using mozilla::IsAsciiDigit;
  28 using mozilla::Maybe;
  29 using mozilla::Some;
  30 using mozilla::Span;
  31 using mozilla::intl::GraphemeClusterBreakIteratorUtf16;
  32 using mozilla::intl::GraphemeClusterBreakReverseIteratorUtf16;
  33
  34 const double growthRate = 1.2;
  35
  36 // Bug 183111, editor now replaces multiple spaces with leading
  37 // 0xA0's and a single ending space, so need to treat 0xA0's as spaces.
  38 // 0xA0 is the Latin1/Unicode character for "non-breaking space (nbsp)"
  39 // Also recognize the Japanese ideographic space 0x3000 as a space.
  40 static inline bool IsSpace(const char16_t aChar) {
  41   return (nsCRT::IsAsciiSpace(aChar) || aChar == 0xA0 || aChar == 0x3000);
  42 }
  43
  44 // Escape Char will take ch, escape it and append the result to
  45 // aStringToAppendTo
  46 void mozTXTToHTMLConv::EscapeChar(const char16_t ch,
  47                                   nsAString& aStringToAppendTo,
  48                                   bool inAttribute) {
  49   switch (ch) {
  50     case '<':
  51       aStringToAppendTo.AppendLiteral("&lt;");
  52       break;
  53     case '>':
  54       aStringToAppendTo.AppendLiteral("&gt;");
  55       break;
  56     case '&':
  57       aStringToAppendTo.AppendLiteral("&amp;");
  58       break;
  59     case '"':
  60       if (inAttribute) {
  61         aStringToAppendTo.AppendLiteral("&quot;");
  62         break;
  63       }
  64       // else fall through
  65       [[fallthrough]];
  66     default:
  67       aStringToAppendTo += ch;
  68   }
  69 }
  70
  71 // EscapeStr takes the passed in string and
  72 // escapes it IN PLACE.
  73 void mozTXTToHTMLConv::EscapeStr(nsString& aInString, bool inAttribute) {
  74   // the replace substring routines
  75   // don't seem to work if you have a character
  76   // in the in string that is also in the replacement
  77   // string! =(
  78   // aInString.ReplaceSubstring("&", "&amp;");
  79   // aInString.ReplaceSubstring("<", "&lt;");
  80   // aInString.ReplaceSubstring(">", "&gt;");
  81   for (uint32_t i = 0; i < aInString.Length();) {
  82     switch (aInString[i]) {
  83       case '<':
  84         aInString.Cut(i, 1);
  85         aInString.InsertLiteral(u"&lt;", i);
  86         i += 4;  // skip past the integers we just added
  87         break;
  88       case '>':
  89         aInString.Cut(i, 1);
  90         aInString.InsertLiteral(u"&gt;", i);
  91         i += 4;  // skip past the integers we just added
  92         break;
  93       case '&':
  94         aInString.Cut(i, 1);
  95         aInString.InsertLiteral(u"&amp;", i);
  96         i += 5;  // skip past the integers we just added
  97         break;
  98       case '"':
  99         if (inAttribute) {
 100           aInString.Cut(i, 1);
 101           aInString.InsertLiteral(u"&quot;", i);
 102           i += 6;
 103           break;
 104         }
 105         // else fall through
 106         [[fallthrough]];
 107       default:
 108         i++;
 109     }
 110   }
 111 }
 112
 113 void mozTXTToHTMLConv::UnescapeStr(const char16_t* aInString, int32_t aStartPos,
 114                                    int32_t aLength, nsString& aOutString) {
 115   const char16_t* subString = nullptr;
 116   for (uint32_t i = aStartPos; int32_t(i) - aStartPos < aLength;) {
 117     int32_t remainingChars = i - aStartPos;
 118     if (aInString[i] == '&') {
 119       subString = &aInString[i];
 120       if (!NS_strncmp(subString, u"&lt;",
 121                       std::min(4, aLength - remainingChars))) {
 122         aOutString.Append(char16_t('<'));
 123         i += 4;
 124       } else if (!NS_strncmp(subString, u"&gt;",
 125                              std::min(4, aLength - remainingChars))) {
 126         aOutString.Append(char16_t('>'));
 127         i += 4;
 128       } else if (!NS_strncmp(subString, u"&amp;",
 129                              std::min(5, aLength - remainingChars))) {
 130         aOutString.Append(char16_t('&'));
 131         i += 5;
 132       } else if (!NS_strncmp(subString, u"&quot;",
 133                              std::min(6, aLength - remainingChars))) {
 134         aOutString.Append(char16_t('"'));
 135         i += 6;
 136       } else {
 137         aOutString += aInString[i];
 138         i++;
 139       }
 140     } else {
 141       aOutString += aInString[i];
 142       i++;
 143     }
 144   }
 145 }
 146
 147 void mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t* aInString,
 148                                               int32_t aInLength,
 149                                               const uint32_t pos,
 150                                               nsString& aOutString) {
 151   NS_ASSERTION(int32_t(pos) < aInLength,
 152                "bad args to CompleteAbbreviatedURL, see bug #190851");
 153   if (int32_t(pos) >= aInLength) return;
 154
 155   if (aInString[pos] == '@') {
 156     // only pre-pend a mailto url if the string contains a .domain in it..
 157     // i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm"
 158     nsDependentString inString(aInString, aInLength);
 159     if (inString.FindChar('.', pos) !=
 160         kNotFound)  // if we have a '.' after the @ sign....
 161     {
 162       aOutString.AssignLiteral("mailto:");
 163       aOutString += aInString;
 164     }
 165   } else if (aInString[pos] == '.') {
 166     if (ItMatchesDelimited(aInString, aInLength, u"www.", 4, LT_IGNORE,
 167                            LT_IGNORE)) {
 168       aOutString.AssignLiteral("http://");
 169       aOutString += aInString;
 170     }
 171   }
 172 }
 173
 174 bool mozTXTToHTMLConv::FindURLStart(const char16_t* aInString,
 175                                     int32_t aInLength, const uint32_t pos,
 176                                     const modetype check, uint32_t& start) {
 177   switch (check) {  // no breaks, because end of blocks is never reached
 178     case RFC1738: {
 179       if (!NS_strncmp(&aInString[std::max(int32_t(pos - 4), 0)], u"<URL:", 5)) {
 180         start = pos + 1;
 181         return true;
 182       }
 183       return false;
 184     }
 185     case RFC2396E: {
 186       nsDependentSubstring temp(aInString, aInLength);
 187       int32_t i = pos <= 0 ? kNotFound : temp.RFindCharInSet(u"<>\"", pos - 1);
 188       if (i != kNotFound &&
 189           (temp[uint32_t(i)] == '<' || temp[uint32_t(i)] == '"')) {
 190         start = uint32_t(++i);
 191         return start < pos;
 192       }
 193       return false;
 194     }
 195     case freetext: {
 196       int32_t i = pos - 1;
 197       for (; i >= 0 &&
 198              (IsAsciiAlpha(aInString[uint32_t(i)]) ||
 199               IsAsciiDigit(aInString[uint32_t(i)]) ||
 200               aInString[uint32_t(i)] == '+' || aInString[uint32_t(i)] == '-' ||
 201               aInString[uint32_t(i)] == '.');
 202            i--) {
 203         ;
 204       }
 205       if (++i >= 0 && uint32_t(i) < pos &&
 206           IsAsciiAlpha(aInString[uint32_t(i)])) {
 207         start = uint32_t(i);
 208         return true;
 209       }
 210       return false;
 211     }
 212     case abbreviated: {
 213       int32_t i = pos - 1;
 214       // This disallows non-ascii-characters for email.
 215       // Currently correct, but revisit later after standards changed.
 216       bool isEmail = aInString[pos] == (char16_t)'@';
 217       // These chars mark the start of the URL
 218       for (; i >= 0 && aInString[uint32_t(i)] != '>' &&
 219              aInString[uint32_t(i)] != '<' && aInString[uint32_t(i)] != '"' &&
 220              aInString[uint32_t(i)] != '\'' && aInString[uint32_t(i)] != '`' &&
 221              aInString[uint32_t(i)] != ',' && aInString[uint32_t(i)] != '{' &&
 222              aInString[uint32_t(i)] != '[' && aInString[uint32_t(i)] != '(' &&
 223              aInString[uint32_t(i)] != '|' && aInString[uint32_t(i)] != '\\' &&
 224              !IsSpace(aInString[uint32_t(i)]) &&
 225              (!isEmail || IsAscii(aInString[uint32_t(i)])) &&
 226              (!isEmail || aInString[uint32_t(i)] != ')');
 227            i--) {
 228         ;
 229       }
 230       if (++i >= 0 && uint32_t(i) < pos &&
 231           (IsAsciiAlpha(aInString[uint32_t(i)]) ||
 232            IsAsciiDigit(aInString[uint32_t(i)]))) {
 233         start = uint32_t(i);
 234         return true;
 235       }
 236       return false;
 237     }
 238     default:
 239       return false;
 240   }  // switch
 241 }
 242
 243 bool mozTXTToHTMLConv::FindURLEnd(const char16_t* aInString,
 244                                   int32_t aInStringLength, const uint32_t pos,
 245                                   const modetype check, const uint32_t start,
 246                                   uint32_t& end) {
 247   switch (check) {  // no breaks, because end of blocks is never reached
 248     case RFC1738:
 249     case RFC2396E: {
 250       nsDependentSubstring temp(aInString, aInStringLength);
 251
 252       int32_t i = temp.FindCharInSet(u"<>\"", pos + 1);
 253       if (i != kNotFound &&
 254           temp[uint32_t(i--)] ==
 255               (check == RFC1738 || temp[start - 1] == '<' ? '>' : '"')) {
 256         end = uint32_t(i);
 257         return end > pos;
 258       }
 259       return false;
 260     }
 261     case freetext:
 262     case abbreviated: {
 263       uint32_t i = pos + 1;
 264       bool isEmail = aInString[pos] == (char16_t)'@';
 265       bool seenOpeningParenthesis = false;  // there is a '(' earlier in the URL
 266       bool seenOpeningSquareBracket =
 267           false;  // there is a '[' earlier in the URL
 268       for (; int32_t(i) < aInStringLength; i++) {
 269         // These chars mark the end of the URL
 270         if (aInString[i] == '>' || aInString[i] == '<' || aInString[i] == '"' ||
 271             aInString[i] == '`' || aInString[i] == '}' || aInString[i] == '{' ||
 272             (aInString[i] == ')' && !seenOpeningParenthesis) ||
 273             (aInString[i] == ']' && !seenOpeningSquareBracket) ||
 274             // Allow IPv6 adresses like http://[1080::8:800:200C:417A]/foo.
 275             (aInString[i] == '[' && i > 2 &&
 276              (aInString[i - 1] != '/' || aInString[i - 2] != '/')) ||
 277             IsSpace(aInString[i])) {
 278           break;
 279         }
 280         // Disallow non-ascii-characters for email.
 281         // Currently correct, but revisit later after standards changed.
 282         if (isEmail && (aInString[i] == '(' || aInString[i] == '\'' ||
 283                         !IsAscii(aInString[i]))) {
 284           break;
 285         }
 286         if (aInString[i] == '(') seenOpeningParenthesis = true;
 287         if (aInString[i] == '[') seenOpeningSquareBracket = true;
 288       }
 289       // These chars are allowed in the middle of the URL, but not at end.
 290       // Technically they are, but are used in normal text after the URL.
 291       while (--i > pos && (aInString[i] == '.' || aInString[i] == ',' ||
 292                            aInString[i] == ';' || aInString[i] == '!' ||
 293                            aInString[i] == '?' || aInString[i] == '-' ||
 294                            aInString[i] == ':' || aInString[i] == '\'')) {
 295         ;
 296       }
 297       if (i > pos) {
 298         end = i;
 299         return true;
 300       }
 301       return false;
 302     }
 303     default:
 304       return false;
 305   }  // switch
 306 }
 307
 308 void mozTXTToHTMLConv::CalculateURLBoundaries(
 309     const char16_t* aInString, int32_t aInStringLength, const uint32_t pos,
 310     const uint32_t whathasbeendone, const modetype check, const uint32_t start,
 311     const uint32_t end, nsString& txtURL, nsString& desc,
 312     int32_t& replaceBefore, int32_t& replaceAfter) {
 313   uint32_t descstart = start;
 314   switch (check) {
 315     case RFC1738: {
 316       descstart = start - 5;
 317       desc.Append(&aInString[descstart],
 318                   end - descstart + 2);  // include "<URL:" and ">"
 319       replaceAfter = end - pos + 1;
 320     } break;
 321     case RFC2396E: {
 322       descstart = start - 1;
 323       desc.Append(&aInString[descstart],
 324                   end - descstart + 2);  // include brackets
 325       replaceAfter = end - pos + 1;
 326     } break;
 327     case freetext:
 328     case abbreviated: {
 329       descstart = start;
 330       desc.Append(&aInString[descstart],
 331                   end - start + 1);  // don't include brackets
 332       replaceAfter = end - pos;
 333     } break;
 334     default:
 335       break;
 336   }  // switch
 337
 338   EscapeStr(desc, false);
 339
 340   txtURL.Append(&aInString[start], end - start + 1);
 341   txtURL.StripWhitespace();
 342
 343   // FIX ME
 344   nsAutoString temp2;
 345   ScanTXT(nsDependentSubstring(&aInString[descstart], pos - descstart),
 346           ~kURLs /*prevents loop*/ & whathasbeendone, temp2);
 347   replaceBefore = temp2.Length();
 348 }
 349
 350 bool mozTXTToHTMLConv::ShouldLinkify(const nsCString& aURL) {
 351   if (!mIOService) return false;
 352
 353   nsAutoCString scheme;
 354   nsresult rv = mIOService->ExtractScheme(aURL, scheme);
 355   if (NS_FAILED(rv)) return false;
 356
 357   if (scheme == "http" || scheme == "https" || scheme == "mailto") {
 358     return true;
 359   }
 360
 361   // Get the handler for this scheme.
 362   nsCOMPtr<nsIProtocolHandler> handler;
 363   rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler));
 364   if (NS_FAILED(rv)) return false;
 365
 366   // Is it an external protocol handler? If not, linkify it.
 367   nsCOMPtr<nsIExternalProtocolHandler> externalHandler =
 368       do_QueryInterface(handler);
 369   if (!externalHandler) return true;  // handler is built-in, linkify it!
 370
 371   // If external app exists for the scheme then linkify it.
 372   bool exists;
 373   rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists);
 374   return (NS_SUCCEEDED(rv) && exists);
 375 }
 376
 377 bool mozTXTToHTMLConv::CheckURLAndCreateHTML(const nsString& txtURL,
 378                                              const nsString& desc,
 379                                              const modetype mode,
 380                                              nsString& outputHTML) {
 381   // Create *uri from txtURL
 382   nsCOMPtr<nsIURI> uri;
 383   nsresult rv;
 384   // Lazily initialize mIOService
 385   if (!mIOService) {
 386     mIOService = do_GetIOService();
 387
 388     if (!mIOService) return false;
 389   }
 390
 391   // See if the url should be linkified.
 392   NS_ConvertUTF16toUTF8 utf8URL(txtURL);
 393   if (!ShouldLinkify(utf8URL)) return false;
 394
 395   // it would be faster if we could just check to see if there is a protocol
 396   // handler for the url and return instead of actually trying to create a
 397   // url...
 398   rv = mIOService->NewURI(utf8URL, nullptr, nullptr, getter_AddRefs(uri));
 399
 400   // Real work
 401   if (NS_SUCCEEDED(rv) && uri) {
 402     outputHTML.AssignLiteral("<a class=\"moz-txt-link-");
 403     switch (mode) {
 404       case RFC1738:
 405         outputHTML.AppendLiteral("rfc1738");
 406         break;
 407       case RFC2396E:
 408         outputHTML.AppendLiteral("rfc2396E");
 409         break;
 410       case freetext:
 411         outputHTML.AppendLiteral("freetext");
 412         break;
 413       case abbreviated:
 414         outputHTML.AppendLiteral("abbreviated");
 415         break;
 416       default:
 417         break;
 418     }
 419     nsAutoString escapedURL(txtURL);
 420     EscapeStr(escapedURL, true);
 421
 422     outputHTML.AppendLiteral("\" href=\"");
 423     outputHTML += escapedURL;
 424     outputHTML.AppendLiteral("\">");
 425     outputHTML += desc;
 426     outputHTML.AppendLiteral("</a>");
 427     return true;
 428   }
 429   return false;
 430 }
 431
 432 NS_IMETHODIMP mozTXTToHTMLConv::FindURLInPlaintext(const char16_t* aInString,
 433                                                    int32_t aInLength,
 434                                                    int32_t aPos,
 435                                                    int32_t* aStartPos,
 436                                                    int32_t* aEndPos) {
 437   // call FindURL on the passed in string
 438   nsAutoString outputHTML;  // we'll ignore the generated output HTML
 439
 440   *aStartPos = -1;
 441   *aEndPos = -1;
 442
 443   FindURL(aInString, aInLength, aPos, kURLs, outputHTML, *aStartPos, *aEndPos);
 444
 445   return NS_OK;
 446 }
 447
 448 bool mozTXTToHTMLConv::FindURL(const char16_t* aInString, int32_t aInLength,
 449                                const uint32_t pos,
 450                                const uint32_t whathasbeendone,
 451                                nsString& outputHTML, int32_t& replaceBefore,
 452                                int32_t& replaceAfter) {
 453   enum statetype { unchecked, invalid, startok, endok, success };
 454   static const modetype ranking[] = {RFC1738, RFC2396E, freetext, abbreviated};
 455
 456   statetype state[mozTXTToHTMLConv_lastMode + 1];  // 0(=unknown)..lastMode
 457   /* I don't like this abuse of enums as index for the array,
 458      but I don't know a better method */
 459
 460   // Define, which modes to check
 461   /* all modes but abbreviated are checked for text[pos] == ':',
 462      only abbreviated for '.', RFC2396E and abbreviated for '@' */
 463   for (modetype iState = unknown; iState <= mozTXTToHTMLConv_lastMode;
 464        iState = modetype(iState + 1)) {
 465     state[iState] = aInString[pos] == ':' ? unchecked : invalid;
 466   }
 467   switch (aInString[pos]) {
 468     case '@':
 469       state[RFC2396E] = unchecked;
 470       [[fallthrough]];
 471     case '.':
 472       state[abbreviated] = unchecked;
 473       break;
 474     case ':':
 475       state[abbreviated] = invalid;
 476       break;
 477     default:
 478       break;
 479   }
 480
 481   // Test, first successful mode wins, sequence defined by |ranking|
 482   int32_t iCheck = 0;  // the currently tested modetype
 483   modetype check = ranking[iCheck];
 484   for (; iCheck < mozTXTToHTMLConv_numberOfModes && state[check] != success;
 485        iCheck++)
 486   /* check state from last run.
 487      If this is the first, check this one, which isn't = success yet */
 488   {
 489     check = ranking[iCheck];
 490
 491     uint32_t start, end;
 492
 493     if (state[check] == unchecked) {
 494       if (FindURLStart(aInString, aInLength, pos, check, start)) {
 495         state[check] = startok;
 496       }
 497     }
 498
 499     if (state[check] == startok) {
 500       if (FindURLEnd(aInString, aInLength, pos, check, start, end)) {
 501         state[check] = endok;
 502       }
 503     }
 504
 505     if (state[check] == endok) {
 506       nsAutoString txtURL, desc;
 507       int32_t resultReplaceBefore, resultReplaceAfter;
 508
 509       CalculateURLBoundaries(aInString, aInLength, pos, whathasbeendone, check,
 510                              start, end, txtURL, desc, resultReplaceBefore,
 511                              resultReplaceAfter);
 512
 513       if (aInString[pos] != ':') {
 514         nsAutoString temp = txtURL;
 515         txtURL.SetLength(0);
 516         CompleteAbbreviatedURL(temp.get(), temp.Length(), pos - start, txtURL);
 517       }
 518
 519       if (!txtURL.IsEmpty() &&
 520           CheckURLAndCreateHTML(txtURL, desc, check, outputHTML)) {
 521         replaceBefore = resultReplaceBefore;
 522         replaceAfter = resultReplaceAfter;
 523         state[check] = success;
 524       }
 525     }  // if
 526   }    // for
 527   return state[check] == success;
 528 }
 529
 530 static inline bool IsAlpha(const uint32_t aChar) {
 531   return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kLetter;
 532 }
 533
 534 static inline bool IsDigit(const uint32_t aChar) {
 535   return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kNumber;
 536 }
 537
 538 bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t* aInString,
 539                                           int32_t aInLength,
 540                                           const char16_t* rep, int32_t aRepLen,
 541                                           LIMTYPE before, LIMTYPE after) {
 542   // this little method gets called a LOT. I found we were spending a
 543   // lot of time just calculating the length of the variable "rep"
 544   // over and over again every time we called it. So we're now passing
 545   // an integer in here.
 546   int32_t textLen = aInLength;
 547
 548   if (((before == LT_IGNORE && (after == LT_IGNORE || after == LT_DELIMITER)) &&
 549        textLen < aRepLen) ||
 550       ((before != LT_IGNORE || (after != LT_IGNORE && after != LT_DELIMITER)) &&
 551        textLen < aRepLen + 1) ||
 552       (before != LT_IGNORE && after != LT_IGNORE && after != LT_DELIMITER &&
 553        textLen < aRepLen + 2)) {
 554     return false;
 555   }
 556
 557   uint32_t text0 = aInString[0];
 558   if (aInLength > 1 && NS_IS_SURROGATE_PAIR(text0, aInString[1])) {
 559     text0 = SURROGATE_TO_UCS4(text0, aInString[1]);
 560   }
 561   // find length of the char/cluster to be ignored
 562   int32_t ignoreLen = before == LT_IGNORE ? 0 : 1;
 563   if (ignoreLen) {
 564     GraphemeClusterBreakIteratorUtf16 ci(
 565         Span<const char16_t>(aInString, aInLength));
 566     ignoreLen = *ci.Next();
 567   }
 568
 569   int32_t afterIndex = aRepLen + ignoreLen;
 570   uint32_t textAfterPos = aInString[afterIndex];
 571   if (aInLength > afterIndex + 1 &&
 572       NS_IS_SURROGATE_PAIR(textAfterPos, aInString[afterIndex + 1])) {
 573     textAfterPos = SURROGATE_TO_UCS4(textAfterPos, aInString[afterIndex + 1]);
 574   }
 575
 576   return !((before == LT_ALPHA && !IsAlpha(text0)) ||
 577            (before == LT_DIGIT && !IsDigit(text0)) ||
 578            (before == LT_DELIMITER &&
 579             (IsAlpha(text0) || IsDigit(text0) || text0 == *rep)) ||
 580            (after == LT_ALPHA && !IsAlpha(textAfterPos)) ||
 581            (after == LT_DIGIT && !IsDigit(textAfterPos)) ||
 582            (after == LT_DELIMITER &&
 583             (IsAlpha(textAfterPos) || IsDigit(textAfterPos) ||
 584              textAfterPos == *rep)) ||
 585            !Substring(Substring(aInString, aInString + aInLength), ignoreLen,
 586                       aRepLen)
 587                 .Equals(Substring(rep, rep + aRepLen),
 588                         nsCaseInsensitiveStringComparator));
 589 }
 590
 591 uint32_t mozTXTToHTMLConv::NumberOfMatches(const char16_t* aInString,
 592                                            int32_t aInStringLength,
 593                                            const char16_t* rep, int32_t aRepLen,
 594                                            LIMTYPE before, LIMTYPE after) {
 595   uint32_t result = 0;
 596
 597   // Limit lookahead length to avoid pathological O(n^2) behavior; looking so
 598   // far ahead is unlikely to be important for cases where styling marked-up
 599   // fragments is actually useful anyhow.
 600   const uint32_t len =
 601       std::min(2000u, mozilla::AssertedCast<uint32_t>(aInStringLength));
 602   GraphemeClusterBreakIteratorUtf16 ci(Span<const char16_t>(aInString, len));
 603   for (uint32_t pos = 0; pos < len; pos = *ci.Next()) {
 604     if (ItMatchesDelimited(aInString + pos, aInStringLength - pos, rep, aRepLen,
 605                            before, after)) {
 606       result++;
 607     }
 608   }
 609   return result;
 610 }
 611
 612 // NOTE: the converted html for the phrase is appended to aOutString
 613 // tagHTML and attributeHTML are plain ASCII (literal strings, in fact)
 614 bool mozTXTToHTMLConv::StructPhraseHit(
 615     const char16_t* aInString, int32_t aInStringLength, bool col0,
 616     const char16_t* tagTXT, int32_t aTagTXTLen, const char* tagHTML,
 617     const char* attributeHTML, nsAString& aOutString, uint32_t& openTags) {
 618   /* We're searching for the following pattern:
 619      LT_DELIMITER - "*" - ALPHA -
 620      [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER.
 621      <strong> is only inserted, if existence of a pair could be verified
 622      We use the first opening/closing tag, if we can choose */
 623
 624   const char16_t* newOffset = aInString;
 625   int32_t newLength = aInStringLength;
 626   if (!col0)  // skip the first element?
 627   {
 628     newOffset = &aInString[1];
 629     newLength = aInStringLength - 1;
 630   }
 631
 632   // opening tag
 633   if (ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen,
 634                          (col0 ? LT_IGNORE : LT_DELIMITER),
 635                          LT_ALPHA)  // is opening tag
 636       && NumberOfMatches(newOffset, newLength, tagTXT, aTagTXTLen, LT_ALPHA,
 637                          LT_DELIMITER)  // remaining closing tags
 638              > openTags) {
 639     openTags++;
 640     aOutString.Append('<');
 641     aOutString.AppendASCII(tagHTML);
 642     aOutString.Append(char16_t(' '));
 643     aOutString.AppendASCII(attributeHTML);
 644     aOutString.AppendLiteral("><span class=\"moz-txt-tag\">");
 645     aOutString.Append(tagTXT);
 646     aOutString.AppendLiteral("</span>");
 647     return true;
 648   }
 649
 650   // closing tag
 651   if (openTags > 0 && ItMatchesDelimited(aInString, aInStringLength, tagTXT,
 652                                          aTagTXTLen, LT_ALPHA, LT_DELIMITER)) {
 653     openTags--;
 654     aOutString.AppendLiteral("<span class=\"moz-txt-tag\">");
 655     aOutString.Append(tagTXT);
 656     aOutString.AppendLiteral("</span></");
 657     aOutString.AppendASCII(tagHTML);
 658     aOutString.Append(char16_t('>'));
 659     return true;
 660   }
 661
 662   return false;
 663 }
 664
 665 bool mozTXTToHTMLConv::SmilyHit(const char16_t* aInString, int32_t aLength,
 666                                 bool col0, const char* tagTXT,
 667                                 const nsString& imageName, nsString& outputHTML,
 668                                 int32_t& glyphTextLen) {
 669   if (!aInString || !tagTXT || imageName.IsEmpty()) return false;
 670
 671   int32_t tagLen = strlen(tagTXT);
 672
 673   uint32_t delim = (col0 ? 0 : 1) + tagLen;
 674
 675   if ((col0 || IsSpace(aInString[0])) &&
 676       (aLength <= int32_t(delim) || IsSpace(aInString[delim]) ||
 677        (aLength > int32_t(delim + 1) &&
 678         (aInString[delim] == '.' || aInString[delim] == ',' ||
 679          aInString[delim] == ';' || aInString[delim] == '8' ||
 680          aInString[delim] == '>' || aInString[delim] == '!' ||
 681          aInString[delim] == '?') &&
 682         IsSpace(aInString[delim + 1]))) &&
 683       ItMatchesDelimited(aInString, aLength,
 684                          NS_ConvertASCIItoUTF16(tagTXT).get(), tagLen,
 685                          col0 ? LT_IGNORE : LT_DELIMITER, LT_IGNORE)
 686       // Note: tests at different pos for LT_IGNORE and LT_DELIMITER
 687   ) {
 688     if (!col0) {
 689       outputHTML.Truncate();
 690       outputHTML.Append(char16_t(' '));
 691     }
 692
 693     outputHTML.Append(imageName);  // emoji unicode
 694     glyphTextLen = (col0 ? 0 : 1) + tagLen;
 695     return true;
 696   }
 697
 698   return false;
 699 }
 700
 701 // the glyph is appended to aOutputString instead of the original string...
 702 bool mozTXTToHTMLConv::GlyphHit(const char16_t* aInString, int32_t aInLength,
 703                                 bool col0, nsAString& aOutputString,
 704                                 int32_t& glyphTextLen) {
 705   char16_t text0 = aInString[0];
 706   char16_t text1 = aInString[1];
 707   char16_t firstChar = (col0 ? text0 : text1);
 708
 709   // temporary variable used to store the glyph html text
 710   nsAutoString outputHTML;
 711   bool bTestSmilie;
 712   bool bArg = false;
 713   int i;
 714
 715   // refactor some of this mess to avoid code duplication and speed execution a
 716   // bit there are two cases that need to be tried one after another. To avoid a
 717   // lot of duplicate code, rolling into a loop
 718
 719   i = 0;
 720   while (i < 2) {
 721     bTestSmilie = false;
 722     if (!i && (firstChar == ':' || firstChar == ';' || firstChar == '=' ||
 723                firstChar == '>' || firstChar == '8' || firstChar == 'O')) {
 724       // first test passed
 725
 726       bTestSmilie = true;
 727       bArg = col0;
 728     }
 729     if (i && col0 &&
 730         (text1 == ':' || text1 == ';' || text1 == '=' || text1 == '>' ||
 731          text1 == '8' || text1 == 'O')) {
 732       // second test passed
 733
 734       bTestSmilie = true;
 735       bArg = false;
 736     }
 737     if (bTestSmilie && (SmilyHit(aInString, aInLength, bArg, ":-)",
 738                                  u"🙂"_ns,  // smile, U+1F642
 739                                  outputHTML, glyphTextLen) ||
 740
 741                         SmilyHit(aInString, aInLength, bArg, ":)",
 742                                  u"🙂"_ns,  // smile, U+1F642
 743                                  outputHTML, glyphTextLen) ||
 744
 745                         SmilyHit(aInString, aInLength, bArg, ":-D",
 746                                  u"😂"_ns,  // laughing, U+1F602
 747                                  outputHTML, glyphTextLen) ||
 748
 749                         SmilyHit(aInString, aInLength, bArg, ":-(",
 750                                  u"🙁"_ns,  // frown, U+1F641
 751                                  outputHTML, glyphTextLen) ||
 752
 753                         SmilyHit(aInString, aInLength, bArg, ":(",
 754                                  u"🙁"_ns,  // frown, U+1F641
 755                                  outputHTML, glyphTextLen) ||
 756
 757                         SmilyHit(aInString, aInLength, bArg, ":$",
 758                                  u"😳"_ns,  // embarassed, U+1F633
 759                                  outputHTML, glyphTextLen) ||
 760
 761                         SmilyHit(aInString, aInLength, bArg, ";-)",
 762                                  u"😉"_ns,  // wink, U+1F609
 763                                  outputHTML, glyphTextLen) ||
 764
 765                         SmilyHit(aInString, aInLength, col0, ";)",
 766                                  u"😉"_ns,  // wink, U+1F609
 767                                  outputHTML, glyphTextLen) ||
 768
 769                         SmilyHit(aInString, aInLength, bArg, ":-\\",
 770                                  u"😕"_ns,  // undecided, U+1F615
 771                                  outputHTML, glyphTextLen) ||
 772
 773                         SmilyHit(aInString, aInLength, bArg, ":-P",
 774                                  u"😛"_ns,  // tongue, U+1F61B
 775                                  outputHTML, glyphTextLen) ||
 776
 777                         SmilyHit(aInString, aInLength, bArg, ";-P",
 778                                  u"😜"_ns,  // winking face with tongue, U+1F61C
 779                                  outputHTML, glyphTextLen) ||
 780
 781                         SmilyHit(aInString, aInLength, bArg, "=-O",
 782                                  u"😮"_ns,  // surprise, U+1F62E
 783                                  outputHTML, glyphTextLen) ||
 784
 785                         SmilyHit(aInString, aInLength, bArg, ":-*",
 786                                  u"😘"_ns,  // kiss, U+1F618
 787                                  outputHTML, glyphTextLen) ||
 788
 789                         SmilyHit(aInString, aInLength, bArg, ">:o",
 790                                  u"🤬"_ns,  // swearing, U+1F92C
 791                                  outputHTML, glyphTextLen) ||
 792
 793                         SmilyHit(aInString, aInLength, bArg, ">:-o",
 794                                  u"🤬"_ns,  // swearing, U+1F92C
 795                                  outputHTML, glyphTextLen) ||
 796
 797                         SmilyHit(aInString, aInLength, bArg, ">:(",
 798                                  u"😠"_ns,  // angry, U+1F620
 799                                  outputHTML, glyphTextLen) ||
 800
 801                         SmilyHit(aInString, aInLength, bArg, ">:-(",
 802                                  u"😠"_ns,  // angry, U+1F620
 803                                  outputHTML, glyphTextLen) ||
 804
 805                         SmilyHit(aInString, aInLength, bArg, "8-)",
 806                                  u"😎"_ns,  // cool, U+1F60E
 807                                  outputHTML, glyphTextLen) ||
 808
 809                         SmilyHit(aInString, aInLength, bArg, ":-$",
 810                                  u"🤑"_ns,  // money, U+1F911
 811                                  outputHTML, glyphTextLen) ||
 812
 813                         SmilyHit(aInString, aInLength, bArg, ":-!",
 814                                  u"😬"_ns,  // foot, U+1F62C
 815                                  outputHTML, glyphTextLen) ||
 816
 817                         SmilyHit(aInString, aInLength, bArg, "O:-)",
 818                                  u"😇"_ns,  // innocent, U+1F607
 819                                  outputHTML, glyphTextLen) ||
 820
 821                         SmilyHit(aInString, aInLength, bArg, ":'(",
 822                                  u"😭"_ns,  // cry, U+1F62D
 823                                  outputHTML, glyphTextLen) ||
 824
 825                         SmilyHit(aInString, aInLength, bArg, ":-X",
 826                                  u"🤐"_ns,  // sealed, U+1F910
 827                                  outputHTML, glyphTextLen))) {
 828       aOutputString.Append(outputHTML);
 829       return true;
 830     }
 831     i++;
 832   }
 833   if (text0 == '\f') {
 834     aOutputString.AppendLiteral("<span class='moz-txt-formfeed'></span>");
 835     glyphTextLen = 1;
 836     return true;
 837   }
 838   if (text0 == '+' || text1 == '+') {
 839     if (ItMatchesDelimited(aInString, aInLength, u" +/-", 4, LT_IGNORE,
 840                            LT_IGNORE)) {
 841       aOutputString.AppendLiteral(" &plusmn;");
 842       glyphTextLen = 4;
 843       return true;
 844     }
 845     if (col0 && ItMatchesDelimited(aInString, aInLength, u"+/-", 3, LT_IGNORE,
 846                                    LT_IGNORE)) {
 847       aOutputString.AppendLiteral("&plusmn;");
 848       glyphTextLen = 3;
 849       return true;
 850     }
 851   }
 852
 853   // x^2  =>  x<sup>2</sup>,   also handle powers x^-2,  x^0.5
 854   // implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/
 855   if (text1 == '^' &&
 856       (IsAsciiDigit(text0) || IsAsciiAlpha(text0) || text0 == ')' ||
 857        text0 == ']' || text0 == '}') &&
 858       ((2 < aInLength && IsAsciiDigit(aInString[2])) ||
 859        (3 < aInLength && aInString[2] == '-' && IsAsciiDigit(aInString[3])))) {
 860     // Find first non-digit
 861     int32_t delimPos = 3;  // skip "^" and first digit (or '-')
 862     for (; delimPos < aInLength &&
 863            (IsAsciiDigit(aInString[delimPos]) ||
 864             (aInString[delimPos] == '.' && delimPos + 1 < aInLength &&
 865              IsAsciiDigit(aInString[delimPos + 1])));
 866          delimPos++) {
 867       ;
 868     }
 869
 870     if (delimPos < aInLength && IsAsciiAlpha(aInString[delimPos])) {
 871       return false;
 872     }
 873
 874     outputHTML.Truncate();
 875     outputHTML += text0;
 876     outputHTML.AppendLiteral(
 877         "<sup class=\"moz-txt-sup\">"
 878         "<span style=\"display:inline-block;width:0;height:0;overflow:hidden\">"
 879         "^</span>");
 880
 881     aOutputString.Append(outputHTML);
 882     aOutputString.Append(&aInString[2], delimPos - 2);
 883     aOutputString.AppendLiteral("</sup>");
 884
 885     glyphTextLen = delimPos /* - 1 + 1 */;
 886     return true;
 887   }
 888   /*
 889    The following strings are not substituted:
 890    |TXT   |HTML     |Reason
 891    +------+---------+----------
 892     ->     &larr;    Bug #454
 893     =>     &lArr;    dito
 894     <-     &rarr;    dito
 895     <=     &rArr;    dito
 896     (tm)   &trade;   dito
 897     1/4    &frac14;  is triggered by 1/4 Part 1, 2/4 Part 2, ...
 898     3/4    &frac34;  dito
 899     1/2    &frac12;  similar
 900   */
 901   return false;
 902 }
 903
 904 /***************************************************************************
 905   Library-internal Interface
 906 ****************************************************************************/
 907
 // XPCOM boilerplate: NS_IMPL_ISUPPORTS supplies the nsISupports methods for
 // mozTXTToHTMLConv and names the interfaces the class answers to.
 908 NS_IMPL_ISUPPORTS(mozTXTToHTMLConv, mozITXTToHTMLConv, nsIStreamConverter,
 909                   nsIThreadRetargetableStreamListener, nsIStreamListener,
 910                   nsIRequestObserver)
 911
 912 int32_t mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line,
 913                                        uint32_t& logLineStart) {
 914   int32_t result = 0;
 915   int32_t lineLength = NS_strlen(line);
 916
 917   bool moreCites = true;
 918   while (moreCites) {
 919     /* E.g. the following lines count as quote:
 920
 921        > text
 922        //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
 923        >text
 924        //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
 925            > text
 926        ] text
 927        USER> text
 928        USER] text
 929        //#endif
 930
 931        logLineStart is the position of "t" in this example
 932     */
 933     uint32_t i = logLineStart;
 934
 935 #ifdef QUOTE_RECOGNITION_AGGRESSIVE
 936     for (; int32_t(i) < lineLength && IsSpace(line[i]); i++)
 937       ;
 938     for (; int32_t(i) < lineLength && IsAsciiAlpha(line[i]) &&
 939            nsCRT::IsUpper(line[i]);
 940          i++)
 941       ;
 942     if (int32_t(i) < lineLength && (line[i] == '>' || line[i] == ']'))
 943 #else
 944     if (int32_t(i) < lineLength && line[i] == '>')
 945 #endif
 946     {
 947       i++;
 948       if (int32_t(i) < lineLength && line[i] == ' ') i++;
 949       // sendmail/mbox
 950       // Placed here for performance increase
 951       const char16_t* indexString = &line[logLineStart];
 952       // here, |logLineStart < lineLength| is always true
 953       uint32_t minlength = std::min(uint32_t(6), NS_strlen(indexString));
 954       if (Substring(indexString, indexString + minlength)
 955               .Equals(Substring(u">From "_ns, 0, minlength),
 956                       nsCaseInsensitiveStringComparator)) {
 957         // XXX RFC2646
 958         moreCites = false;
 959       } else {
 960         result++;
 961         logLineStart = i;
 962       }
 963     } else {
 964       moreCites = false;
 965     }
 966   }
 967
 968   return result;
 969 }
 970
 // Converts the plain text in |aInString| to HTML and appends the result to
 // |aOutString|.
 //
 // |whattodo| is a bit mask: kURLs enables URL recognition, kGlyphSubstitution
 // enables glyph substitution (GlyphHit) and kStructPhrase enables structured
 // phrases delimited by '*', '/', '_' and '|'. At every position the enabled
 // conversions are tried in that order (glyph, structured phrase, URL); text
 // that none of them claims is copied one grapheme cluster at a time, with
 // '<', '>' and '&' routed through EscapeChar.
 //
 // An empty |aInString| truncates |aOutString| and returns NS_OK. Returns
 // NS_ERROR_OUT_OF_MEMORY when the up-front capacity reservation fails and
 // NS_OK otherwise.
 971 NS_IMETHODIMP
 972 mozTXTToHTMLConv::ScanTXT(const nsAString& aInString, uint32_t whattodo,
 973                           nsAString& aOutString) {
 974   if (aInString.Length() == 0) {
 975     aOutString.Truncate();
 976     return NS_OK;
 977   }
 978
       // Reserve room for the expected growth up front; this is the only
       // allocation in this function that is checked for failure.
 979   if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate),
 980                               mozilla::fallible)) {
 981     return NS_ERROR_OUT_OF_MEMORY;
 982   }
 983
 984   bool doURLs = 0 != (whattodo & kURLs);
 985   bool doGlyphSubstitution = 0 != (whattodo & kGlyphSubstitution);
 986   bool doStructPhrase = 0 != (whattodo & kStructPhrase);
 987
 988   uint32_t structPhrase_strong = 0;  // Number of currently open tags
 989   uint32_t structPhrase_underline = 0;
 990   uint32_t structPhrase_italic = 0;
 991   uint32_t structPhrase_code = 0;
 992
       // Length of aOutString right after the most recent URL replacement; a
       // later replacement must not cut back into that region.
 993   uint32_t endOfLastURLOutput = 0;
 994
 995   nsAutoString outputHTML;  // moved here for performance increase
 996
 997   const char16_t* rawInputString = aInString.BeginReading();
 998   uint32_t inLength = aInString.Length();
 999
       // |ci| walks the grapheme-cluster boundaries of the input. |i| is only
       // ever advanced to a position that |ci| reports, via Next() or Seek().
1000   const Span<const char16_t> inString(aInString);
1001   GraphemeClusterBreakIteratorUtf16 ci(inString);
1002   uint32_t i = 0;
1003   while (i < inLength) {
1004     if (doGlyphSubstitution) {
1005       int32_t glyphTextLen;
1006       if (GlyphHit(&rawInputString[i], inLength - i, i == 0, aOutString,
1007                    glyphTextLen)) {
             // glyphTextLen is the number of input code units the glyph
             // covered; resume at the boundary Seek() reports for the last of
             // them (presumably the first boundary after it).
1008         i = *ci.Seek(i + glyphTextLen - 1);
1009         continue;
1010       }
1011     }
1012
1013     if (doStructPhrase) {
           // StructPhraseHit is given a window that starts at the position the
           // reverse iterator reports for the text before |i| (presumably the
           // start of the preceding cluster), or at the start of the input
           // when i == 0.
1014       const char16_t* newOffset = rawInputString;
1015       int32_t newLength = aInString.Length();
1016       if (i > 0)  // skip the first element?
1017       {
1018         GraphemeClusterBreakReverseIteratorUtf16 ri(
1019             Span<const char16_t>(rawInputString, i));
1020         Maybe<uint32_t> nextPos = ri.Next();
1021         newOffset += *nextPos;
1022         newLength -= *nextPos;
1023       }
1024
1025       switch (aInString[i])  // Performance increase
1026       {
1027         case '*':
1028           if (StructPhraseHit(newOffset, newLength, i == 0, u"*", 1, "b",
1029                               "class=\"moz-txt-star\"", aOutString,
1030                               structPhrase_strong)) {
1031             i = *ci.Next();
1032             continue;
1033           }
1034           break;
1035         case '/':
1036           if (StructPhraseHit(newOffset, newLength, i == 0, u"/", 1, "i",
1037                               "class=\"moz-txt-slash\"", aOutString,
1038                               structPhrase_italic)) {
1039             i = *ci.Next();
1040             continue;
1041           }
1042           break;
1043         case '_':
1044           if (StructPhraseHit(newOffset, newLength, i == 0, u"_", 1,
1045                               "span" /* <u> is deprecated */,
1046                               "class=\"moz-txt-underscore\"", aOutString,
1047                               structPhrase_underline)) {
1048             i = *ci.Next();
1049             continue;
1050           }
1051           break;
1052         case '|':
1053           if (StructPhraseHit(newOffset, newLength, i == 0, u"|", 1, "code",
1054                               "class=\"moz-txt-verticalline\"", aOutString,
1055                               structPhrase_code)) {
1056             i = *ci.Next();
1057             continue;
1058           }
1059           break;
1060       }
1061     }
1062
1063     if (doURLs) {
           // FindURL is only consulted at ':', '@' and '.', and only when the
           // character is not next to a plain space on either side.
1064       switch (aInString[i]) {
1065         case ':':
1066         case '@':
1067         case '.':
1068           if ((i == 0 || ((i > 0) && aInString[i - 1] != ' ')) &&
1069               ((i == aInString.Length() - 1) ||
1070                (aInString[i + 1] != ' ')))  // Performance increase
1071           {
1072             int32_t replaceBefore;
1073             int32_t replaceAfter;
                 // A hit is honoured only while no structured-phrase tag is
                 // open. replaceBefore is the number of already-emitted
                 // characters removed from the end of aOutString before the
                 // link HTML is appended; scanning then resumes at the
                 // boundary Seek() reports for i + replaceAfter.
1074             if (FindURL(rawInputString, aInString.Length(), i, whattodo,
1075                         outputHTML, replaceBefore, replaceAfter) &&
1076                 structPhrase_strong + structPhrase_italic +
1077                         structPhrase_underline + structPhrase_code ==
1078                     0
1079                 /* workaround for bug #19445 */) {
1080               // Don't cut into previously inserted HTML (bug 1509493)
1081               if (aOutString.Length() - replaceBefore < endOfLastURLOutput) {
1082                 break;
1083               }
1084               aOutString.Cut(aOutString.Length() - replaceBefore,
1085                              replaceBefore);
1086               aOutString += outputHTML;
1087               endOfLastURLOutput = aOutString.Length();
1088               i = *ci.Seek(i + replaceAfter);
1089               continue;
1090             }
1091           }
1092           break;
1093       }  // switch
1094     }
1095
         // Nothing above claimed this position: emit one grapheme cluster.
1096     switch (aInString[i]) {
1097       // Special symbols
1098       case '<':
1099       case '>':
1100       case '&':
1101         EscapeChar(aInString[i], aOutString, false);
1102         i = *ci.Next();
1103         break;
1104       // Normal characters
1105       default: {
1106         const uint32_t oldIdx = i;
1107         i = *ci.Next();
1108         aOutString.Append(inString.FromTo(oldIdx, i));
1109         break;
1110       }
1111     }
1112   }
1113   return NS_OK;
1114 }
1115
1116 NS_IMETHODIMP
1117 mozTXTToHTMLConv::ScanHTML(const nsAString& input, uint32_t whattodo,
1118                            nsAString& aOutString) {
1119   const nsPromiseFlatString& aInString = PromiseFlatString(input);
1120   if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate),
1121                               mozilla::fallible)) {
1122     return NS_ERROR_OUT_OF_MEMORY;
1123   }
1124
1125   // some common variables we were recalculating
1126   // every time inside the for loop...
1127   int32_t lengthOfInString = aInString.Length();
1128   const char16_t* uniBuffer = aInString.get();
1129
1130 #ifdef DEBUG_BenB_Perf
1131   PRTime parsing_start = PR_IntervalNow();
1132 #endif
1133
1134   // Look for simple entities not included in a tags and scan them.
1135   // Skip all tags ("<[...]>") and content in an a link tag ("<a [...]</a>"),
1136   // comment tag ("<!--[...]-->"), style tag, script tag or head tag.
1137   // Unescape the rest (text between tags) and pass it to ScanTXT.
1138   nsAutoCString canFollow(" \f\n\r\t>");
1139   for (int32_t i = 0; i < lengthOfInString;) {
1140     if (aInString[i] == '<')  // html tag
1141     {
1142       int32_t start = i;
1143       if (i + 2 < lengthOfInString && nsCRT::ToLower(aInString[i + 1]) == 'a' &&
1144           canFollow.FindChar(aInString[i + 2]) != kNotFound)
1145       // if a tag, skip until </a>.
1146       // Make sure there's a white-space character after, not to match "abbr".
1147       {
1148         i = aInString.LowerCaseFindASCII("</a>", i);
1149         if (i == kNotFound) {
1150           i = lengthOfInString;
1151         } else {
1152           i += 4;
1153         }
1154       } else if (Substring(aInString, i + 1, 3).LowerCaseEqualsASCII("!--"))
1155       // if out-commended code, skip until -->
1156       {
1157         i = aInString.Find(u"-->", i);
1158         if (i == kNotFound) {
1159           i = lengthOfInString;
1160         } else {
1161           i += 3;
1162         }
1163       } else if (i + 6 < lengthOfInString &&
1164                  Substring(aInString, i + 1, 5).LowerCaseEqualsASCII("style") &&
1165                  canFollow.FindChar(aInString[i + 6]) != kNotFound)
1166       // if style tag, skip until </style>
1167       {
1168         i = aInString.LowerCaseFindASCII("</style>", i);
1169         if (i == kNotFound) {
1170           i = lengthOfInString;
1171         } else {
1172           i += 8;
1173         }
1174       } else if (i + 7 < lengthOfInString &&
1175                  Substring(aInString, i + 1, 6)
1176                      .LowerCaseEqualsASCII("script") &&
1177                  canFollow.FindChar(aInString[i + 7]) != kNotFound)
1178       // if script tag, skip until </script>
1179       {
1180         i = aInString.LowerCaseFindASCII("</script>", i);
1181         if (i == kNotFound) {
1182           i = lengthOfInString;
1183         } else {
1184           i += 9;
1185         }
1186       } else if (i + 5 < lengthOfInString &&
1187                  Substring(aInString, i + 1, 4).LowerCaseEqualsASCII("head") &&
1188                  canFollow.FindChar(aInString[i + 5]) != kNotFound)
1189       // if head tag, skip until </head>
1190       // Make sure not to match <header>.
1191       {
1192         i = aInString.LowerCaseFindASCII("</head>", i);
1193         if (i == kNotFound) {
1194           i = lengthOfInString;
1195         } else {
1196           i += 7;
1197         }
1198       } else  // just skip tag (attributes etc.)
1199       {
1200         i = aInString.FindChar('>', i);
1201         if (i == kNotFound) {
1202           i = lengthOfInString;
1203         } else {
1204           i++;
1205         }
1206       }
1207       aOutString.Append(&uniBuffer[start], i - start);
1208     } else {
1209       uint32_t start = uint32_t(i);
1210       i = aInString.FindChar('<', i);
1211       if (i == kNotFound) i = lengthOfInString;
1212
1213       nsAutoStringN<256> tempString;
1214       tempString.SetCapacity(uint32_t((uint32_t(i) - start) * growthRate));
1215       UnescapeStr(uniBuffer, start, uint32_t(i) - start, tempString);
1216       ScanTXT(tempString, whattodo, aOutString);
1217     }
1218   }
1219
1220 #ifdef DEBUG_BenB_Perf
1221   printf("ScanHTML time:    %d ms\n",
1222          PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start));
1223 #endif
1224   return NS_OK;
1225 }
1226
1227 /****************************************************************************
1228   XPCOM Interface
1229 *****************************************************************************/
1230
// Unsupported entry point: performs no conversion and always returns
// NS_ERROR_NOT_IMPLEMENTED. None of the arguments is read or written, so
// |_retval| is left untouched.
1231 NS_IMETHODIMP
1232 mozTXTToHTMLConv::Convert(nsIInputStream* aFromStream, const char* aFromType,
1233                           const char* aToType, nsISupports* aCtxt,
1234                           nsIInputStream** _retval) {
1235   return NS_ERROR_NOT_IMPLEMENTED;
1236 }
1237
// Unsupported entry point: no asynchronous conversion is set up and
// |aListener| is never called. Always returns NS_ERROR_NOT_IMPLEMENTED.
1238 NS_IMETHODIMP
1239 mozTXTToHTMLConv::AsyncConvertData(const char* aFromType, const char* aToType,
1240                                    nsIStreamListener* aListener,
1241                                    nsISupports* aCtxt) {
1242   return NS_ERROR_NOT_IMPLEMENTED;
1243 }
1244
// Unsupported entry point: |aToType| is not assigned. Always returns
// NS_ERROR_NOT_IMPLEMENTED.
1245 NS_IMETHODIMP
1246 mozTXTToHTMLConv::GetConvertedType(const nsACString& aFromType,
1247                                    nsIChannel* aChannel, nsACString& aToType) {
1248   return NS_ERROR_NOT_IMPLEMENTED;
1249 }
1250
// Unsupported stream callback: nothing is read from |inStr|. Always returns
// NS_ERROR_NOT_IMPLEMENTED.
1251 NS_IMETHODIMP
1252 mozTXTToHTMLConv::OnDataAvailable(nsIRequest* request, nsIInputStream* inStr,
1253                                   uint64_t sourceOffset, uint32_t count) {
1254   return NS_ERROR_NOT_IMPLEMENTED;
1255 }
1256
// Unsupported stream callback: |aStatus| is ignored. Always returns
// NS_ERROR_NOT_IMPLEMENTED.
1257 NS_IMETHODIMP
1258 mozTXTToHTMLConv::OnDataFinished(nsresult aStatus) {
1259   return NS_ERROR_NOT_IMPLEMENTED;
1260 }
1261
// Unsupported: performs no check and always returns NS_ERROR_NOT_IMPLEMENTED.
1262 NS_IMETHODIMP
1263 mozTXTToHTMLConv::CheckListenerChain() { return NS_ERROR_NOT_IMPLEMENTED; }
1264
// Unsupported: |request| is not touched. Always returns
// NS_ERROR_NOT_IMPLEMENTED.
1265 NS_IMETHODIMP
1266 mozTXTToHTMLConv::MaybeRetarget(nsIRequest* request) {
1267   return NS_ERROR_NOT_IMPLEMENTED;
1268 }
1269
// Unsupported request callback: |request| is ignored. Always returns
// NS_ERROR_NOT_IMPLEMENTED.
1270 NS_IMETHODIMP
1271 mozTXTToHTMLConv::OnStartRequest(nsIRequest* request) {
1272   return NS_ERROR_NOT_IMPLEMENTED;
1273 }
1274
// Unsupported request callback: |request| and |aStatus| are ignored. Always
// returns NS_ERROR_NOT_IMPLEMENTED.
1275 NS_IMETHODIMP
1276 mozTXTToHTMLConv::OnStopRequest(nsIRequest* request, nsresult aStatus) {
1277   return NS_ERROR_NOT_IMPLEMENTED;
1278 }
1279
1280 NS_IMETHODIMP
1281 mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line, uint32_t* logLineStart,
1282                                uint32_t* _retval) {
1283   if (!logLineStart || !_retval || !line) return NS_ERROR_NULL_POINTER;
1284   *_retval = CiteLevelTXT(line, *logLineStart);
1285   return NS_OK;
1286 }
1287
1288 nsresult MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv** aConv) {
1289   MOZ_ASSERT(aConv != nullptr, "null ptr");
1290   if (!aConv) return NS_ERROR_NULL_POINTER;
1291
1292   RefPtr<mozTXTToHTMLConv> conv = new mozTXTToHTMLConv();
1293   conv.forget(aConv);
1294   //    return (*aConv)->Init();
1295   return NS_OK;
1296 }