Backed out changeset 2450366cf7ca (bug 1891629) for causing win msix mochitest failures
[gecko.git] / netwerk / streamconv / converters / mozTXTToHTMLConv.cpp
blobde9927386207da2b3b17f6f0a24405bab436b49d
1 /* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "mozilla/TextUtils.h"
7 #include "mozTXTToHTMLConv.h"
8 #include "mozilla/intl/Segmenter.h"
9 #include "mozilla/Maybe.h"
10 #include "nsIThreadRetargetableStreamListener.h"
11 #include "nsNetUtil.h"
12 #include "nsUnicharUtils.h"
13 #include "nsUnicodeProperties.h"
14 #include "nsCRT.h"
15 #include "nsIExternalProtocolHandler.h"
16 #include "nsIURI.h"
18 #include <algorithm>
20 #ifdef DEBUG_BenB_Perf
21 # include "prtime.h"
22 # include "prinrval.h"
23 #endif
25 using mozilla::IsAscii;
26 using mozilla::IsAsciiAlpha;
27 using mozilla::IsAsciiDigit;
28 using mozilla::Maybe;
29 using mozilla::Some;
30 using mozilla::Span;
31 using mozilla::intl::GraphemeClusterBreakIteratorUtf16;
32 using mozilla::intl::GraphemeClusterBreakReverseIteratorUtf16;
34 const double growthRate = 1.2;
36 // Bug 183111, editor now replaces multiple spaces with leading
37 // 0xA0's and a single ending space, so need to treat 0xA0's as spaces.
38 // 0xA0 is the Latin1/Unicode character for "non-breaking space (nbsp)"
39 // Also recognize the Japanese ideographic space 0x3000 as a space.
40 static inline bool IsSpace(const char16_t aChar) {
41 return (nsCRT::IsAsciiSpace(aChar) || aChar == 0xA0 || aChar == 0x3000);
44 // Escape Char will take ch, escape it and append the result to
45 // aStringToAppendTo
46 void mozTXTToHTMLConv::EscapeChar(const char16_t ch,
47 nsAString& aStringToAppendTo,
48 bool inAttribute) {
49 switch (ch) {
50 case '<':
51 aStringToAppendTo.AppendLiteral("&lt;");
52 break;
53 case '>':
54 aStringToAppendTo.AppendLiteral("&gt;");
55 break;
56 case '&':
57 aStringToAppendTo.AppendLiteral("&amp;");
58 break;
59 case '"':
60 if (inAttribute) {
61 aStringToAppendTo.AppendLiteral("&quot;");
62 break;
64 // else fall through
65 [[fallthrough]];
66 default:
67 aStringToAppendTo += ch;
71 // EscapeStr takes the passed in string and
72 // escapes it IN PLACE.
73 void mozTXTToHTMLConv::EscapeStr(nsString& aInString, bool inAttribute) {
74 // the replace substring routines
75 // don't seem to work if you have a character
76 // in the in string that is also in the replacement
77 // string! =(
78 // aInString.ReplaceSubstring("&", "&amp;");
79 // aInString.ReplaceSubstring("<", "&lt;");
80 // aInString.ReplaceSubstring(">", "&gt;");
81 for (uint32_t i = 0; i < aInString.Length();) {
82 switch (aInString[i]) {
83 case '<':
84 aInString.Cut(i, 1);
85 aInString.InsertLiteral(u"&lt;", i);
86 i += 4; // skip past the integers we just added
87 break;
88 case '>':
89 aInString.Cut(i, 1);
90 aInString.InsertLiteral(u"&gt;", i);
91 i += 4; // skip past the integers we just added
92 break;
93 case '&':
94 aInString.Cut(i, 1);
95 aInString.InsertLiteral(u"&amp;", i);
96 i += 5; // skip past the integers we just added
97 break;
98 case '"':
99 if (inAttribute) {
100 aInString.Cut(i, 1);
101 aInString.InsertLiteral(u"&quot;", i);
102 i += 6;
103 break;
105 // else fall through
106 [[fallthrough]];
107 default:
108 i++;
113 void mozTXTToHTMLConv::UnescapeStr(const char16_t* aInString, int32_t aStartPos,
114 int32_t aLength, nsString& aOutString) {
115 const char16_t* subString = nullptr;
116 for (uint32_t i = aStartPos; int32_t(i) - aStartPos < aLength;) {
117 int32_t remainingChars = i - aStartPos;
118 if (aInString[i] == '&') {
119 subString = &aInString[i];
120 if (!NS_strncmp(subString, u"&lt;",
121 std::min(4, aLength - remainingChars))) {
122 aOutString.Append(char16_t('<'));
123 i += 4;
124 } else if (!NS_strncmp(subString, u"&gt;",
125 std::min(4, aLength - remainingChars))) {
126 aOutString.Append(char16_t('>'));
127 i += 4;
128 } else if (!NS_strncmp(subString, u"&amp;",
129 std::min(5, aLength - remainingChars))) {
130 aOutString.Append(char16_t('&'));
131 i += 5;
132 } else if (!NS_strncmp(subString, u"&quot;",
133 std::min(6, aLength - remainingChars))) {
134 aOutString.Append(char16_t('"'));
135 i += 6;
136 } else {
137 aOutString += aInString[i];
138 i++;
140 } else {
141 aOutString += aInString[i];
142 i++;
147 void mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t* aInString,
148 int32_t aInLength,
149 const uint32_t pos,
150 nsString& aOutString) {
151 NS_ASSERTION(int32_t(pos) < aInLength,
152 "bad args to CompleteAbbreviatedURL, see bug #190851");
153 if (int32_t(pos) >= aInLength) return;
155 if (aInString[pos] == '@') {
156 // only pre-pend a mailto url if the string contains a .domain in it..
157 // i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm"
158 nsDependentString inString(aInString, aInLength);
159 if (inString.FindChar('.', pos) !=
160 kNotFound) // if we have a '.' after the @ sign....
162 aOutString.AssignLiteral("mailto:");
163 aOutString += aInString;
165 } else if (aInString[pos] == '.') {
166 if (ItMatchesDelimited(aInString, aInLength, u"www.", 4, LT_IGNORE,
167 LT_IGNORE)) {
168 aOutString.AssignLiteral("http://");
169 aOutString += aInString;
174 bool mozTXTToHTMLConv::FindURLStart(const char16_t* aInString,
175 int32_t aInLength, const uint32_t pos,
176 const modetype check, uint32_t& start) {
177 switch (check) { // no breaks, because end of blocks is never reached
178 case RFC1738: {
179 if (!NS_strncmp(&aInString[std::max(int32_t(pos - 4), 0)], u"<URL:", 5)) {
180 start = pos + 1;
181 return true;
183 return false;
185 case RFC2396E: {
186 nsDependentSubstring temp(aInString, aInLength);
187 int32_t i = pos <= 0 ? kNotFound : temp.RFindCharInSet(u"<>\"", pos - 1);
188 if (i != kNotFound &&
189 (temp[uint32_t(i)] == '<' || temp[uint32_t(i)] == '"')) {
190 start = uint32_t(++i);
191 return start < pos;
193 return false;
195 case freetext: {
196 int32_t i = pos - 1;
197 for (; i >= 0 &&
198 (IsAsciiAlpha(aInString[uint32_t(i)]) ||
199 IsAsciiDigit(aInString[uint32_t(i)]) ||
200 aInString[uint32_t(i)] == '+' || aInString[uint32_t(i)] == '-' ||
201 aInString[uint32_t(i)] == '.');
202 i--) {
205 if (++i >= 0 && uint32_t(i) < pos &&
206 IsAsciiAlpha(aInString[uint32_t(i)])) {
207 start = uint32_t(i);
208 return true;
210 return false;
212 case abbreviated: {
213 int32_t i = pos - 1;
214 // This disallows non-ascii-characters for email.
215 // Currently correct, but revisit later after standards changed.
216 bool isEmail = aInString[pos] == (char16_t)'@';
217 // These chars mark the start of the URL
218 for (; i >= 0 && aInString[uint32_t(i)] != '>' &&
219 aInString[uint32_t(i)] != '<' && aInString[uint32_t(i)] != '"' &&
220 aInString[uint32_t(i)] != '\'' && aInString[uint32_t(i)] != '`' &&
221 aInString[uint32_t(i)] != ',' && aInString[uint32_t(i)] != '{' &&
222 aInString[uint32_t(i)] != '[' && aInString[uint32_t(i)] != '(' &&
223 aInString[uint32_t(i)] != '|' && aInString[uint32_t(i)] != '\\' &&
224 !IsSpace(aInString[uint32_t(i)]) &&
225 (!isEmail || IsAscii(aInString[uint32_t(i)])) &&
226 (!isEmail || aInString[uint32_t(i)] != ')');
227 i--) {
230 if (++i >= 0 && uint32_t(i) < pos &&
231 (IsAsciiAlpha(aInString[uint32_t(i)]) ||
232 IsAsciiDigit(aInString[uint32_t(i)]))) {
233 start = uint32_t(i);
234 return true;
236 return false;
238 default:
239 return false;
240 } // switch
243 bool mozTXTToHTMLConv::FindURLEnd(const char16_t* aInString,
244 int32_t aInStringLength, const uint32_t pos,
245 const modetype check, const uint32_t start,
246 uint32_t& end) {
247 switch (check) { // no breaks, because end of blocks is never reached
248 case RFC1738:
249 case RFC2396E: {
250 nsDependentSubstring temp(aInString, aInStringLength);
252 int32_t i = temp.FindCharInSet(u"<>\"", pos + 1);
253 if (i != kNotFound &&
254 temp[uint32_t(i--)] ==
255 (check == RFC1738 || temp[start - 1] == '<' ? '>' : '"')) {
256 end = uint32_t(i);
257 return end > pos;
259 return false;
261 case freetext:
262 case abbreviated: {
263 uint32_t i = pos + 1;
264 bool isEmail = aInString[pos] == (char16_t)'@';
265 bool seenOpeningParenthesis = false; // there is a '(' earlier in the URL
266 bool seenOpeningSquareBracket =
267 false; // there is a '[' earlier in the URL
268 for (; int32_t(i) < aInStringLength; i++) {
269 // These chars mark the end of the URL
270 if (aInString[i] == '>' || aInString[i] == '<' || aInString[i] == '"' ||
271 aInString[i] == '`' || aInString[i] == '}' || aInString[i] == '{' ||
272 (aInString[i] == ')' && !seenOpeningParenthesis) ||
273 (aInString[i] == ']' && !seenOpeningSquareBracket) ||
274 // Allow IPv6 adresses like http://[1080::8:800:200C:417A]/foo.
275 (aInString[i] == '[' && i > 2 &&
276 (aInString[i - 1] != '/' || aInString[i - 2] != '/')) ||
277 IsSpace(aInString[i])) {
278 break;
280 // Disallow non-ascii-characters for email.
281 // Currently correct, but revisit later after standards changed.
282 if (isEmail && (aInString[i] == '(' || aInString[i] == '\'' ||
283 !IsAscii(aInString[i]))) {
284 break;
286 if (aInString[i] == '(') seenOpeningParenthesis = true;
287 if (aInString[i] == '[') seenOpeningSquareBracket = true;
289 // These chars are allowed in the middle of the URL, but not at end.
290 // Technically they are, but are used in normal text after the URL.
291 while (--i > pos && (aInString[i] == '.' || aInString[i] == ',' ||
292 aInString[i] == ';' || aInString[i] == '!' ||
293 aInString[i] == '?' || aInString[i] == '-' ||
294 aInString[i] == ':' || aInString[i] == '\'')) {
297 if (i > pos) {
298 end = i;
299 return true;
301 return false;
303 default:
304 return false;
305 } // switch
308 void mozTXTToHTMLConv::CalculateURLBoundaries(
309 const char16_t* aInString, int32_t aInStringLength, const uint32_t pos,
310 const uint32_t whathasbeendone, const modetype check, const uint32_t start,
311 const uint32_t end, nsString& txtURL, nsString& desc,
312 int32_t& replaceBefore, int32_t& replaceAfter) {
313 uint32_t descstart = start;
314 switch (check) {
315 case RFC1738: {
316 descstart = start - 5;
317 desc.Append(&aInString[descstart],
318 end - descstart + 2); // include "<URL:" and ">"
319 replaceAfter = end - pos + 1;
320 } break;
321 case RFC2396E: {
322 descstart = start - 1;
323 desc.Append(&aInString[descstart],
324 end - descstart + 2); // include brackets
325 replaceAfter = end - pos + 1;
326 } break;
327 case freetext:
328 case abbreviated: {
329 descstart = start;
330 desc.Append(&aInString[descstart],
331 end - start + 1); // don't include brackets
332 replaceAfter = end - pos;
333 } break;
334 default:
335 break;
336 } // switch
338 EscapeStr(desc, false);
340 txtURL.Append(&aInString[start], end - start + 1);
341 txtURL.StripWhitespace();
343 // FIX ME
344 nsAutoString temp2;
345 ScanTXT(nsDependentSubstring(&aInString[descstart], pos - descstart),
346 ~kURLs /*prevents loop*/ & whathasbeendone, temp2);
347 replaceBefore = temp2.Length();
350 bool mozTXTToHTMLConv::ShouldLinkify(const nsCString& aURL) {
351 if (!mIOService) return false;
353 nsAutoCString scheme;
354 nsresult rv = mIOService->ExtractScheme(aURL, scheme);
355 if (NS_FAILED(rv)) return false;
357 if (scheme == "http" || scheme == "https" || scheme == "mailto") {
358 return true;
361 // Get the handler for this scheme.
362 nsCOMPtr<nsIProtocolHandler> handler;
363 rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler));
364 if (NS_FAILED(rv)) return false;
366 // Is it an external protocol handler? If not, linkify it.
367 nsCOMPtr<nsIExternalProtocolHandler> externalHandler =
368 do_QueryInterface(handler);
369 if (!externalHandler) return true; // handler is built-in, linkify it!
371 // If external app exists for the scheme then linkify it.
372 bool exists;
373 rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists);
374 return (NS_SUCCEEDED(rv) && exists);
377 bool mozTXTToHTMLConv::CheckURLAndCreateHTML(const nsString& txtURL,
378 const nsString& desc,
379 const modetype mode,
380 nsString& outputHTML) {
381 // Create *uri from txtURL
382 nsCOMPtr<nsIURI> uri;
383 nsresult rv;
384 // Lazily initialize mIOService
385 if (!mIOService) {
386 mIOService = do_GetIOService();
388 if (!mIOService) return false;
391 // See if the url should be linkified.
392 NS_ConvertUTF16toUTF8 utf8URL(txtURL);
393 if (!ShouldLinkify(utf8URL)) return false;
395 // it would be faster if we could just check to see if there is a protocol
396 // handler for the url and return instead of actually trying to create a
397 // url...
398 rv = mIOService->NewURI(utf8URL, nullptr, nullptr, getter_AddRefs(uri));
400 // Real work
401 if (NS_SUCCEEDED(rv) && uri) {
402 outputHTML.AssignLiteral("<a class=\"moz-txt-link-");
403 switch (mode) {
404 case RFC1738:
405 outputHTML.AppendLiteral("rfc1738");
406 break;
407 case RFC2396E:
408 outputHTML.AppendLiteral("rfc2396E");
409 break;
410 case freetext:
411 outputHTML.AppendLiteral("freetext");
412 break;
413 case abbreviated:
414 outputHTML.AppendLiteral("abbreviated");
415 break;
416 default:
417 break;
419 nsAutoString escapedURL(txtURL);
420 EscapeStr(escapedURL, true);
422 outputHTML.AppendLiteral("\" href=\"");
423 outputHTML += escapedURL;
424 outputHTML.AppendLiteral("\">");
425 outputHTML += desc;
426 outputHTML.AppendLiteral("</a>");
427 return true;
429 return false;
432 NS_IMETHODIMP mozTXTToHTMLConv::FindURLInPlaintext(const char16_t* aInString,
433 int32_t aInLength,
434 int32_t aPos,
435 int32_t* aStartPos,
436 int32_t* aEndPos) {
437 // call FindURL on the passed in string
438 nsAutoString outputHTML; // we'll ignore the generated output HTML
440 *aStartPos = -1;
441 *aEndPos = -1;
443 FindURL(aInString, aInLength, aPos, kURLs, outputHTML, *aStartPos, *aEndPos);
445 return NS_OK;
448 bool mozTXTToHTMLConv::FindURL(const char16_t* aInString, int32_t aInLength,
449 const uint32_t pos,
450 const uint32_t whathasbeendone,
451 nsString& outputHTML, int32_t& replaceBefore,
452 int32_t& replaceAfter) {
453 enum statetype { unchecked, invalid, startok, endok, success };
454 static const modetype ranking[] = {RFC1738, RFC2396E, freetext, abbreviated};
456 statetype state[mozTXTToHTMLConv_lastMode + 1]; // 0(=unknown)..lastMode
457 /* I don't like this abuse of enums as index for the array,
458 but I don't know a better method */
460 // Define, which modes to check
461 /* all modes but abbreviated are checked for text[pos] == ':',
462 only abbreviated for '.', RFC2396E and abbreviated for '@' */
463 for (modetype iState = unknown; iState <= mozTXTToHTMLConv_lastMode;
464 iState = modetype(iState + 1)) {
465 state[iState] = aInString[pos] == ':' ? unchecked : invalid;
467 switch (aInString[pos]) {
468 case '@':
469 state[RFC2396E] = unchecked;
470 [[fallthrough]];
471 case '.':
472 state[abbreviated] = unchecked;
473 break;
474 case ':':
475 state[abbreviated] = invalid;
476 break;
477 default:
478 break;
481 // Test, first successful mode wins, sequence defined by |ranking|
482 int32_t iCheck = 0; // the currently tested modetype
483 modetype check = ranking[iCheck];
484 for (; iCheck < mozTXTToHTMLConv_numberOfModes && state[check] != success;
485 iCheck++)
486 /* check state from last run.
487 If this is the first, check this one, which isn't = success yet */
489 check = ranking[iCheck];
491 uint32_t start, end;
493 if (state[check] == unchecked) {
494 if (FindURLStart(aInString, aInLength, pos, check, start)) {
495 state[check] = startok;
499 if (state[check] == startok) {
500 if (FindURLEnd(aInString, aInLength, pos, check, start, end)) {
501 state[check] = endok;
505 if (state[check] == endok) {
506 nsAutoString txtURL, desc;
507 int32_t resultReplaceBefore, resultReplaceAfter;
509 CalculateURLBoundaries(aInString, aInLength, pos, whathasbeendone, check,
510 start, end, txtURL, desc, resultReplaceBefore,
511 resultReplaceAfter);
513 if (aInString[pos] != ':') {
514 nsAutoString temp = txtURL;
515 txtURL.SetLength(0);
516 CompleteAbbreviatedURL(temp.get(), temp.Length(), pos - start, txtURL);
519 if (!txtURL.IsEmpty() &&
520 CheckURLAndCreateHTML(txtURL, desc, check, outputHTML)) {
521 replaceBefore = resultReplaceBefore;
522 replaceAfter = resultReplaceAfter;
523 state[check] = success;
525 } // if
526 } // for
527 return state[check] == success;
530 static inline bool IsAlpha(const uint32_t aChar) {
531 return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kLetter;
534 static inline bool IsDigit(const uint32_t aChar) {
535 return mozilla::unicode::GetGenCategory(aChar) == nsUGenCategory::kNumber;
538 bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t* aInString,
539 int32_t aInLength,
540 const char16_t* rep, int32_t aRepLen,
541 LIMTYPE before, LIMTYPE after) {
542 // this little method gets called a LOT. I found we were spending a
543 // lot of time just calculating the length of the variable "rep"
544 // over and over again every time we called it. So we're now passing
545 // an integer in here.
546 int32_t textLen = aInLength;
548 if (((before == LT_IGNORE && (after == LT_IGNORE || after == LT_DELIMITER)) &&
549 textLen < aRepLen) ||
550 ((before != LT_IGNORE || (after != LT_IGNORE && after != LT_DELIMITER)) &&
551 textLen < aRepLen + 1) ||
552 (before != LT_IGNORE && after != LT_IGNORE && after != LT_DELIMITER &&
553 textLen < aRepLen + 2)) {
554 return false;
557 uint32_t text0 = aInString[0];
558 if (aInLength > 1 && NS_IS_SURROGATE_PAIR(text0, aInString[1])) {
559 text0 = SURROGATE_TO_UCS4(text0, aInString[1]);
561 // find length of the char/cluster to be ignored
562 int32_t ignoreLen = before == LT_IGNORE ? 0 : 1;
563 if (ignoreLen) {
564 GraphemeClusterBreakIteratorUtf16 ci(
565 Span<const char16_t>(aInString, aInLength));
566 ignoreLen = *ci.Next();
569 int32_t afterIndex = aRepLen + ignoreLen;
570 uint32_t textAfterPos = aInString[afterIndex];
571 if (aInLength > afterIndex + 1 &&
572 NS_IS_SURROGATE_PAIR(textAfterPos, aInString[afterIndex + 1])) {
573 textAfterPos = SURROGATE_TO_UCS4(textAfterPos, aInString[afterIndex + 1]);
576 return !((before == LT_ALPHA && !IsAlpha(text0)) ||
577 (before == LT_DIGIT && !IsDigit(text0)) ||
578 (before == LT_DELIMITER &&
579 (IsAlpha(text0) || IsDigit(text0) || text0 == *rep)) ||
580 (after == LT_ALPHA && !IsAlpha(textAfterPos)) ||
581 (after == LT_DIGIT && !IsDigit(textAfterPos)) ||
582 (after == LT_DELIMITER &&
583 (IsAlpha(textAfterPos) || IsDigit(textAfterPos) ||
584 textAfterPos == *rep)) ||
585 !Substring(Substring(aInString, aInString + aInLength), ignoreLen,
586 aRepLen)
587 .Equals(Substring(rep, rep + aRepLen),
588 nsCaseInsensitiveStringComparator));
591 uint32_t mozTXTToHTMLConv::NumberOfMatches(const char16_t* aInString,
592 int32_t aInStringLength,
593 const char16_t* rep, int32_t aRepLen,
594 LIMTYPE before, LIMTYPE after) {
595 uint32_t result = 0;
597 // Limit lookahead length to avoid pathological O(n^2) behavior; looking so
598 // far ahead is unlikely to be important for cases where styling marked-up
599 // fragments is actually useful anyhow.
600 const uint32_t len =
601 std::min(2000u, mozilla::AssertedCast<uint32_t>(aInStringLength));
602 GraphemeClusterBreakIteratorUtf16 ci(Span<const char16_t>(aInString, len));
603 for (uint32_t pos = 0; pos < len; pos = *ci.Next()) {
604 if (ItMatchesDelimited(aInString + pos, aInStringLength - pos, rep, aRepLen,
605 before, after)) {
606 result++;
609 return result;
612 // NOTE: the converted html for the phrase is appended to aOutString
613 // tagHTML and attributeHTML are plain ASCII (literal strings, in fact)
614 bool mozTXTToHTMLConv::StructPhraseHit(
615 const char16_t* aInString, int32_t aInStringLength, bool col0,
616 const char16_t* tagTXT, int32_t aTagTXTLen, const char* tagHTML,
617 const char* attributeHTML, nsAString& aOutString, uint32_t& openTags) {
618 /* We're searching for the following pattern:
619 LT_DELIMITER - "*" - ALPHA -
620 [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER.
621 <strong> is only inserted, if existence of a pair could be verified
622 We use the first opening/closing tag, if we can choose */
624 const char16_t* newOffset = aInString;
625 int32_t newLength = aInStringLength;
626 if (!col0) // skip the first element?
628 newOffset = &aInString[1];
629 newLength = aInStringLength - 1;
632 // opening tag
633 if (ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen,
634 (col0 ? LT_IGNORE : LT_DELIMITER),
635 LT_ALPHA) // is opening tag
636 && NumberOfMatches(newOffset, newLength, tagTXT, aTagTXTLen, LT_ALPHA,
637 LT_DELIMITER) // remaining closing tags
638 > openTags) {
639 openTags++;
640 aOutString.Append('<');
641 aOutString.AppendASCII(tagHTML);
642 aOutString.Append(char16_t(' '));
643 aOutString.AppendASCII(attributeHTML);
644 aOutString.AppendLiteral("><span class=\"moz-txt-tag\">");
645 aOutString.Append(tagTXT);
646 aOutString.AppendLiteral("</span>");
647 return true;
650 // closing tag
651 if (openTags > 0 && ItMatchesDelimited(aInString, aInStringLength, tagTXT,
652 aTagTXTLen, LT_ALPHA, LT_DELIMITER)) {
653 openTags--;
654 aOutString.AppendLiteral("<span class=\"moz-txt-tag\">");
655 aOutString.Append(tagTXT);
656 aOutString.AppendLiteral("</span></");
657 aOutString.AppendASCII(tagHTML);
658 aOutString.Append(char16_t('>'));
659 return true;
662 return false;
665 bool mozTXTToHTMLConv::SmilyHit(const char16_t* aInString, int32_t aLength,
666 bool col0, const char* tagTXT,
667 const nsString& imageName, nsString& outputHTML,
668 int32_t& glyphTextLen) {
669 if (!aInString || !tagTXT || imageName.IsEmpty()) return false;
671 int32_t tagLen = strlen(tagTXT);
673 uint32_t delim = (col0 ? 0 : 1) + tagLen;
675 if ((col0 || IsSpace(aInString[0])) &&
676 (aLength <= int32_t(delim) || IsSpace(aInString[delim]) ||
677 (aLength > int32_t(delim + 1) &&
678 (aInString[delim] == '.' || aInString[delim] == ',' ||
679 aInString[delim] == ';' || aInString[delim] == '8' ||
680 aInString[delim] == '>' || aInString[delim] == '!' ||
681 aInString[delim] == '?') &&
682 IsSpace(aInString[delim + 1]))) &&
683 ItMatchesDelimited(aInString, aLength,
684 NS_ConvertASCIItoUTF16(tagTXT).get(), tagLen,
685 col0 ? LT_IGNORE : LT_DELIMITER, LT_IGNORE)
686 // Note: tests at different pos for LT_IGNORE and LT_DELIMITER
688 if (!col0) {
689 outputHTML.Truncate();
690 outputHTML.Append(char16_t(' '));
693 outputHTML.Append(imageName); // emoji unicode
694 glyphTextLen = (col0 ? 0 : 1) + tagLen;
695 return true;
698 return false;
701 // the glyph is appended to aOutputString instead of the original string...
702 bool mozTXTToHTMLConv::GlyphHit(const char16_t* aInString, int32_t aInLength,
703 bool col0, nsAString& aOutputString,
704 int32_t& glyphTextLen) {
705 char16_t text0 = aInString[0];
706 char16_t text1 = aInString[1];
707 char16_t firstChar = (col0 ? text0 : text1);
709 // temporary variable used to store the glyph html text
710 nsAutoString outputHTML;
711 bool bTestSmilie;
712 bool bArg = false;
713 int i;
715 // refactor some of this mess to avoid code duplication and speed execution a
716 // bit there are two cases that need to be tried one after another. To avoid a
717 // lot of duplicate code, rolling into a loop
719 i = 0;
720 while (i < 2) {
721 bTestSmilie = false;
722 if (!i && (firstChar == ':' || firstChar == ';' || firstChar == '=' ||
723 firstChar == '>' || firstChar == '8' || firstChar == 'O')) {
724 // first test passed
726 bTestSmilie = true;
727 bArg = col0;
729 if (i && col0 &&
730 (text1 == ':' || text1 == ';' || text1 == '=' || text1 == '>' ||
731 text1 == '8' || text1 == 'O')) {
732 // second test passed
734 bTestSmilie = true;
735 bArg = false;
737 if (bTestSmilie && (SmilyHit(aInString, aInLength, bArg, ":-)",
738 u"🙂"_ns, // smile, U+1F642
739 outputHTML, glyphTextLen) ||
741 SmilyHit(aInString, aInLength, bArg, ":)",
742 u"🙂"_ns, // smile, U+1F642
743 outputHTML, glyphTextLen) ||
745 SmilyHit(aInString, aInLength, bArg, ":-D",
746 u"😂"_ns, // laughing, U+1F602
747 outputHTML, glyphTextLen) ||
749 SmilyHit(aInString, aInLength, bArg, ":-(",
750 u"🙁"_ns, // frown, U+1F641
751 outputHTML, glyphTextLen) ||
753 SmilyHit(aInString, aInLength, bArg, ":(",
754 u"🙁"_ns, // frown, U+1F641
755 outputHTML, glyphTextLen) ||
757 SmilyHit(aInString, aInLength, bArg, ":$",
758 u"😳"_ns, // embarassed, U+1F633
759 outputHTML, glyphTextLen) ||
761 SmilyHit(aInString, aInLength, bArg, ";-)",
762 u"😉"_ns, // wink, U+1F609
763 outputHTML, glyphTextLen) ||
765 SmilyHit(aInString, aInLength, col0, ";)",
766 u"😉"_ns, // wink, U+1F609
767 outputHTML, glyphTextLen) ||
769 SmilyHit(aInString, aInLength, bArg, ":-\\",
770 u"😕"_ns, // undecided, U+1F615
771 outputHTML, glyphTextLen) ||
773 SmilyHit(aInString, aInLength, bArg, ":-P",
774 u"😛"_ns, // tongue, U+1F61B
775 outputHTML, glyphTextLen) ||
777 SmilyHit(aInString, aInLength, bArg, ";-P",
778 u"😜"_ns, // winking face with tongue, U+1F61C
779 outputHTML, glyphTextLen) ||
781 SmilyHit(aInString, aInLength, bArg, "=-O",
782 u"😮"_ns, // surprise, U+1F62E
783 outputHTML, glyphTextLen) ||
785 SmilyHit(aInString, aInLength, bArg, ":-*",
786 u"😘"_ns, // kiss, U+1F618
787 outputHTML, glyphTextLen) ||
789 SmilyHit(aInString, aInLength, bArg, ">:o",
790 u"🤬"_ns, // swearing, U+1F92C
791 outputHTML, glyphTextLen) ||
793 SmilyHit(aInString, aInLength, bArg, ">:-o",
794 u"🤬"_ns, // swearing, U+1F92C
795 outputHTML, glyphTextLen) ||
797 SmilyHit(aInString, aInLength, bArg, ">:(",
798 u"😠"_ns, // angry, U+1F620
799 outputHTML, glyphTextLen) ||
801 SmilyHit(aInString, aInLength, bArg, ">:-(",
802 u"😠"_ns, // angry, U+1F620
803 outputHTML, glyphTextLen) ||
805 SmilyHit(aInString, aInLength, bArg, "8-)",
806 u"😎"_ns, // cool, U+1F60E
807 outputHTML, glyphTextLen) ||
809 SmilyHit(aInString, aInLength, bArg, ":-$",
810 u"🤑"_ns, // money, U+1F911
811 outputHTML, glyphTextLen) ||
813 SmilyHit(aInString, aInLength, bArg, ":-!",
814 u"😬"_ns, // foot, U+1F62C
815 outputHTML, glyphTextLen) ||
817 SmilyHit(aInString, aInLength, bArg, "O:-)",
818 u"😇"_ns, // innocent, U+1F607
819 outputHTML, glyphTextLen) ||
821 SmilyHit(aInString, aInLength, bArg, ":'(",
822 u"😭"_ns, // cry, U+1F62D
823 outputHTML, glyphTextLen) ||
825 SmilyHit(aInString, aInLength, bArg, ":-X",
826 u"🤐"_ns, // sealed, U+1F910
827 outputHTML, glyphTextLen))) {
828 aOutputString.Append(outputHTML);
829 return true;
831 i++;
833 if (text0 == '\f') {
834 aOutputString.AppendLiteral("<span class='moz-txt-formfeed'></span>");
835 glyphTextLen = 1;
836 return true;
838 if (text0 == '+' || text1 == '+') {
839 if (ItMatchesDelimited(aInString, aInLength, u" +/-", 4, LT_IGNORE,
840 LT_IGNORE)) {
841 aOutputString.AppendLiteral(" &plusmn;");
842 glyphTextLen = 4;
843 return true;
845 if (col0 && ItMatchesDelimited(aInString, aInLength, u"+/-", 3, LT_IGNORE,
846 LT_IGNORE)) {
847 aOutputString.AppendLiteral("&plusmn;");
848 glyphTextLen = 3;
849 return true;
853 // x^2 => x<sup>2</sup>, also handle powers x^-2, x^0.5
854 // implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/
855 if (text1 == '^' &&
856 (IsAsciiDigit(text0) || IsAsciiAlpha(text0) || text0 == ')' ||
857 text0 == ']' || text0 == '}') &&
858 ((2 < aInLength && IsAsciiDigit(aInString[2])) ||
859 (3 < aInLength && aInString[2] == '-' && IsAsciiDigit(aInString[3])))) {
860 // Find first non-digit
861 int32_t delimPos = 3; // skip "^" and first digit (or '-')
862 for (; delimPos < aInLength &&
863 (IsAsciiDigit(aInString[delimPos]) ||
864 (aInString[delimPos] == '.' && delimPos + 1 < aInLength &&
865 IsAsciiDigit(aInString[delimPos + 1])));
866 delimPos++) {
870 if (delimPos < aInLength && IsAsciiAlpha(aInString[delimPos])) {
871 return false;
874 outputHTML.Truncate();
875 outputHTML += text0;
876 outputHTML.AppendLiteral(
877 "<sup class=\"moz-txt-sup\">"
878 "<span style=\"display:inline-block;width:0;height:0;overflow:hidden\">"
879 "^</span>");
881 aOutputString.Append(outputHTML);
882 aOutputString.Append(&aInString[2], delimPos - 2);
883 aOutputString.AppendLiteral("</sup>");
885 glyphTextLen = delimPos /* - 1 + 1 */;
886 return true;
889 The following strings are not substituted:
890 |TXT |HTML |Reason
891 +------+---------+----------
892 -> &larr; Bug #454
893 => &lArr; dito
894 <- &rarr; dito
895 <= &rArr; dito
896 (tm) &trade; dito
897 1/4 &frac14; is triggered by 1/4 Part 1, 2/4 Part 2, ...
898 3/4 &frac34; dito
899 1/2 &frac12; similar
901 return false;
904 /***************************************************************************
905 Library-internal Interface
906 ****************************************************************************/
908 NS_IMPL_ISUPPORTS(mozTXTToHTMLConv, mozITXTToHTMLConv, nsIStreamConverter,
909 nsIThreadRetargetableStreamListener, nsIStreamListener,
910 nsIRequestObserver)
912 int32_t mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line,
913 uint32_t& logLineStart) {
914 int32_t result = 0;
915 int32_t lineLength = NS_strlen(line);
917 bool moreCites = true;
918 while (moreCites) {
919 /* E.g. the following lines count as quote:
921 > text
922 //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
923 >text
924 //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
925 > text
926 ] text
927 USER> text
928 USER] text
929 //#endif
931 logLineStart is the position of "t" in this example
933 uint32_t i = logLineStart;
935 #ifdef QUOTE_RECOGNITION_AGGRESSIVE
936 for (; int32_t(i) < lineLength && IsSpace(line[i]); i++)
938 for (; int32_t(i) < lineLength && IsAsciiAlpha(line[i]) &&
939 nsCRT::IsUpper(line[i]);
940 i++)
942 if (int32_t(i) < lineLength && (line[i] == '>' || line[i] == ']'))
943 #else
944 if (int32_t(i) < lineLength && line[i] == '>')
945 #endif
947 i++;
948 if (int32_t(i) < lineLength && line[i] == ' ') i++;
949 // sendmail/mbox
950 // Placed here for performance increase
951 const char16_t* indexString = &line[logLineStart];
952 // here, |logLineStart < lineLength| is always true
953 uint32_t minlength = std::min(uint32_t(6), NS_strlen(indexString));
954 if (Substring(indexString, indexString + minlength)
955 .Equals(Substring(u">From "_ns, 0, minlength),
956 nsCaseInsensitiveStringComparator)) {
957 // XXX RFC2646
958 moreCites = false;
959 } else {
960 result++;
961 logLineStart = i;
963 } else {
964 moreCites = false;
968 return result;
971 NS_IMETHODIMP
972 mozTXTToHTMLConv::ScanTXT(const nsAString& aInString, uint32_t whattodo,
973 nsAString& aOutString) {
974 if (aInString.Length() == 0) {
975 aOutString.Truncate();
976 return NS_OK;
979 if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate),
980 mozilla::fallible)) {
981 return NS_ERROR_OUT_OF_MEMORY;
984 bool doURLs = 0 != (whattodo & kURLs);
985 bool doGlyphSubstitution = 0 != (whattodo & kGlyphSubstitution);
986 bool doStructPhrase = 0 != (whattodo & kStructPhrase);
988 uint32_t structPhrase_strong = 0; // Number of currently open tags
989 uint32_t structPhrase_underline = 0;
990 uint32_t structPhrase_italic = 0;
991 uint32_t structPhrase_code = 0;
993 uint32_t endOfLastURLOutput = 0;
995 nsAutoString outputHTML; // moved here for performance increase
997 const char16_t* rawInputString = aInString.BeginReading();
998 uint32_t inLength = aInString.Length();
1000 const Span<const char16_t> inString(aInString);
1001 GraphemeClusterBreakIteratorUtf16 ci(inString);
1002 uint32_t i = 0;
1003 while (i < inLength) {
1004 if (doGlyphSubstitution) {
1005 int32_t glyphTextLen;
1006 if (GlyphHit(&rawInputString[i], inLength - i, i == 0, aOutString,
1007 glyphTextLen)) {
1008 i = *ci.Seek(i + glyphTextLen - 1);
1009 continue;
1013 if (doStructPhrase) {
1014 const char16_t* newOffset = rawInputString;
1015 int32_t newLength = aInString.Length();
1016 if (i > 0) // skip the first element?
1018 GraphemeClusterBreakReverseIteratorUtf16 ri(
1019 Span<const char16_t>(rawInputString, i));
1020 Maybe<uint32_t> nextPos = ri.Next();
1021 newOffset += *nextPos;
1022 newLength -= *nextPos;
1025 switch (aInString[i]) // Performance increase
1027 case '*':
1028 if (StructPhraseHit(newOffset, newLength, i == 0, u"*", 1, "b",
1029 "class=\"moz-txt-star\"", aOutString,
1030 structPhrase_strong)) {
1031 i = *ci.Next();
1032 continue;
1034 break;
1035 case '/':
1036 if (StructPhraseHit(newOffset, newLength, i == 0, u"/", 1, "i",
1037 "class=\"moz-txt-slash\"", aOutString,
1038 structPhrase_italic)) {
1039 i = *ci.Next();
1040 continue;
1042 break;
1043 case '_':
1044 if (StructPhraseHit(newOffset, newLength, i == 0, u"_", 1,
1045 "span" /* <u> is deprecated */,
1046 "class=\"moz-txt-underscore\"", aOutString,
1047 structPhrase_underline)) {
1048 i = *ci.Next();
1049 continue;
1051 break;
1052 case '|':
1053 if (StructPhraseHit(newOffset, newLength, i == 0, u"|", 1, "code",
1054 "class=\"moz-txt-verticalline\"", aOutString,
1055 structPhrase_code)) {
1056 i = *ci.Next();
1057 continue;
1059 break;
1063 if (doURLs) {
1064 switch (aInString[i]) {
1065 case ':':
1066 case '@':
1067 case '.':
1068 if ((i == 0 || ((i > 0) && aInString[i - 1] != ' ')) &&
1069 ((i == aInString.Length() - 1) ||
1070 (aInString[i + 1] != ' '))) // Performance increase
1072 int32_t replaceBefore;
1073 int32_t replaceAfter;
1074 if (FindURL(rawInputString, aInString.Length(), i, whattodo,
1075 outputHTML, replaceBefore, replaceAfter) &&
1076 structPhrase_strong + structPhrase_italic +
1077 structPhrase_underline + structPhrase_code ==
1079 /* workaround for bug #19445 */) {
1080 // Don't cut into previously inserted HTML (bug 1509493)
1081 if (aOutString.Length() - replaceBefore < endOfLastURLOutput) {
1082 break;
1084 aOutString.Cut(aOutString.Length() - replaceBefore,
1085 replaceBefore);
1086 aOutString += outputHTML;
1087 endOfLastURLOutput = aOutString.Length();
1088 i = *ci.Seek(i + replaceAfter);
1089 continue;
1092 break;
1093 } // switch
1096 switch (aInString[i]) {
1097 // Special symbols
1098 case '<':
1099 case '>':
1100 case '&':
1101 EscapeChar(aInString[i], aOutString, false);
1102 i = *ci.Next();
1103 break;
1104 // Normal characters
1105 default: {
1106 const uint32_t oldIdx = i;
1107 i = *ci.Next();
1108 aOutString.Append(inString.FromTo(oldIdx, i));
1109 break;
1113 return NS_OK;
1116 NS_IMETHODIMP
1117 mozTXTToHTMLConv::ScanHTML(const nsAString& input, uint32_t whattodo,
1118 nsAString& aOutString) {
1119 const nsPromiseFlatString& aInString = PromiseFlatString(input);
1120 if (!aOutString.SetCapacity(uint32_t(aInString.Length() * growthRate),
1121 mozilla::fallible)) {
1122 return NS_ERROR_OUT_OF_MEMORY;
1125 // some common variables we were recalculating
1126 // every time inside the for loop...
1127 int32_t lengthOfInString = aInString.Length();
1128 const char16_t* uniBuffer = aInString.get();
1130 #ifdef DEBUG_BenB_Perf
1131 PRTime parsing_start = PR_IntervalNow();
1132 #endif
1134 // Look for simple entities not included in a tags and scan them.
1135 // Skip all tags ("<[...]>") and content in an a link tag ("<a [...]</a>"),
1136 // comment tag ("<!--[...]-->"), style tag, script tag or head tag.
1137 // Unescape the rest (text between tags) and pass it to ScanTXT.
1138 nsAutoCString canFollow(" \f\n\r\t>");
1139 for (int32_t i = 0; i < lengthOfInString;) {
1140 if (aInString[i] == '<') // html tag
1142 int32_t start = i;
1143 if (i + 2 < lengthOfInString && nsCRT::ToLower(aInString[i + 1]) == 'a' &&
1144 canFollow.FindChar(aInString[i + 2]) != kNotFound)
1145 // if a tag, skip until </a>.
1146 // Make sure there's a white-space character after, not to match "abbr".
1148 i = aInString.LowerCaseFindASCII("</a>", i);
1149 if (i == kNotFound) {
1150 i = lengthOfInString;
1151 } else {
1152 i += 4;
1154 } else if (Substring(aInString, i + 1, 3).LowerCaseEqualsASCII("!--"))
1155 // if out-commended code, skip until -->
1157 i = aInString.Find(u"-->", i);
1158 if (i == kNotFound) {
1159 i = lengthOfInString;
1160 } else {
1161 i += 3;
1163 } else if (i + 6 < lengthOfInString &&
1164 Substring(aInString, i + 1, 5).LowerCaseEqualsASCII("style") &&
1165 canFollow.FindChar(aInString[i + 6]) != kNotFound)
1166 // if style tag, skip until </style>
1168 i = aInString.LowerCaseFindASCII("</style>", i);
1169 if (i == kNotFound) {
1170 i = lengthOfInString;
1171 } else {
1172 i += 8;
1174 } else if (i + 7 < lengthOfInString &&
1175 Substring(aInString, i + 1, 6)
1176 .LowerCaseEqualsASCII("script") &&
1177 canFollow.FindChar(aInString[i + 7]) != kNotFound)
1178 // if script tag, skip until </script>
1180 i = aInString.LowerCaseFindASCII("</script>", i);
1181 if (i == kNotFound) {
1182 i = lengthOfInString;
1183 } else {
1184 i += 9;
1186 } else if (i + 5 < lengthOfInString &&
1187 Substring(aInString, i + 1, 4).LowerCaseEqualsASCII("head") &&
1188 canFollow.FindChar(aInString[i + 5]) != kNotFound)
1189 // if head tag, skip until </head>
1190 // Make sure not to match <header>.
1192 i = aInString.LowerCaseFindASCII("</head>", i);
1193 if (i == kNotFound) {
1194 i = lengthOfInString;
1195 } else {
1196 i += 7;
1198 } else // just skip tag (attributes etc.)
1200 i = aInString.FindChar('>', i);
1201 if (i == kNotFound) {
1202 i = lengthOfInString;
1203 } else {
1204 i++;
1207 aOutString.Append(&uniBuffer[start], i - start);
1208 } else {
1209 uint32_t start = uint32_t(i);
1210 i = aInString.FindChar('<', i);
1211 if (i == kNotFound) i = lengthOfInString;
1213 nsAutoStringN<256> tempString;
1214 tempString.SetCapacity(uint32_t((uint32_t(i) - start) * growthRate));
1215 UnescapeStr(uniBuffer, start, uint32_t(i) - start, tempString);
1216 ScanTXT(tempString, whattodo, aOutString);
1220 #ifdef DEBUG_BenB_Perf
1221 printf("ScanHTML time: %d ms\n",
1222 PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start));
1223 #endif
1224 return NS_OK;
/****************************************************************************
  XPCOM Interface
*****************************************************************************/
1231 NS_IMETHODIMP
1232 mozTXTToHTMLConv::Convert(nsIInputStream* aFromStream, const char* aFromType,
1233 const char* aToType, nsISupports* aCtxt,
1234 nsIInputStream** _retval) {
1235 return NS_ERROR_NOT_IMPLEMENTED;
1238 NS_IMETHODIMP
1239 mozTXTToHTMLConv::AsyncConvertData(const char* aFromType, const char* aToType,
1240 nsIStreamListener* aListener,
1241 nsISupports* aCtxt) {
1242 return NS_ERROR_NOT_IMPLEMENTED;
1245 NS_IMETHODIMP
1246 mozTXTToHTMLConv::GetConvertedType(const nsACString& aFromType,
1247 nsIChannel* aChannel, nsACString& aToType) {
1248 return NS_ERROR_NOT_IMPLEMENTED;
1251 NS_IMETHODIMP
1252 mozTXTToHTMLConv::OnDataAvailable(nsIRequest* request, nsIInputStream* inStr,
1253 uint64_t sourceOffset, uint32_t count) {
1254 return NS_ERROR_NOT_IMPLEMENTED;
1257 NS_IMETHODIMP
1258 mozTXTToHTMLConv::OnDataFinished(nsresult aStatus) {
1259 return NS_ERROR_NOT_IMPLEMENTED;
1262 NS_IMETHODIMP
1263 mozTXTToHTMLConv::CheckListenerChain() { return NS_ERROR_NOT_IMPLEMENTED; }
1265 NS_IMETHODIMP
1266 mozTXTToHTMLConv::MaybeRetarget(nsIRequest* request) {
1267 return NS_ERROR_NOT_IMPLEMENTED;
1270 NS_IMETHODIMP
1271 mozTXTToHTMLConv::OnStartRequest(nsIRequest* request) {
1272 return NS_ERROR_NOT_IMPLEMENTED;
1275 NS_IMETHODIMP
1276 mozTXTToHTMLConv::OnStopRequest(nsIRequest* request, nsresult aStatus) {
1277 return NS_ERROR_NOT_IMPLEMENTED;
1280 NS_IMETHODIMP
1281 mozTXTToHTMLConv::CiteLevelTXT(const char16_t* line, uint32_t* logLineStart,
1282 uint32_t* _retval) {
1283 if (!logLineStart || !_retval || !line) return NS_ERROR_NULL_POINTER;
1284 *_retval = CiteLevelTXT(line, *logLineStart);
1285 return NS_OK;
1288 nsresult MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv** aConv) {
1289 MOZ_ASSERT(aConv != nullptr, "null ptr");
1290 if (!aConv) return NS_ERROR_NULL_POINTER;
1292 RefPtr<mozTXTToHTMLConv> conv = new mozTXTToHTMLConv();
1293 conv.forget(aConv);
1294 // return (*aConv)->Init();
1295 return NS_OK;