1 /* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "mozilla/TextUtils.h"
7 #include "mozTXTToHTMLConv.h"
8 #include "mozilla/intl/Segmenter.h"
9 #include "mozilla/Maybe.h"
10 #include "nsIThreadRetargetableStreamListener.h"
11 #include "nsNetUtil.h"
12 #include "nsUnicharUtils.h"
13 #include "nsUnicodeProperties.h"
15 #include "nsIExternalProtocolHandler.h"
20 #ifdef DEBUG_BenB_Perf
22 # include "prinrval.h"
25 using mozilla::IsAscii
;
26 using mozilla::IsAsciiAlpha
;
27 using mozilla::IsAsciiDigit
;
31 using mozilla::intl::GraphemeClusterBreakIteratorUtf16
;
32 using mozilla::intl::GraphemeClusterBreakReverseIteratorUtf16
;
34 const double growthRate
= 1.2;
36 // Bug 183111, editor now replaces multiple spaces with leading
37 // 0xA0's and a single ending space, so need to treat 0xA0's as spaces.
38 // 0xA0 is the Latin1/Unicode character for "non-breaking space (nbsp)"
39 // Also recognize the Japanese ideographic space 0x3000 as a space.
40 static inline bool IsSpace(const char16_t aChar
) {
41 return (nsCRT::IsAsciiSpace(aChar
) || aChar
== 0xA0 || aChar
== 0x3000);
44 // Escape Char will take ch, escape it and append the result to
46 void mozTXTToHTMLConv::EscapeChar(const char16_t ch
,
47 nsAString
& aStringToAppendTo
,
51 aStringToAppendTo
.AppendLiteral("<");
54 aStringToAppendTo
.AppendLiteral(">");
57 aStringToAppendTo
.AppendLiteral("&");
61 aStringToAppendTo
.AppendLiteral(""");
67 aStringToAppendTo
+= ch
;
71 // EscapeStr takes the passed in string and
72 // escapes it IN PLACE.
73 void mozTXTToHTMLConv::EscapeStr(nsString
& aInString
, bool inAttribute
) {
74 // the replace substring routines
75 // don't seem to work if you have a character
76 // in the in string that is also in the replacement
78 // aInString.ReplaceSubstring("&", "&");
79 // aInString.ReplaceSubstring("<", "<");
80 // aInString.ReplaceSubstring(">", ">");
81 for (uint32_t i
= 0; i
< aInString
.Length();) {
82 switch (aInString
[i
]) {
85 aInString
.InsertLiteral(u
"<", i
);
86 i
+= 4; // skip past the integers we just added
90 aInString
.InsertLiteral(u
">", i
);
91 i
+= 4; // skip past the integers we just added
95 aInString
.InsertLiteral(u
"&", i
);
96 i
+= 5; // skip past the integers we just added
101 aInString
.InsertLiteral(u
""", i
);
// UnescapeStr appends to aOutString the aLength code units of aInString that
// start at aStartPos. When a '&' starts one of four recognized entity
// spellings, the character it stands for ('<', '>', '&' or '"') is appended
// instead; any other code unit is appended unchanged.
//
// aInString   buffer that is read; it is indexed directly.
// aStartPos   index of the first code unit to process.
// aLength     number of code units to process, counted from aStartPos.
// aOutString  receives the result; it is appended to, not cleared.
//
// The compare lengths are 4, 4, 5 and 6, so the entity spellings are
// presumably "&lt;", "&gt;", "&amp;" and "&quot;" -- TODO confirm against the
// literal text.
//
// NOTE(review): despite its name, remainingChars holds i - aStartPos, i.e.
// the number of code units already consumed; the number still available is
// aLength - remainingChars.
//
// NOTE(review): every compare length is clamped with std::min to the number
// of code units still available, so a '&' near the end of the range is
// compared on fewer code units than the full entity. If NS_strncmp reports
// equality for such a shortened compare, an incomplete entity (even a lone
// '&') at the end of the range would be unescaped -- confirm.
113 void mozTXTToHTMLConv::UnescapeStr(const char16_t
* aInString
, int32_t aStartPos
,
114 int32_t aLength
, nsString
& aOutString
) {
115 const char16_t
* subString
= nullptr;
116 for (uint32_t i
= aStartPos
; int32_t(i
) - aStartPos
< aLength
;) {
117 int32_t remainingChars
= i
- aStartPos
;
118 if (aInString
[i
] == '&') {
119 subString
= &aInString
[i
];
120 if (!NS_strncmp(subString
, u
"<",
121 std::min(4, aLength
- remainingChars
))) {
122 aOutString
.Append(char16_t('<'));
124 } else if (!NS_strncmp(subString
, u
">",
125 std::min(4, aLength
- remainingChars
))) {
126 aOutString
.Append(char16_t('>'));
128 } else if (!NS_strncmp(subString
, u
"&",
129 std::min(5, aLength
- remainingChars
))) {
130 aOutString
.Append(char16_t('&'));
132 } else if (!NS_strncmp(subString
, u
""",
133 std::min(6, aLength
- remainingChars
))) {
134 aOutString
.Append(char16_t('"'));
137 aOutString
+= aInString
[i
];
141 aOutString
+= aInString
[i
];
147 void mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t
* aInString
,
150 nsString
& aOutString
) {
151 NS_ASSERTION(int32_t(pos
) < aInLength
,
152 "bad args to CompleteAbbreviatedURL, see bug #190851");
153 if (int32_t(pos
) >= aInLength
) return;
155 if (aInString
[pos
] == '@') {
156 // only pre-pend a mailto url if the string contains a .domain in it..
157 // i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm"
158 nsDependentString
inString(aInString
, aInLength
);
159 if (inString
.FindChar('.', pos
) !=
160 kNotFound
) // if we have a '.' after the @ sign....
162 aOutString
.AssignLiteral("mailto:");
163 aOutString
+= aInString
;
165 } else if (aInString
[pos
] == '.') {
166 if (ItMatchesDelimited(aInString
, aInLength
, u
"www.", 4, LT_IGNORE
,
168 aOutString
.AssignLiteral("http://");
169 aOutString
+= aInString
;
174 bool mozTXTToHTMLConv::FindURLStart(const char16_t
* aInString
,
175 int32_t aInLength
, const uint32_t pos
,
176 const modetype check
, uint32_t& start
) {
177 switch (check
) { // no breaks, because end of blocks is never reached
179 if (!NS_strncmp(&aInString
[std::max(int32_t(pos
- 4), 0)], u
"<URL:", 5)) {
186 nsDependentSubstring
temp(aInString
, aInLength
);
187 int32_t i
= pos
<= 0 ? kNotFound
: temp
.RFindCharInSet(u
"<>\"", pos
- 1);
188 if (i
!= kNotFound
&&
189 (temp
[uint32_t(i
)] == '<' || temp
[uint32_t(i
)] == '"')) {
190 start
= uint32_t(++i
);
198 (IsAsciiAlpha(aInString
[uint32_t(i
)]) ||
199 IsAsciiDigit(aInString
[uint32_t(i
)]) ||
200 aInString
[uint32_t(i
)] == '+' || aInString
[uint32_t(i
)] == '-' ||
201 aInString
[uint32_t(i
)] == '.');
205 if (++i
>= 0 && uint32_t(i
) < pos
&&
206 IsAsciiAlpha(aInString
[uint32_t(i
)])) {
214 // This disallows non-ascii-characters for email.
215 // Currently correct, but revisit later after standards changed.
216 bool isEmail
= aInString
[pos
] == (char16_t
)'@';
217 // These chars mark the start of the URL
218 for (; i
>= 0 && aInString
[uint32_t(i
)] != '>' &&
219 aInString
[uint32_t(i
)] != '<' && aInString
[uint32_t(i
)] != '"' &&
220 aInString
[uint32_t(i
)] != '\'' && aInString
[uint32_t(i
)] != '`' &&
221 aInString
[uint32_t(i
)] != ',' && aInString
[uint32_t(i
)] != '{' &&
222 aInString
[uint32_t(i
)] != '[' && aInString
[uint32_t(i
)] != '(' &&
223 aInString
[uint32_t(i
)] != '|' && aInString
[uint32_t(i
)] != '\\' &&
224 !IsSpace(aInString
[uint32_t(i
)]) &&
225 (!isEmail
|| IsAscii(aInString
[uint32_t(i
)])) &&
226 (!isEmail
|| aInString
[uint32_t(i
)] != ')');
230 if (++i
>= 0 && uint32_t(i
) < pos
&&
231 (IsAsciiAlpha(aInString
[uint32_t(i
)]) ||
232 IsAsciiDigit(aInString
[uint32_t(i
)]))) {
243 bool mozTXTToHTMLConv::FindURLEnd(const char16_t
* aInString
,
244 int32_t aInStringLength
, const uint32_t pos
,
245 const modetype check
, const uint32_t start
,
247 switch (check
) { // no breaks, because end of blocks is never reached
250 nsDependentSubstring
temp(aInString
, aInStringLength
);
252 int32_t i
= temp
.FindCharInSet(u
"<>\"", pos
+ 1);
253 if (i
!= kNotFound
&&
254 temp
[uint32_t(i
--)] ==
255 (check
== RFC1738
|| temp
[start
- 1] == '<' ? '>' : '"')) {
263 uint32_t i
= pos
+ 1;
264 bool isEmail
= aInString
[pos
] == (char16_t
)'@';
265 bool seenOpeningParenthesis
= false; // there is a '(' earlier in the URL
266 bool seenOpeningSquareBracket
=
267 false; // there is a '[' earlier in the URL
268 for (; int32_t(i
) < aInStringLength
; i
++) {
269 // These chars mark the end of the URL
270 if (aInString
[i
] == '>' || aInString
[i
] == '<' || aInString
[i
] == '"' ||
271 aInString
[i
] == '`' || aInString
[i
] == '}' || aInString
[i
] == '{' ||
272 (aInString
[i
] == ')' && !seenOpeningParenthesis
) ||
273 (aInString
[i
] == ']' && !seenOpeningSquareBracket
) ||
// Allow IPv6 addresses like http://[1080::8:800:200C:417A]/foo.
275 (aInString
[i
] == '[' && i
> 2 &&
276 (aInString
[i
- 1] != '/' || aInString
[i
- 2] != '/')) ||
277 IsSpace(aInString
[i
])) {
280 // Disallow non-ascii-characters for email.
281 // Currently correct, but revisit later after standards changed.
282 if (isEmail
&& (aInString
[i
] == '(' || aInString
[i
] == '\'' ||
283 !IsAscii(aInString
[i
]))) {
286 if (aInString
[i
] == '(') seenOpeningParenthesis
= true;
287 if (aInString
[i
] == '[') seenOpeningSquareBracket
= true;
289 // These chars are allowed in the middle of the URL, but not at end.
290 // Technically they are, but are used in normal text after the URL.
291 while (--i
> pos
&& (aInString
[i
] == '.' || aInString
[i
] == ',' ||
292 aInString
[i
] == ';' || aInString
[i
] == '!' ||
293 aInString
[i
] == '?' || aInString
[i
] == '-' ||
294 aInString
[i
] == ':' || aInString
[i
] == '\'')) {
308 void mozTXTToHTMLConv::CalculateURLBoundaries(
309 const char16_t
* aInString
, int32_t aInStringLength
, const uint32_t pos
,
310 const uint32_t whathasbeendone
, const modetype check
, const uint32_t start
,
311 const uint32_t end
, nsString
& txtURL
, nsString
& desc
,
312 int32_t& replaceBefore
, int32_t& replaceAfter
) {
313 uint32_t descstart
= start
;
316 descstart
= start
- 5;
317 desc
.Append(&aInString
[descstart
],
318 end
- descstart
+ 2); // include "<URL:" and ">"
319 replaceAfter
= end
- pos
+ 1;
322 descstart
= start
- 1;
323 desc
.Append(&aInString
[descstart
],
324 end
- descstart
+ 2); // include brackets
325 replaceAfter
= end
- pos
+ 1;
330 desc
.Append(&aInString
[descstart
],
331 end
- start
+ 1); // don't include brackets
332 replaceAfter
= end
- pos
;
338 EscapeStr(desc
, false);
340 txtURL
.Append(&aInString
[start
], end
- start
+ 1);
341 txtURL
.StripWhitespace();
345 ScanTXT(nsDependentSubstring(&aInString
[descstart
], pos
- descstart
),
346 ~kURLs
/*prevents loop*/ & whathasbeendone
, temp2
);
347 replaceBefore
= temp2
.Length();
// ShouldLinkify decides whether aURL (a UTF-8 URL string) may be turned into
// a link.
//
// Returns false when mIOService is not set, when the scheme cannot be
// extracted from aURL, or when no protocol handler can be obtained for the
// scheme. The schemes "http", "https" and "mailto" are special-cased before
// any handler lookup. For every other scheme the result is true when the
// handler is not an nsIExternalProtocolHandler; for an external handler it is
// true only if ExternalAppExistsForScheme succeeds and reports an application.
350 bool mozTXTToHTMLConv::ShouldLinkify(const nsCString
& aURL
) {
// Without an IO service nothing can be checked.
351 if (!mIOService
) return false;
353 nsAutoCString scheme
;
354 nsresult rv
= mIOService
->ExtractScheme(aURL
, scheme
);
355 if (NS_FAILED(rv
)) return false;
// Well-known schemes are handled without consulting a protocol handler.
357 if (scheme
== "http" || scheme
== "https" || scheme
== "mailto") {
361 // Get the handler for this scheme.
362 nsCOMPtr
<nsIProtocolHandler
> handler
;
363 rv
= mIOService
->GetProtocolHandler(scheme
.get(), getter_AddRefs(handler
));
364 if (NS_FAILED(rv
)) return false;
366 // Is it an external protocol handler? If not, linkify it.
367 nsCOMPtr
<nsIExternalProtocolHandler
> externalHandler
=
368 do_QueryInterface(handler
);
369 if (!externalHandler
) return true; // handler is built-in, linkify it!
371 // If external app exists for the scheme then linkify it.
373 rv
= externalHandler
->ExternalAppExistsForScheme(scheme
, &exists
);
// A failed query counts as "no application exists".
374 return (NS_SUCCEEDED(rv
) && exists
);
377 bool mozTXTToHTMLConv::CheckURLAndCreateHTML(const nsString
& txtURL
,
378 const nsString
& desc
,
380 nsString
& outputHTML
) {
381 // Create *uri from txtURL
382 nsCOMPtr
<nsIURI
> uri
;
384 // Lazily initialize mIOService
386 mIOService
= do_GetIOService();
388 if (!mIOService
) return false;
391 // See if the url should be linkified.
392 NS_ConvertUTF16toUTF8
utf8URL(txtURL
);
393 if (!ShouldLinkify(utf8URL
)) return false;
395 // it would be faster if we could just check to see if there is a protocol
396 // handler for the url and return instead of actually trying to create a
398 rv
= mIOService
->NewURI(utf8URL
, nullptr, nullptr, getter_AddRefs(uri
));
401 if (NS_SUCCEEDED(rv
) && uri
) {
402 outputHTML
.AssignLiteral("<a class=\"moz-txt-link-");
405 outputHTML
.AppendLiteral("rfc1738");
408 outputHTML
.AppendLiteral("rfc2396E");
411 outputHTML
.AppendLiteral("freetext");
414 outputHTML
.AppendLiteral("abbreviated");
419 nsAutoString
escapedURL(txtURL
);
420 EscapeStr(escapedURL
, true);
422 outputHTML
.AppendLiteral("\" href=\"");
423 outputHTML
+= escapedURL
;
424 outputHTML
.AppendLiteral("\">");
426 outputHTML
.AppendLiteral("</a>");
432 NS_IMETHODIMP
mozTXTToHTMLConv::FindURLInPlaintext(const char16_t
* aInString
,
437 // call FindURL on the passed in string
438 nsAutoString outputHTML
; // we'll ignore the generated output HTML
443 FindURL(aInString
, aInLength
, aPos
, kURLs
, outputHTML
, *aStartPos
, *aEndPos
);
448 bool mozTXTToHTMLConv::FindURL(const char16_t
* aInString
, int32_t aInLength
,
450 const uint32_t whathasbeendone
,
451 nsString
& outputHTML
, int32_t& replaceBefore
,
452 int32_t& replaceAfter
) {
453 enum statetype
{ unchecked
, invalid
, startok
, endok
, success
};
454 static const modetype ranking
[] = {RFC1738
, RFC2396E
, freetext
, abbreviated
};
456 statetype state
[mozTXTToHTMLConv_lastMode
+ 1]; // 0(=unknown)..lastMode
457 /* I don't like this abuse of enums as index for the array,
458 but I don't know a better method */
// Define which modes to check
461 /* all modes but abbreviated are checked for text[pos] == ':',
462 only abbreviated for '.', RFC2396E and abbreviated for '@' */
463 for (modetype iState
= unknown
; iState
<= mozTXTToHTMLConv_lastMode
;
464 iState
= modetype(iState
+ 1)) {
465 state
[iState
] = aInString
[pos
] == ':' ? unchecked
: invalid
;
467 switch (aInString
[pos
]) {
469 state
[RFC2396E
] = unchecked
;
472 state
[abbreviated
] = unchecked
;
475 state
[abbreviated
] = invalid
;
481 // Test, first successful mode wins, sequence defined by |ranking|
482 int32_t iCheck
= 0; // the currently tested modetype
483 modetype check
= ranking
[iCheck
];
484 for (; iCheck
< mozTXTToHTMLConv_numberOfModes
&& state
[check
] != success
;
486 /* check state from last run.
487 If this is the first, check this one, which isn't = success yet */
489 check
= ranking
[iCheck
];
493 if (state
[check
] == unchecked
) {
494 if (FindURLStart(aInString
, aInLength
, pos
, check
, start
)) {
495 state
[check
] = startok
;
499 if (state
[check
] == startok
) {
500 if (FindURLEnd(aInString
, aInLength
, pos
, check
, start
, end
)) {
501 state
[check
] = endok
;
505 if (state
[check
] == endok
) {
506 nsAutoString txtURL
, desc
;
507 int32_t resultReplaceBefore
, resultReplaceAfter
;
509 CalculateURLBoundaries(aInString
, aInLength
, pos
, whathasbeendone
, check
,
510 start
, end
, txtURL
, desc
, resultReplaceBefore
,
513 if (aInString
[pos
] != ':') {
514 nsAutoString temp
= txtURL
;
516 CompleteAbbreviatedURL(temp
.get(), temp
.Length(), pos
- start
, txtURL
);
519 if (!txtURL
.IsEmpty() &&
520 CheckURLAndCreateHTML(txtURL
, desc
, check
, outputHTML
)) {
521 replaceBefore
= resultReplaceBefore
;
522 replaceAfter
= resultReplaceAfter
;
523 state
[check
] = success
;
527 return state
[check
] == success
;
530 static inline bool IsAlpha(const uint32_t aChar
) {
531 return mozilla::unicode::GetGenCategory(aChar
) == nsUGenCategory::kLetter
;
534 static inline bool IsDigit(const uint32_t aChar
) {
535 return mozilla::unicode::GetGenCategory(aChar
) == nsUGenCategory::kNumber
;
538 bool mozTXTToHTMLConv::ItMatchesDelimited(const char16_t
* aInString
,
540 const char16_t
* rep
, int32_t aRepLen
,
541 LIMTYPE before
, LIMTYPE after
) {
542 // this little method gets called a LOT. I found we were spending a
543 // lot of time just calculating the length of the variable "rep"
544 // over and over again every time we called it. So we're now passing
545 // an integer in here.
546 int32_t textLen
= aInLength
;
548 if (((before
== LT_IGNORE
&& (after
== LT_IGNORE
|| after
== LT_DELIMITER
)) &&
549 textLen
< aRepLen
) ||
550 ((before
!= LT_IGNORE
|| (after
!= LT_IGNORE
&& after
!= LT_DELIMITER
)) &&
551 textLen
< aRepLen
+ 1) ||
552 (before
!= LT_IGNORE
&& after
!= LT_IGNORE
&& after
!= LT_DELIMITER
&&
553 textLen
< aRepLen
+ 2)) {
557 uint32_t text0
= aInString
[0];
558 if (aInLength
> 1 && NS_IS_SURROGATE_PAIR(text0
, aInString
[1])) {
559 text0
= SURROGATE_TO_UCS4(text0
, aInString
[1]);
561 // find length of the char/cluster to be ignored
562 int32_t ignoreLen
= before
== LT_IGNORE
? 0 : 1;
564 GraphemeClusterBreakIteratorUtf16
ci(
565 Span
<const char16_t
>(aInString
, aInLength
));
566 ignoreLen
= *ci
.Next();
569 int32_t afterIndex
= aRepLen
+ ignoreLen
;
570 uint32_t textAfterPos
= aInString
[afterIndex
];
571 if (aInLength
> afterIndex
+ 1 &&
572 NS_IS_SURROGATE_PAIR(textAfterPos
, aInString
[afterIndex
+ 1])) {
573 textAfterPos
= SURROGATE_TO_UCS4(textAfterPos
, aInString
[afterIndex
+ 1]);
576 return !((before
== LT_ALPHA
&& !IsAlpha(text0
)) ||
577 (before
== LT_DIGIT
&& !IsDigit(text0
)) ||
578 (before
== LT_DELIMITER
&&
579 (IsAlpha(text0
) || IsDigit(text0
) || text0
== *rep
)) ||
580 (after
== LT_ALPHA
&& !IsAlpha(textAfterPos
)) ||
581 (after
== LT_DIGIT
&& !IsDigit(textAfterPos
)) ||
582 (after
== LT_DELIMITER
&&
583 (IsAlpha(textAfterPos
) || IsDigit(textAfterPos
) ||
584 textAfterPos
== *rep
)) ||
585 !Substring(Substring(aInString
, aInString
+ aInLength
), ignoreLen
,
587 .Equals(Substring(rep
, rep
+ aRepLen
),
588 nsCaseInsensitiveStringComparator
));
591 uint32_t mozTXTToHTMLConv::NumberOfMatches(const char16_t
* aInString
,
592 int32_t aInStringLength
,
593 const char16_t
* rep
, int32_t aRepLen
,
594 LIMTYPE before
, LIMTYPE after
) {
597 // Limit lookahead length to avoid pathological O(n^2) behavior; looking so
598 // far ahead is unlikely to be important for cases where styling marked-up
599 // fragments is actually useful anyhow.
601 std::min(2000u, mozilla::AssertedCast
<uint32_t>(aInStringLength
));
602 GraphemeClusterBreakIteratorUtf16
ci(Span
<const char16_t
>(aInString
, len
));
603 for (uint32_t pos
= 0; pos
< len
; pos
= *ci
.Next()) {
604 if (ItMatchesDelimited(aInString
+ pos
, aInStringLength
- pos
, rep
, aRepLen
,
// NOTE: the converted html for the phrase is appended to aOutString.
// tagHTML and attributeHTML are plain ASCII (literal strings, in fact).
614 bool mozTXTToHTMLConv::StructPhraseHit(
615 const char16_t
* aInString
, int32_t aInStringLength
, bool col0
,
616 const char16_t
* tagTXT
, int32_t aTagTXTLen
, const char* tagHTML
,
617 const char* attributeHTML
, nsAString
& aOutString
, uint32_t& openTags
) {
618 /* We're searching for the following pattern:
619 LT_DELIMITER - "*" - ALPHA -
620 [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER.
621 <strong> is only inserted, if existence of a pair could be verified
622 We use the first opening/closing tag, if we can choose */
624 const char16_t
* newOffset
= aInString
;
625 int32_t newLength
= aInStringLength
;
626 if (!col0
) // skip the first element?
628 newOffset
= &aInString
[1];
629 newLength
= aInStringLength
- 1;
633 if (ItMatchesDelimited(aInString
, aInStringLength
, tagTXT
, aTagTXTLen
,
634 (col0
? LT_IGNORE
: LT_DELIMITER
),
635 LT_ALPHA
) // is opening tag
636 && NumberOfMatches(newOffset
, newLength
, tagTXT
, aTagTXTLen
, LT_ALPHA
,
637 LT_DELIMITER
) // remaining closing tags
640 aOutString
.Append('<');
641 aOutString
.AppendASCII(tagHTML
);
642 aOutString
.Append(char16_t(' '));
643 aOutString
.AppendASCII(attributeHTML
);
644 aOutString
.AppendLiteral("><span class=\"moz-txt-tag\">");
645 aOutString
.Append(tagTXT
);
646 aOutString
.AppendLiteral("</span>");
651 if (openTags
> 0 && ItMatchesDelimited(aInString
, aInStringLength
, tagTXT
,
652 aTagTXTLen
, LT_ALPHA
, LT_DELIMITER
)) {
654 aOutString
.AppendLiteral("<span class=\"moz-txt-tag\">");
655 aOutString
.Append(tagTXT
);
656 aOutString
.AppendLiteral("</span></");
657 aOutString
.AppendASCII(tagHTML
);
658 aOutString
.Append(char16_t('>'));
665 bool mozTXTToHTMLConv::SmilyHit(const char16_t
* aInString
, int32_t aLength
,
666 bool col0
, const char* tagTXT
,
667 const nsString
& imageName
, nsString
& outputHTML
,
668 int32_t& glyphTextLen
) {
669 if (!aInString
|| !tagTXT
|| imageName
.IsEmpty()) return false;
671 int32_t tagLen
= strlen(tagTXT
);
673 uint32_t delim
= (col0
? 0 : 1) + tagLen
;
675 if ((col0
|| IsSpace(aInString
[0])) &&
676 (aLength
<= int32_t(delim
) || IsSpace(aInString
[delim
]) ||
677 (aLength
> int32_t(delim
+ 1) &&
678 (aInString
[delim
] == '.' || aInString
[delim
] == ',' ||
679 aInString
[delim
] == ';' || aInString
[delim
] == '8' ||
680 aInString
[delim
] == '>' || aInString
[delim
] == '!' ||
681 aInString
[delim
] == '?') &&
682 IsSpace(aInString
[delim
+ 1]))) &&
683 ItMatchesDelimited(aInString
, aLength
,
684 NS_ConvertASCIItoUTF16(tagTXT
).get(), tagLen
,
685 col0
? LT_IGNORE
: LT_DELIMITER
, LT_IGNORE
)
686 // Note: tests at different pos for LT_IGNORE and LT_DELIMITER
689 outputHTML
.Truncate();
690 outputHTML
.Append(char16_t(' '));
693 outputHTML
.Append(imageName
); // emoji unicode
694 glyphTextLen
= (col0
? 0 : 1) + tagLen
;
// GlyphHit tries to recognize a text glyph at the start of aInString: an
// ASCII smiley (via SmilyHit), a form feed marker, "+/-", or a power such as
// x^2. On a hit the replacement is appended to aOutputString and the length
// of the matched input text is stored in glyphTextLen.
//
// aInString      text to inspect; when col0 is false the glyph is looked for
//                one code unit in (firstChar is aInString[1]).
// aInLength      number of code units available in aInString.
// col0           true when aInString starts at the very beginning of the
//                scanned text (the caller passes i == 0).
// aOutputString  receives the replacement markup/text (appended).
// glyphTextLen   out: length of the input text that was matched.
//
// Presumably returns true on a hit and false otherwise -- TODO confirm.
701 // the glyph is appended to aOutputString instead of the original string...
702 bool mozTXTToHTMLConv::GlyphHit(const char16_t
* aInString
, int32_t aInLength
,
703 bool col0
, nsAString
& aOutputString
,
704 int32_t& glyphTextLen
) {
705 char16_t text0
= aInString
[0];
// NOTE(review): aInString[1] is read without consulting aInLength; this
// assumes at least two readable code units (e.g. a terminator) -- confirm.
706 char16_t text1
= aInString
[1];
// The first character of the candidate glyph: at index 0 in column 0,
// otherwise at index 1.
707 char16_t firstChar
= (col0
? text0
: text1
);
709 // temporary variable used to store the glyph html text
710 nsAutoString outputHTML
;
715 // Refactored to avoid code duplication and to speed up execution a bit.
716 // There are two cases that need to be tried one after another; to avoid a
717 // lot of duplicate code, they are rolled into a loop.
722 if (!i
&& (firstChar
== ':' || firstChar
== ';' || firstChar
== '=' ||
723 firstChar
== '>' || firstChar
== '8' || firstChar
== 'O')) {
730 (text1
== ':' || text1
== ';' || text1
== '=' || text1
== '>' ||
731 text1
== '8' || text1
== 'O')) {
732 // second test passed
// Try every known smiley in turn; the first one that matches wins.
737 if (bTestSmilie
&& (SmilyHit(aInString
, aInLength
, bArg
, ":-)",
738 u
"🙂"_ns
, // smile, U+1F642
739 outputHTML
, glyphTextLen
) ||
741 SmilyHit(aInString
, aInLength
, bArg
, ":)",
742 u
"🙂"_ns
, // smile, U+1F642
743 outputHTML
, glyphTextLen
) ||
745 SmilyHit(aInString
, aInLength
, bArg
, ":-D",
746 u
"😂"_ns
, // laughing, U+1F602
747 outputHTML
, glyphTextLen
) ||
749 SmilyHit(aInString
, aInLength
, bArg
, ":-(",
750 u
"🙁"_ns
, // frown, U+1F641
751 outputHTML
, glyphTextLen
) ||
753 SmilyHit(aInString
, aInLength
, bArg
, ":(",
754 u
"🙁"_ns
, // frown, U+1F641
755 outputHTML
, glyphTextLen
) ||
757 SmilyHit(aInString
, aInLength
, bArg
, ":$",
758 u
"😳"_ns
, // embarrassed, U+1F633
759 outputHTML
, glyphTextLen
) ||
761 SmilyHit(aInString
, aInLength
, bArg
, ";-)",
762 u
"😉"_ns
, // wink, U+1F609
763 outputHTML
, glyphTextLen
) ||
// NOTE(review): this is the only SmilyHit call in the chain that passes col0
// instead of bArg; it looks like an oversight -- confirm.
765 SmilyHit(aInString
, aInLength
, col0
, ";)",
766 u
"😉"_ns
, // wink, U+1F609
767 outputHTML
, glyphTextLen
) ||
769 SmilyHit(aInString
, aInLength
, bArg
, ":-\\",
770 u
"😕"_ns
, // undecided, U+1F615
771 outputHTML
, glyphTextLen
) ||
773 SmilyHit(aInString
, aInLength
, bArg
, ":-P",
774 u
"😛"_ns
, // tongue, U+1F61B
775 outputHTML
, glyphTextLen
) ||
777 SmilyHit(aInString
, aInLength
, bArg
, ";-P",
778 u
"😜"_ns
, // winking face with tongue, U+1F61C
779 outputHTML
, glyphTextLen
) ||
781 SmilyHit(aInString
, aInLength
, bArg
, "=-O",
782 u
"😮"_ns
, // surprise, U+1F62E
783 outputHTML
, glyphTextLen
) ||
785 SmilyHit(aInString
, aInLength
, bArg
, ":-*",
786 u
"😘"_ns
, // kiss, U+1F618
787 outputHTML
, glyphTextLen
) ||
789 SmilyHit(aInString
, aInLength
, bArg
, ">:o",
790 u
"🤬"_ns
, // swearing, U+1F92C
791 outputHTML
, glyphTextLen
) ||
793 SmilyHit(aInString
, aInLength
, bArg
, ">:-o",
794 u
"🤬"_ns
, // swearing, U+1F92C
795 outputHTML
, glyphTextLen
) ||
797 SmilyHit(aInString
, aInLength
, bArg
, ">:(",
798 u
"😠"_ns
, // angry, U+1F620
799 outputHTML
, glyphTextLen
) ||
801 SmilyHit(aInString
, aInLength
, bArg
, ">:-(",
802 u
"😠"_ns
, // angry, U+1F620
803 outputHTML
, glyphTextLen
) ||
805 SmilyHit(aInString
, aInLength
, bArg
, "8-)",
806 u
"😎"_ns
, // cool, U+1F60E
807 outputHTML
, glyphTextLen
) ||
809 SmilyHit(aInString
, aInLength
, bArg
, ":-$",
810 u
"🤑"_ns
, // money, U+1F911
811 outputHTML
, glyphTextLen
) ||
813 SmilyHit(aInString
, aInLength
, bArg
, ":-!",
814 u
"😬"_ns
, // foot, U+1F62C
815 outputHTML
, glyphTextLen
) ||
817 SmilyHit(aInString
, aInLength
, bArg
, "O:-)",
818 u
"😇"_ns
, // innocent, U+1F607
819 outputHTML
, glyphTextLen
) ||
821 SmilyHit(aInString
, aInLength
, bArg
, ":'(",
822 u
"😭"_ns
, // cry, U+1F62D
823 outputHTML
, glyphTextLen
) ||
825 SmilyHit(aInString
, aInLength
, bArg
, ":-X",
826 u
"🤐"_ns
, // sealed, U+1F910
827 outputHTML
, glyphTextLen
))) {
// SmilyHit filled outputHTML and glyphTextLen; copy the glyph to the output.
828 aOutputString
.Append(outputHTML
);
834 aOutputString
.AppendLiteral("<span class='moz-txt-formfeed'></span>");
// "+/-" handling: cheap character test first, then the exact match.
838 if (text0
== '+' || text1
== '+') {
839 if (ItMatchesDelimited(aInString
, aInLength
, u
" +/-", 4, LT_IGNORE
,
841 aOutputString
.AppendLiteral(" ±");
845 if (col0
&& ItMatchesDelimited(aInString
, aInLength
, u
"+/-", 3, LT_IGNORE
,
847 aOutputString
.AppendLiteral("±");
853 // x^2 => x<sup>2</sup>, also handle powers x^-2, x^0.5
854 // implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/
856 (IsAsciiDigit(text0
) || IsAsciiAlpha(text0
) || text0
== ')' ||
857 text0
== ']' || text0
== '}') &&
858 ((2 < aInLength
&& IsAsciiDigit(aInString
[2])) ||
859 (3 < aInLength
&& aInString
[2] == '-' && IsAsciiDigit(aInString
[3])))) {
860 // Find first non-digit
861 int32_t delimPos
= 3; // skip "^" and first digit (or '-')
862 for (; delimPos
< aInLength
&&
863 (IsAsciiDigit(aInString
[delimPos
]) ||
864 (aInString
[delimPos
] == '.' && delimPos
+ 1 < aInLength
&&
865 IsAsciiDigit(aInString
[delimPos
+ 1])));
// A letter directly after the exponent is checked for here.
870 if (delimPos
< aInLength
&& IsAsciiAlpha(aInString
[delimPos
])) {
874 outputHTML
.Truncate();
876 outputHTML
.AppendLiteral(
877 "<sup class=\"moz-txt-sup\">"
878 "<span style=\"display:inline-block;width:0;height:0;overflow:hidden\">"
881 aOutputString
.Append(outputHTML
);
// The exponent text itself starts at index 2 and runs up to delimPos.
882 aOutputString
.Append(&aInString
[2], delimPos
- 2);
883 aOutputString
.AppendLiteral("</sup>");
885 glyphTextLen
= delimPos
/* - 1 + 1 */;
889 The following strings are not substituted:
891 +------+---------+----------
897 1/4 ¼ is triggered by 1/4 Part 1, 2/4 Part 2, ...
904 /***************************************************************************
905 Library-internal Interface
906 ****************************************************************************/
908 NS_IMPL_ISUPPORTS(mozTXTToHTMLConv
, mozITXTToHTMLConv
, nsIStreamConverter
,
909 nsIThreadRetargetableStreamListener
, nsIStreamListener
,
912 int32_t mozTXTToHTMLConv::CiteLevelTXT(const char16_t
* line
,
913 uint32_t& logLineStart
) {
915 int32_t lineLength
= NS_strlen(line
);
917 bool moreCites
= true;
919 /* E.g. the following lines count as quote:
922 //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
924 //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
931 logLineStart is the position of "t" in this example
933 uint32_t i
= logLineStart
;
935 #ifdef QUOTE_RECOGNITION_AGGRESSIVE
936 for (; int32_t(i
) < lineLength
&& IsSpace(line
[i
]); i
++)
938 for (; int32_t(i
) < lineLength
&& IsAsciiAlpha(line
[i
]) &&
939 nsCRT::IsUpper(line
[i
]);
942 if (int32_t(i
) < lineLength
&& (line
[i
] == '>' || line
[i
] == ']'))
944 if (int32_t(i
) < lineLength
&& line
[i
] == '>')
948 if (int32_t(i
) < lineLength
&& line
[i
] == ' ') i
++;
950 // Placed here for performance increase
951 const char16_t
* indexString
= &line
[logLineStart
];
952 // here, |logLineStart < lineLength| is always true
953 uint32_t minlength
= std::min(uint32_t(6), NS_strlen(indexString
));
954 if (Substring(indexString
, indexString
+ minlength
)
955 .Equals(Substring(u
">From "_ns
, 0, minlength
),
956 nsCaseInsensitiveStringComparator
)) {
972 mozTXTToHTMLConv::ScanTXT(const nsAString
& aInString
, uint32_t whattodo
,
973 nsAString
& aOutString
) {
974 if (aInString
.Length() == 0) {
975 aOutString
.Truncate();
979 if (!aOutString
.SetCapacity(uint32_t(aInString
.Length() * growthRate
),
980 mozilla::fallible
)) {
981 return NS_ERROR_OUT_OF_MEMORY
;
984 bool doURLs
= 0 != (whattodo
& kURLs
);
985 bool doGlyphSubstitution
= 0 != (whattodo
& kGlyphSubstitution
);
986 bool doStructPhrase
= 0 != (whattodo
& kStructPhrase
);
988 uint32_t structPhrase_strong
= 0; // Number of currently open tags
989 uint32_t structPhrase_underline
= 0;
990 uint32_t structPhrase_italic
= 0;
991 uint32_t structPhrase_code
= 0;
993 uint32_t endOfLastURLOutput
= 0;
995 nsAutoString outputHTML
; // moved here for performance increase
997 const char16_t
* rawInputString
= aInString
.BeginReading();
998 uint32_t inLength
= aInString
.Length();
1000 const Span
<const char16_t
> inString(aInString
);
1001 GraphemeClusterBreakIteratorUtf16
ci(inString
);
1003 while (i
< inLength
) {
1004 if (doGlyphSubstitution
) {
1005 int32_t glyphTextLen
;
1006 if (GlyphHit(&rawInputString
[i
], inLength
- i
, i
== 0, aOutString
,
1008 i
= *ci
.Seek(i
+ glyphTextLen
- 1);
1013 if (doStructPhrase
) {
1014 const char16_t
* newOffset
= rawInputString
;
1015 int32_t newLength
= aInString
.Length();
1016 if (i
> 0) // skip the first element?
1018 GraphemeClusterBreakReverseIteratorUtf16
ri(
1019 Span
<const char16_t
>(rawInputString
, i
));
1020 Maybe
<uint32_t> nextPos
= ri
.Next();
1021 newOffset
+= *nextPos
;
1022 newLength
-= *nextPos
;
1025 switch (aInString
[i
]) // Performance increase
1028 if (StructPhraseHit(newOffset
, newLength
, i
== 0, u
"*", 1, "b",
1029 "class=\"moz-txt-star\"", aOutString
,
1030 structPhrase_strong
)) {
1036 if (StructPhraseHit(newOffset
, newLength
, i
== 0, u
"/", 1, "i",
1037 "class=\"moz-txt-slash\"", aOutString
,
1038 structPhrase_italic
)) {
1044 if (StructPhraseHit(newOffset
, newLength
, i
== 0, u
"_", 1,
1045 "span" /* <u> is deprecated */,
1046 "class=\"moz-txt-underscore\"", aOutString
,
1047 structPhrase_underline
)) {
1053 if (StructPhraseHit(newOffset
, newLength
, i
== 0, u
"|", 1, "code",
1054 "class=\"moz-txt-verticalline\"", aOutString
,
1055 structPhrase_code
)) {
1064 switch (aInString
[i
]) {
1068 if ((i
== 0 || ((i
> 0) && aInString
[i
- 1] != ' ')) &&
1069 ((i
== aInString
.Length() - 1) ||
1070 (aInString
[i
+ 1] != ' '))) // Performance increase
1072 int32_t replaceBefore
;
1073 int32_t replaceAfter
;
1074 if (FindURL(rawInputString
, aInString
.Length(), i
, whattodo
,
1075 outputHTML
, replaceBefore
, replaceAfter
) &&
1076 structPhrase_strong
+ structPhrase_italic
+
1077 structPhrase_underline
+ structPhrase_code
==
1079 /* workaround for bug #19445 */) {
1080 // Don't cut into previously inserted HTML (bug 1509493)
1081 if (aOutString
.Length() - replaceBefore
< endOfLastURLOutput
) {
1084 aOutString
.Cut(aOutString
.Length() - replaceBefore
,
1086 aOutString
+= outputHTML
;
1087 endOfLastURLOutput
= aOutString
.Length();
1088 i
= *ci
.Seek(i
+ replaceAfter
);
1096 switch (aInString
[i
]) {
1101 EscapeChar(aInString
[i
], aOutString
, false);
1104 // Normal characters
1106 const uint32_t oldIdx
= i
;
1108 aOutString
.Append(inString
.FromTo(oldIdx
, i
));
1117 mozTXTToHTMLConv::ScanHTML(const nsAString
& input
, uint32_t whattodo
,
1118 nsAString
& aOutString
) {
1119 const nsPromiseFlatString
& aInString
= PromiseFlatString(input
);
1120 if (!aOutString
.SetCapacity(uint32_t(aInString
.Length() * growthRate
),
1121 mozilla::fallible
)) {
1122 return NS_ERROR_OUT_OF_MEMORY
;
1125 // some common variables we were recalculating
1126 // every time inside the for loop...
1127 int32_t lengthOfInString
= aInString
.Length();
1128 const char16_t
* uniBuffer
= aInString
.get();
1130 #ifdef DEBUG_BenB_Perf
1131 PRTime parsing_start
= PR_IntervalNow();
// Look for simple entities that are not inside tags and scan them.
// Skip all tags ("<[...]>") and the content of a link tag ("<a [...]</a>"),
// comment tag ("<!--[...]-->"), style tag, script tag or head tag.
// Unescape the rest (text between tags) and pass it to ScanTXT.
1138 nsAutoCString
canFollow(" \f\n\r\t>");
1139 for (int32_t i
= 0; i
< lengthOfInString
;) {
1140 if (aInString
[i
] == '<') // html tag
1143 if (i
+ 2 < lengthOfInString
&& nsCRT::ToLower(aInString
[i
+ 1]) == 'a' &&
1144 canFollow
.FindChar(aInString
[i
+ 2]) != kNotFound
)
1145 // if a tag, skip until </a>.
1146 // Make sure there's a white-space character after, not to match "abbr".
1148 i
= aInString
.LowerCaseFindASCII("</a>", i
);
1149 if (i
== kNotFound
) {
1150 i
= lengthOfInString
;
1154 } else if (Substring(aInString
, i
+ 1, 3).LowerCaseEqualsASCII("!--"))
// if commented-out code, skip until -->
1157 i
= aInString
.Find(u
"-->", i
);
1158 if (i
== kNotFound
) {
1159 i
= lengthOfInString
;
1163 } else if (i
+ 6 < lengthOfInString
&&
1164 Substring(aInString
, i
+ 1, 5).LowerCaseEqualsASCII("style") &&
1165 canFollow
.FindChar(aInString
[i
+ 6]) != kNotFound
)
1166 // if style tag, skip until </style>
1168 i
= aInString
.LowerCaseFindASCII("</style>", i
);
1169 if (i
== kNotFound
) {
1170 i
= lengthOfInString
;
1174 } else if (i
+ 7 < lengthOfInString
&&
1175 Substring(aInString
, i
+ 1, 6)
1176 .LowerCaseEqualsASCII("script") &&
1177 canFollow
.FindChar(aInString
[i
+ 7]) != kNotFound
)
1178 // if script tag, skip until </script>
1180 i
= aInString
.LowerCaseFindASCII("</script>", i
);
1181 if (i
== kNotFound
) {
1182 i
= lengthOfInString
;
1186 } else if (i
+ 5 < lengthOfInString
&&
1187 Substring(aInString
, i
+ 1, 4).LowerCaseEqualsASCII("head") &&
1188 canFollow
.FindChar(aInString
[i
+ 5]) != kNotFound
)
1189 // if head tag, skip until </head>
1190 // Make sure not to match <header>.
1192 i
= aInString
.LowerCaseFindASCII("</head>", i
);
1193 if (i
== kNotFound
) {
1194 i
= lengthOfInString
;
1198 } else // just skip tag (attributes etc.)
1200 i
= aInString
.FindChar('>', i
);
1201 if (i
== kNotFound
) {
1202 i
= lengthOfInString
;
1207 aOutString
.Append(&uniBuffer
[start
], i
- start
);
1209 uint32_t start
= uint32_t(i
);
1210 i
= aInString
.FindChar('<', i
);
1211 if (i
== kNotFound
) i
= lengthOfInString
;
1213 nsAutoStringN
<256> tempString
;
1214 tempString
.SetCapacity(uint32_t((uint32_t(i
) - start
) * growthRate
));
1215 UnescapeStr(uniBuffer
, start
, uint32_t(i
) - start
, tempString
);
1216 ScanTXT(tempString
, whattodo
, aOutString
);
1220 #ifdef DEBUG_BenB_Perf
1221 printf("ScanHTML time: %d ms\n",
1222 PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start
));
1227 /****************************************************************************
1229 *****************************************************************************/
// Convert is not implemented: none of the arguments are used and
// NS_ERROR_NOT_IMPLEMENTED is always returned.
1232 mozTXTToHTMLConv::Convert(nsIInputStream
* aFromStream
, const char* aFromType
,
1233 const char* aToType
, nsISupports
* aCtxt
,
1234 nsIInputStream
** _retval
) {
1235 return NS_ERROR_NOT_IMPLEMENTED
;
// AsyncConvertData is not implemented: none of the arguments are used and
// NS_ERROR_NOT_IMPLEMENTED is always returned.
1239 mozTXTToHTMLConv::AsyncConvertData(const char* aFromType
, const char* aToType
,
1240 nsIStreamListener
* aListener
,
1241 nsISupports
* aCtxt
) {
1242 return NS_ERROR_NOT_IMPLEMENTED
;
// GetConvertedType is not implemented: aToType is left untouched and
// NS_ERROR_NOT_IMPLEMENTED is always returned.
1246 mozTXTToHTMLConv::GetConvertedType(const nsACString
& aFromType
,
1247 nsIChannel
* aChannel
, nsACString
& aToType
) {
1248 return NS_ERROR_NOT_IMPLEMENTED
;
// OnDataAvailable is not implemented: the stream is not read and
// NS_ERROR_NOT_IMPLEMENTED is always returned.
1252 mozTXTToHTMLConv::OnDataAvailable(nsIRequest
* request
, nsIInputStream
* inStr
,
1253 uint64_t sourceOffset
, uint32_t count
) {
1254 return NS_ERROR_NOT_IMPLEMENTED
;
// OnDataFinished is not implemented: aStatus is ignored and
// NS_ERROR_NOT_IMPLEMENTED is always returned.
1258 mozTXTToHTMLConv::OnDataFinished(nsresult aStatus
) {
1259 return NS_ERROR_NOT_IMPLEMENTED
;
// CheckListenerChain is not implemented: it always returns
// NS_ERROR_NOT_IMPLEMENTED.
1263 mozTXTToHTMLConv::CheckListenerChain() { return NS_ERROR_NOT_IMPLEMENTED
; }
// MaybeRetarget is not implemented: request is ignored and
// NS_ERROR_NOT_IMPLEMENTED is always returned.
1266 mozTXTToHTMLConv::MaybeRetarget(nsIRequest
* request
) {
1267 return NS_ERROR_NOT_IMPLEMENTED
;
// OnStartRequest is not implemented: request is ignored and
// NS_ERROR_NOT_IMPLEMENTED is always returned.
1271 mozTXTToHTMLConv::OnStartRequest(nsIRequest
* request
) {
1272 return NS_ERROR_NOT_IMPLEMENTED
;
// OnStopRequest is not implemented: request and aStatus are ignored and
// NS_ERROR_NOT_IMPLEMENTED is always returned.
1276 mozTXTToHTMLConv::OnStopRequest(nsIRequest
* request
, nsresult aStatus
) {
1277 return NS_ERROR_NOT_IMPLEMENTED
;
// Pointer-based overload of CiteLevelTXT. It validates its arguments and then
// forwards to the CiteLevelTXT(const char16_t*, uint32_t&) overload.
//
// line          text of the line to examine; must not be null.
// logLineStart  in/out position within the line; must not be null. It is
//               handed to the other overload by reference, so that overload
//               may update it.
// _retval       out: receives the value returned by the other overload; must
//               not be null.
//
// Returns NS_ERROR_NULL_POINTER if any of the three pointers is null.
1281 mozTXTToHTMLConv::CiteLevelTXT(const char16_t
* line
, uint32_t* logLineStart
,
1282 uint32_t* _retval
) {
1283 if (!logLineStart
|| !_retval
|| !line
) return NS_ERROR_NULL_POINTER
;
1284 *_retval
= CiteLevelTXT(line
, *logLineStart
);
1288 nsresult
MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv
** aConv
) {
1289 MOZ_ASSERT(aConv
!= nullptr, "null ptr");
1290 if (!aConv
) return NS_ERROR_NULL_POINTER
;
1292 RefPtr
<mozTXTToHTMLConv
> conv
= new mozTXTToHTMLConv();
1294 // return (*aConv)->Init();