1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
8 * nsIContentSerializer implementation that can be used with an
9 * nsIDocumentEncoder to convert a DOM into plaintext in a nice way
10 * (eg for copy/paste as plaintext).
13 #include "nsPlainTextSerializer.h"
14 #include "nsIServiceManager.h"
15 #include "nsGkAtoms.h"
16 #include "nsNameSpaceManager.h"
17 #include "nsTextFragment.h"
18 #include "nsContentUtils.h"
19 #include "nsReadableUtils.h"
20 #include "nsUnicharUtils.h"
22 #include "mozilla/dom/Element.h"
23 #include "mozilla/Preferences.h"
24 #include "mozilla/BinarySearch.h"
25 #include "nsComputedDOMStyle.h"
31 using namespace mozilla
;
32 using namespace mozilla::dom
;
34 #define PREF_STRUCTS "converter.html2txt.structs"
35 #define PREF_HEADER_STRATEGY "converter.html2txt.header_strategy"
36 #define PREF_ALWAYS_INCLUDE_RUBY "converter.html2txt.always_include_ruby"
38 static const int32_t kTabSize
= 4;
39 static const int32_t kIndentSizeHeaders
= 2; /* Indention of h1, if
40 mHeaderStrategy = 1 or = 2.
41 Indention of other headers
44 static const int32_t kIndentIncrementHeaders
= 2; /* If mHeaderStrategy = 1,
45 indent h(x+1) this many
46 columns more than h(x) */
47 static const int32_t kIndentSizeList
= kTabSize
;
48 // Indention of non-first lines of ul and ol
49 static const int32_t kIndentSizeDD
= kTabSize
; // Indention of <dd>
50 static const char16_t kNBSP
= 160;
51 static const char16_t kSPACE
= ' ';
// Forward declarations of file-local helpers; their definitions are not in
// the part of the file visible here.
53 static int32_t HeaderLevel(nsAtom
* aTag
);
54 static int32_t GetUnicharWidth(char16_t ucs
);
55 static int32_t GetUnicharStringWidth(const char16_t
* pwcs
, int32_t n
);
// Capacities of the mTagStack and mOLStack arrays allocated in the
// constructor.
57 // Someday may want to make this non-const:
58 static const uint32_t TagStackSize
= 500;
59 static const uint32_t OLStackSize
= 100;
// gPreferenceInitialized is set to true by the constructor after it calls
// Preferences::AddBoolVarCache for gAlwaysIncludeRuby with
// PREF_ALWAYS_INCLUDE_RUBY.
61 static bool gPreferenceInitialized
= false;
62 static bool gAlwaysIncludeRuby
= false;
// Cycle-collection and interface-map macro block for nsPlainTextSerializer.
// The interface map lists nsIContentSerializer and nsISupports; mElement is
// the only member passed to NS_IMPL_CYCLE_COLLECTION.
64 NS_IMPL_CYCLE_COLLECTING_ADDREF(nsPlainTextSerializer
)
65 NS_IMPL_CYCLE_COLLECTING_RELEASE(nsPlainTextSerializer
)
67 NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsPlainTextSerializer
)
68 NS_INTERFACE_MAP_ENTRY(nsIContentSerializer
)
69 NS_INTERFACE_MAP_ENTRY(nsISupports
)
72 NS_IMPL_CYCLE_COLLECTION(nsPlainTextSerializer
, mElement
)
// Factory function: allocates a new nsPlainTextSerializer, holds it in a
// RefPtr, and transfers that reference to *aSerializer via forget().
74 nsresult
NS_NewPlainTextSerializer(nsIContentSerializer
** aSerializer
) {
75 RefPtr
<nsPlainTextSerializer
> it
= new nsPlainTextSerializer();
76 it
.forget(aSerializer
);
// Constructor: puts the serializer into its initial state (no output string,
// at first column, default wrap column 72), allocates the tag and
// ordered-list stacks, and registers the ruby pref cache on first use.
80 nsPlainTextSerializer::nsPlainTextSerializer()
84 kSpace(NS_LITERAL_STRING(" ")) // Init of "constant"
86 mOutputString
= nullptr;
88 mAtFirstColumn
= true;
91 mStructs
= true; // will be read from prefs later
92 mHeaderStrategy
= 1 /*indent increasingly*/; // ditto
93 mHasWrittenCiteBlockquote
= false;
// Zero the header counters for indices 0 through 6.
95 for (int32_t i
= 0; i
<= 6; i
++) {
96 mHeaderCounter
[i
] = 0;
100 mWrapColumn
= 72; // XXX magic number, we expect someone to reset this
101 mCurrentLineWidth
= 0;
104 mEmptyLines
= 1; // The start of the document is an "empty line" in itself,
105 mInWhitespace
= false;
106 mPreFormattedMail
= false;
107 mStartedOutput
= false;
109 mPreformattedBlockBoundary
= false;
110 mWithRubyAnnotation
= false; // will be read from pref and flag later
112 // initialize the tag stack to zero:
113 // The stack only ever contains pointers to static atoms, so they don't
115 mTagStack
= new nsAtom
*[TagStackSize
];
// kNotFound is the "not ignoring anything" sentinel; other methods compare
// mIgnoreAboveIndex against (uint32_t)kNotFound before producing output.
117 mIgnoreAboveIndex
= (uint32_t)kNotFound
;
119 // initialize the OL stack, where numbers for ordered lists are kept
120 mOLStack
= new int32_t[OLStackSize
];
125 mIgnoredChildNodeLevel
= 0;
// Register the bool var cache for the ruby pref only while the file-level
// gPreferenceInitialized flag is still false, then set the flag.
127 if (!gPreferenceInitialized
) {
128 Preferences::AddBoolVarCache(&gAlwaysIncludeRuby
, PREF_ALWAYS_INCLUDE_RUBY
,
130 gPreferenceInitialized
= true;
// Destructor. The visible part only warns (NS_WARNING_ASSERTION) if
// mHeadLevel is not 0 at destruction time.
134 nsPlainTextSerializer::~nsPlainTextSerializer() {
137 NS_WARNING_ASSERTION(mHeadLevel
== 0, "Wrong head level!");
// Configures the serializer for an encoding run.
//
// Visible behavior: asserts that the flag combination is sane (format=flowed
// requires OutputFormatted; OutputFormatted excludes OutputPreformatted),
// sets *aNeedsPreformatScanning to true, stores aWrapColumn in mWrapColumn,
// obtains a line breaker when both MayWrap() and MayBreakLines() hold,
// selects the line-break string from the CR/LF flags, and, for formatted
// output, reads the PREF_STRUCTS and PREF_HEADER_STRATEGY prefs.
141 nsPlainTextSerializer::Init(uint32_t aFlags
, uint32_t aWrapColumn
,
142 const Encoding
* aEncoding
, bool aIsCopying
,
143 bool aIsWholeDocument
,
144 bool* aNeedsPreformatScanning
) {
146 // Check if the major control flags are set correctly.
147 if (aFlags
& nsIDocumentEncoder::OutputFormatFlowed
) {
148 NS_ASSERTION(aFlags
& nsIDocumentEncoder::OutputFormatted
,
149 "If you want format=flowed, you must combine it with "
150 "nsIDocumentEncoder::OutputFormatted");
153 if (aFlags
& nsIDocumentEncoder::OutputFormatted
) {
155 !(aFlags
& nsIDocumentEncoder::OutputPreformatted
),
156 "Can't do formatted and preformatted output at the same time!");
160 *aNeedsPreformatScanning
= true;
162 mWrapColumn
= aWrapColumn
;
164 // Only create a linebreaker if we will handle wrapping.
165 if (MayWrap() && MayBreakLines()) {
166 mLineBreaker
= nsContentUtils::LineBreaker();
// Both CR and LF flags -> "\r\n"; CR only -> "\r"; LF only -> "\n";
// the remaining assignment below uses the NS_LINEBREAK macro.
169 // Set the line break character:
170 if ((mFlags
& nsIDocumentEncoder::OutputCRLineBreak
) &&
171 (mFlags
& nsIDocumentEncoder::OutputLFLineBreak
)) {
173 mLineBreak
.AssignLiteral("\r\n");
174 } else if (mFlags
& nsIDocumentEncoder::OutputCRLineBreak
) {
176 mLineBreak
.Assign(char16_t('\r'));
177 } else if (mFlags
& nsIDocumentEncoder::OutputLFLineBreak
) {
179 mLineBreak
.Assign(char16_t('\n'));
182 mLineBreak
.AssignLiteral(NS_LINEBREAK
);
185 mLineBreakDue
= false;
188 mPreformattedBlockBoundary
= false;
190 if (mFlags
& nsIDocumentEncoder::OutputFormatted
) {
191 // Get some prefs that controls how we do formatted output
192 mStructs
= Preferences::GetBool(PREF_STRUCTS
, mStructs
);
195 Preferences::GetInt(PREF_HEADER_STRATEGY
, mHeaderStrategy
);
198 // The pref is default inited to false in libpref, but we use true
199 // as fallback value because we don't want to affect behavior in
200 // other places which use this serializer currently.
// Ruby annotations are kept if either the cached pref or the caller's
// OutputRubyAnnotation flag asks for them.
201 mWithRubyAnnotation
=
202 gAlwaysIncludeRuby
|| (mFlags
& nsIDocumentEncoder::OutputRubyAnnotation
);
204 // XXX We should let the caller decide whether to do this or not
205 mFlags
&= ~nsIDocumentEncoder::OutputNoFramesContent
;
// Returns the last (top) element of aStack.
// NOTE(review): the handling of an empty stack is not visible here.
210 bool nsPlainTextSerializer::GetLastBool(const nsTArray
<bool>& aStack
) {
211 uint32_t size
= aStack
.Length();
215 return aStack
.ElementAt(size
- 1);
// Overwrites the last (top) element of aStack with aValue.
// The NS_ERROR below reports the case where there is no last element; the
// condition selecting between the two paths is not visible here.
218 void nsPlainTextSerializer::SetLastBool(nsTArray
<bool>& aStack
, bool aValue
) {
219 uint32_t size
= aStack
.Length();
221 aStack
.ElementAt(size
- 1) = aValue
;
223 NS_ERROR("There is no \"Last\" value");
// Pushes aValue onto aStack by appending it to the array.
227 void nsPlainTextSerializer::PushBool(nsTArray
<bool>& aStack
, bool aValue
) {
228 aStack
.AppendElement(bool(aValue
));
// Removes the last element of aStack and captures its value in returnValue,
// which starts out as false.
// NOTE(review): the emptiness guard and the return statement are not visible
// here.
231 bool nsPlainTextSerializer::PopBool(nsTArray
<bool>& aStack
) {
232 bool returnValue
= false;
233 uint32_t size
= aStack
.Length();
235 returnValue
= aStack
.ElementAt(size
- 1);
236 aStack
.RemoveElementAt(size
- 1);
// Decides whether a container element should be serialized as a placeholder
// character. With OutputNonTextContentAsPlaceholder set, the answer is true
// for audio, canvas, iframe, meter, progress, object, svg and video tags.
// NOTE(review): the parameter list and the early-out taken when the flag is
// not set are not visible here.
241 bool nsPlainTextSerializer::ShouldReplaceContainerWithPlaceholder(
243 // If nsIDocumentEncoder::OutputNonTextContentAsPlaceholder is set,
244 // non-textual container element should be serialized as placeholder
245 // character and its child nodes should be ignored. See bug 895239.
246 if (!(mFlags
& nsIDocumentEncoder::OutputNonTextContentAsPlaceholder
)) {
250 return (aTag
== nsGkAtoms::audio
) || (aTag
== nsGkAtoms::canvas
) ||
251 (aTag
== nsGkAtoms::iframe
) || (aTag
== nsGkAtoms::meter
) ||
252 (aTag
== nsGkAtoms::progress
) || (aTag
== nsGkAtoms::object
) ||
253 (aTag
== nsGkAtoms::svg
) || (aTag
== nsGkAtoms::video
);
// Tests whether aTag is one of the ruby annotation tags rp, rt or rtc.
// NOTE(review): the body of the mWithRubyAnnotation branch is not visible
// here; presumably it returns false so that annotations are kept.
256 bool nsPlainTextSerializer::IsIgnorableRubyAnnotation(nsAtom
* aTag
) {
257 if (mWithRubyAnnotation
) {
261 return aTag
== nsGkAtoms::rp
|| aTag
== nsGkAtoms::rt
||
262 aTag
== nsGkAtoms::rtc
;
265 // Return true if aElement has 'display:none' or if we just don't know.
// "Don't know" is the case where GetComputedStyleNoFlush returns null.
266 static bool IsDisplayNone(Element
* aElement
) {
267 RefPtr
<ComputedStyle
> computedStyle
=
268 nsComputedDOMStyle::GetComputedStyleNoFlush(aElement
, nullptr);
269 return !computedStyle
||
270 computedStyle
->StyleDisplay()->mDisplay
== StyleDisplay::None
;
// Returns true when aElement is an HTML <script> or <style> element for
// which IsDisplayNone() is true.
273 static bool IsIgnorableScriptOrStyle(Element
* aElement
) {
274 return aElement
->IsAnyOfHTMLElements(nsGkAtoms::script
, nsGkAtoms::style
) &&
275 IsDisplayNone(aElement
);
// Serializes the text of aText from aStartOffset up to aEndOffset into aStr.
//
// aEndOffset == -1 means "to the end of the fragment"; otherwise the end is
// clamped to the fragment length. The text is copied into a temporary string
// and split at '\n' / '\r' so that each break is emitted as mLineBreak.
//
// Visible error returns: NS_ERROR_INVALID_ARG for a negative aStartOffset,
// NS_ERROR_FAILURE when there is no content or no text fragment.
279 nsPlainTextSerializer::AppendText(nsIContent
* aText
, int32_t aStartOffset
,
280 int32_t aEndOffset
, nsAString
& aStr
) {
// A set mIgnoreAboveIndex means we are inside content being ignored.
281 if (mIgnoreAboveIndex
!= (uint32_t)kNotFound
) {
285 NS_ASSERTION(aStartOffset
>= 0, "Negative start offset for text fragment!");
286 if (aStartOffset
< 0) return NS_ERROR_INVALID_ARG
;
288 NS_ENSURE_ARG(aText
);
292 nsIContent
* content
= aText
;
293 const nsTextFragment
* frag
;
294 if (!content
|| !(frag
= content
->GetText())) {
295 return NS_ERROR_FAILURE
;
298 int32_t fragLength
= frag
->GetLength();
300 (aEndOffset
== -1) ? fragLength
: std::min(aEndOffset
, fragLength
);
301 NS_ASSERTION(aStartOffset
<= endoffset
,
302 "A start offset is beyond the end of the text fragment!");
304 int32_t length
= endoffset
- aStartOffset
;
309 nsAutoString textstr
;
// Copy the requested range out of the fragment: Get2b() is the UTF-16
// accessor, Get1b() the single-byte one.
311 textstr
.Assign(frag
->Get2b() + aStartOffset
, length
);
313 // AssignASCII is for 7-bit character only, so don't use it
314 const char* data
= frag
->Get1b();
315 CopyASCIItoUTF16(Substring(data
+ aStartOffset
, data
+ endoffset
), textstr
);
// mOutputString is pointed at aStr here and reset to nullptr at the end.
318 mOutputString
= &aStr
;
320 // We have to split the string across newlines
321 // to match parser behavior
323 int32_t offset
= textstr
.FindCharInSet("\n\r");
324 while (offset
!= kNotFound
) {
325 if (offset
> start
) {
327 DoAddText(false, Substring(textstr
, start
, offset
- start
));
331 DoAddText(true, mLineBreak
);
334 offset
= textstr
.FindCharInSet("\n\r", start
);
337 // Consume the last bit of the string if there's any left
338 if (start
< length
) {
340 DoAddText(false, Substring(textstr
, start
, length
- start
));
342 DoAddText(false, textstr
);
346 mOutputString
= nullptr;
// CDATA sections are serialized exactly like text nodes: this forwards all
// arguments to AppendText() and returns its result.
352 nsPlainTextSerializer::AppendCDATASection(nsIContent
* aCDATASection
,
353 int32_t aStartOffset
,
354 int32_t aEndOffset
, nsAString
& aStr
) {
355 return AppendText(aCDATASection
, aStartOffset
, aEndOffset
, aStr
);
// Pushes the result of IsElementPreformatted(aElement) onto mPreformatStack.
// The matching pop is done by ForgetElementForPreformat().
359 nsPlainTextSerializer::ScanElementForPreformat(Element
* aElement
) {
360 mPreformatStack
.push(IsElementPreformatted(aElement
));
// Pops the top entry of mPreformatStack. Release-asserts that the stack is
// not empty, i.e. that a push happened first.
365 nsPlainTextSerializer::ForgetElementForPreformat(Element
* aElement
) {
366 MOZ_RELEASE_ASSERT(!mPreformatStack
.empty(),
367 "Tried to pop without previous push.");
368 mPreformatStack
.pop();
// Serializes the start of an element into aStr.
//
// Visible behavior: rejects a null aElement (NS_ENSURE_ARG), looks up the tag
// atom for mElement, points mOutputString at aStr while DoOpenContainer() is
// called and resets it to nullptr afterwards, then special-cases <head>.
// NOTE(review): the assignment of mElement, the use of isContainer and the
// body of the <head> branch are not visible here.
373 nsPlainTextSerializer::AppendElementStart(Element
* aElement
,
374 Element
* aOriginalElement
,
376 NS_ENSURE_ARG(aElement
);
381 nsAtom
* id
= GetIdForContent(mElement
);
// Void HTML elements are not containers.
383 bool isContainer
= !FragmentOrElement::IsHTMLVoid(id
);
385 mOutputString
= &aStr
;
388 rv
= DoOpenContainer(id
);
394 mOutputString
= nullptr;
396 if (id
== nsGkAtoms::head
) {
// Serializes the end of an element into aStr.
//
// Visible behavior: rejects a null aElement (NS_ENSURE_ARG), looks up the tag
// atom for mElement, points mOutputString at aStr while DoCloseContainer() is
// called and resets it to nullptr afterwards. For <head> it asserts that
// mHeadLevel is non-zero before it is decremented.
// NOTE(review): the assignment of mElement, the use of isContainer and the
// decrement itself are not visible here.
404 nsPlainTextSerializer::AppendElementEnd(Element
* aElement
, nsAString
& aStr
) {
405 NS_ENSURE_ARG(aElement
);
410 nsAtom
* id
= GetIdForContent(mElement
);
// Void HTML elements are not containers.
412 bool isContainer
= !FragmentOrElement::IsHTMLVoid(id
);
414 mOutputString
= &aStr
;
418 rv
= DoCloseContainer(id
);
422 mOutputString
= nullptr;
424 if (id
== nsGkAtoms::head
) {
425 NS_ASSERTION(mHeadLevel
!= 0, "mHeadLevel being decremented below 0");
// Points mOutputString at aStr for the duration of the flush and resets it
// to nullptr afterwards.
// NOTE(review): the call that performs the flush between the two
// assignments is not visible here.
433 nsPlainTextSerializer::Flush(nsAString
& aStr
) {
434 mOutputString
= &aStr
;
436 mOutputString
= nullptr;
441 nsPlainTextSerializer::AppendDocumentStart(Document
* aDocument
,
// Handles the opening of a container element whose tag is aTag; mElement is
// the element being opened.
//
// In order, the visible code:
//  - bumps mIgnoredChildNodeLevel for placeholder-replaced elements,
//    ignorable ruby annotations, and ignorable <script>/<style>;
//  - schedules a line break after a preformatted block boundary when copying
//    to the plain-text clipboard;
//  - records aTag on the tag stack;
//  - handles <blockquote type=cite>, <noscript>/<iframe>/<noframes>
//    suppression, and the Thunderbird-style white-space hints on <body>;
//  - adjusts vertical space, indentation, list numbering/bullets and table
//    cell separators per tag;
//  - for formatted output only, emits header numbering/indent, link and
//    inline-emphasis markers.
// Keep the per-tag handling in sync with DoCloseContainer.
446 nsresult
nsPlainTextSerializer::DoOpenContainer(nsAtom
* aTag
) {
447 // Check if we need output current node as placeholder character and ignore
449 if (ShouldReplaceContainerWithPlaceholder(mElement
->NodeInfo()->NameAtom())) {
450 if (mIgnoredChildNodeLevel
== 0) {
451 // Serialize current node as placeholder character
452 Write(NS_LITERAL_STRING(u
"\xFFFC"));
454 // Ignore child nodes.
455 mIgnoredChildNodeLevel
++;
458 if (IsIgnorableRubyAnnotation(aTag
)) {
459 // Ignorable ruby annotation shouldn't be replaced by a placeholder
460 // character, neither any of its descendants.
461 mIgnoredChildNodeLevel
++;
464 if (IsIgnorableScriptOrStyle(mElement
)) {
465 mIgnoredChildNodeLevel
++;
// Clipboard copy: a preceding preformatted block boundary forces a pending
// line break before this container, and the boundary flag is then cleared.
469 if (mFlags
& nsIDocumentEncoder::OutputForPlainTextClipboardCopy
) {
470 if (mPreformattedBlockBoundary
&& DoOutput()) {
471 // Should always end a line, but get no more whitespace
472 if (mFloatingLines
< 0) mFloatingLines
= 0;
473 mLineBreakDue
= true;
475 mPreformattedBlockBoundary
= false;
478 if (mFlags
& nsIDocumentEncoder::OutputRaw
) {
479 // Raw means raw. Don't even think about doing anything fancy
480 // here like indenting, adding line breaks or any other
481 // characters such as list item bullets, quote characters
482 // around <q>, etc. I mean it! Don't make me smack you!
// Record the tag on the stack while the index is below TagStackSize.
487 if (mTagStackIndex
< TagStackSize
) {
488 mTagStack
[mTagStackIndex
++] = aTag
;
491 if (mIgnoreAboveIndex
!= (uint32_t)kNotFound
) {
495 // Reset this so that <blockquote type=cite> doesn't affect the whitespace
496 // above random <pre>s below it.
497 mHasWrittenCiteBlockquote
=
498 mHasWrittenCiteBlockquote
&& aTag
== nsGkAtoms::pre
;
500 bool isInCiteBlockquote
= false;
502 // XXX special-case <blockquote type=cite> so that we don't add additional
503 // newlines before the text.
504 if (aTag
== nsGkAtoms::blockquote
) {
506 nsresult rv
= GetAttributeValue(nsGkAtoms::type
, value
);
507 isInCiteBlockquote
= NS_SUCCEEDED(rv
) && value
.EqualsIgnoreCase("cite");
510 if (mLineBreakDue
&& !isInCiteBlockquote
) EnsureVerticalSpace(mFloatingLines
);
512 // Check if this tag's content that should not be output
513 if ((aTag
== nsGkAtoms::noscript
&&
514 !(mFlags
& nsIDocumentEncoder::OutputNoScriptContent
)) ||
515 ((aTag
== nsGkAtoms::iframe
|| aTag
== nsGkAtoms::noframes
) &&
516 !(mFlags
& nsIDocumentEncoder::OutputNoFramesContent
))) {
517 // Ignore everything that follows the current tag in
518 // question until a matching end tag is encountered.
519 mIgnoreAboveIndex
= mTagStackIndex
- 1;
523 if (aTag
== nsGkAtoms::body
) {
524 // Try to figure out here whether we have a
525 // preformatted style attribute set by Thunderbird.
527 // Trigger on the presence of a "pre-wrap" in the
528 // style attribute. That's a very simplistic way to do
529 // it, but better than nothing.
530 // Also set mWrapColumn to the value given there
531 // (which arguably we should only do if told to do so).
534 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::style
, style
)) &&
535 (kNotFound
!= (whitespace
= style
.Find("white-space:")))) {
536 if (kNotFound
!= style
.Find("pre-wrap", true, whitespace
)) {
537 #ifdef DEBUG_preformatted
538 printf("Set mPreFormattedMail based on style pre-wrap\n");
540 mPreFormattedMail
= true;
541 int32_t widthOffset
= style
.Find("width:");
542 if (widthOffset
>= 0) {
543 // We have to search for the ch before the semicolon,
544 // not for the semicolon itself, because nsString::ToInteger()
545 // considers 'c' to be a valid numeric char (even if radix=10)
546 // but then gets confused if it sees it next to the number
547 // when the radix specified was 10, and returns an error code.
// The 6 below is the length of the "width:" key that was just found.
548 int32_t semiOffset
= style
.Find("ch", false, widthOffset
+ 6);
549 int32_t length
= (semiOffset
> 0 ? semiOffset
- widthOffset
- 6
550 : style
.Length() - widthOffset
);
551 nsAutoString widthstr
;
552 style
.Mid(widthstr
, widthOffset
+ 6, length
);
554 int32_t col
= widthstr
.ToInteger(&err
);
556 if (NS_SUCCEEDED(err
)) {
557 mWrapColumn
= (uint32_t)col
;
558 #ifdef DEBUG_preformatted
559 printf("Set wrap column to %d based on style\n", mWrapColumn
);
563 } else if (kNotFound
!= style
.Find("pre", true, whitespace
)) {
564 #ifdef DEBUG_preformatted
565 printf("Set mPreFormattedMail based on style pre\n");
567 mPreFormattedMail
= true;
571 /* See comment at end of function. */
572 mInWhitespace
= true;
573 mPreFormattedMail
= false;
579 // Keep this in sync with DoCloseContainer!
584 if (aTag
== nsGkAtoms::p
)
585 EnsureVerticalSpace(1);
586 else if (aTag
== nsGkAtoms::pre
) {
587 if (GetLastBool(mIsInCiteBlockquote
))
588 EnsureVerticalSpace(0);
589 else if (mHasWrittenCiteBlockquote
) {
590 EnsureVerticalSpace(0);
591 mHasWrittenCiteBlockquote
= false;
593 EnsureVerticalSpace(1);
594 } else if (aTag
== nsGkAtoms::tr
) {
// Each row gets its own "has a cell been written yet" flag.
595 PushBool(mHasWrittenCellsForRow
, false);
596 } else if (aTag
== nsGkAtoms::td
|| aTag
== nsGkAtoms::th
) {
597 // We must make sure that the content of two table cells get a
598 // space between them.
600 // To make the separation between cells most obvious and
601 // importable, we use a TAB.
602 if (GetLastBool(mHasWrittenCellsForRow
)) {
603 // Bypass |Write| so that the TAB isn't compressed away.
605 mInWhitespace
= true;
606 } else if (mHasWrittenCellsForRow
.IsEmpty()) {
607 // We don't always see a <tr> (nor a <table>) before the <td> if we're
608 // copying part of a table
609 PushBool(mHasWrittenCellsForRow
, true); // will never be popped
611 SetLastBool(mHasWrittenCellsForRow
, true);
613 } else if (aTag
== nsGkAtoms::ul
) {
614 // Indent here to support nested lists, which aren't included in li :-(
// Only the outermost list (no enclosing ul/ol) gets an empty line before it.
615 EnsureVerticalSpace(mULCount
+ mOLStackIndex
== 0 ? 1 : 0);
616 // Must end the current line before we change indention
617 mIndent
+= kIndentSizeList
;
619 } else if (aTag
== nsGkAtoms::ol
) {
620 EnsureVerticalSpace(mULCount
+ mOLStackIndex
== 0 ? 1 : 0);
621 if (mFlags
& nsIDocumentEncoder::OutputFormatted
) {
622 // Must end the current line before we change indention
// Push this list's starting number: the "start" attribute when it parses
// as an integer, otherwise 1.
623 if (mOLStackIndex
< OLStackSize
) {
624 nsAutoString startAttr
;
625 int32_t startVal
= 1;
626 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::start
, startAttr
))) {
628 startVal
= startAttr
.ToInteger(&rv
);
629 if (NS_FAILED(rv
)) startVal
= 1;
631 mOLStack
[mOLStackIndex
++] = startVal
;
636 mIndent
+= kIndentSizeList
; // see ul
637 } else if (aTag
== nsGkAtoms::li
&&
638 (mFlags
& nsIDocumentEncoder::OutputFormatted
)) {
639 if (mTagStackIndex
> 1 && IsInOL()) {
640 if (mOLStackIndex
> 0) {
641 nsAutoString valueAttr
;
// A parseable "value" attribute on the <li> overrides the running counter.
642 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::value
, valueAttr
))) {
644 int32_t valueAttrVal
= valueAttr
.ToInteger(&rv
);
645 if (NS_SUCCEEDED(rv
)) mOLStack
[mOLStackIndex
- 1] = valueAttrVal
;
647 // This is what nsBulletFrame does for OLs:
648 mInIndentString
.AppendInt(mOLStack
[mOLStackIndex
- 1]++, 10);
650 mInIndentString
.Append(char16_t('#'));
653 mInIndentString
.Append(char16_t('.'));
// Bullet glyph is picked from "*o+#" by <ul> nesting depth, wrapping every
// four levels; index 3 ('#') is used when mULCount is 0.
656 static const char bulletCharArray
[] = "*o+#";
657 uint32_t index
= mULCount
> 0 ? (mULCount
- 1) : 3;
658 char bulletChar
= bulletCharArray
[index
% 4];
659 mInIndentString
.Append(char16_t(bulletChar
));
662 mInIndentString
.Append(char16_t(' '));
663 } else if (aTag
== nsGkAtoms::dl
) {
664 EnsureVerticalSpace(1);
665 } else if (aTag
== nsGkAtoms::dt
) {
666 EnsureVerticalSpace(0);
667 } else if (aTag
== nsGkAtoms::dd
) {
668 EnsureVerticalSpace(0);
669 mIndent
+= kIndentSizeDD
;
670 } else if (aTag
== nsGkAtoms::span
) {
672 } else if (aTag
== nsGkAtoms::blockquote
) {
674 PushBool(mIsInCiteBlockquote
, isInCiteBlockquote
);
675 if (isInCiteBlockquote
) {
676 EnsureVerticalSpace(0);
679 EnsureVerticalSpace(1);
680 mIndent
+= kTabSize
; // Check for some maximum value?
682 } else if (aTag
== nsGkAtoms::q
) {
683 Write(NS_LITERAL_STRING("\""));
686 // Else make sure we'll separate block level tags,
687 // even if we're about to leave, before doing any other formatting.
688 else if (IsElementBlock(mElement
)) {
689 EnsureVerticalSpace(0);
692 //////////////////////////////////////////////////////////////
693 if (!(mFlags
& nsIDocumentEncoder::OutputFormatted
)) {
696 //////////////////////////////////////////////////////////////
697 // The rest of this routine is formatted output stuff,
698 // which we should skip if we're not formatted:
699 //////////////////////////////////////////////////////////////
702 bool currentNodeIsConverted
= IsCurrentNodeConverted();
704 if (aTag
== nsGkAtoms::h1
|| aTag
== nsGkAtoms::h2
|| aTag
== nsGkAtoms::h3
||
705 aTag
== nsGkAtoms::h4
|| aTag
== nsGkAtoms::h5
|| aTag
== nsGkAtoms::h6
) {
706 EnsureVerticalSpace(2);
// Strategy 2 numbers the headers: bump this level's counter, zero the
// deeper levels, and build a dotted lead-up from levels 1..level.
707 if (mHeaderStrategy
== 2) { // numbered
708 mIndent
+= kIndentSizeHeaders
;
710 int32_t level
= HeaderLevel(aTag
);
711 // Increase counter for current level
712 mHeaderCounter
[level
]++;
713 // Reset all lower levels
716 for (i
= level
+ 1; i
<= 6; i
++) {
717 mHeaderCounter
[i
] = 0;
722 for (i
= 1; i
<= level
; i
++) {
723 leadup
.AppendInt(mHeaderCounter
[i
]);
724 leadup
.Append(char16_t('.'));
726 leadup
.Append(char16_t(' '));
728 } else if (mHeaderStrategy
== 1) { // indent increasingly
729 mIndent
+= kIndentSizeHeaders
;
730 for (int32_t i
= HeaderLevel(aTag
); i
> 1; i
--) {
731 // for h(x), run x-1 times
732 mIndent
+= kIndentIncrementHeaders
;
735 } else if (aTag
== nsGkAtoms::a
&& !currentNodeIsConverted
) {
737 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::href
, url
)) &&
// Inline structure markers, written only when mStructs is set and the node
// has not already been converted: ^ sup, _ sub, | code, * strong/b,
// / em/i, _ u.
741 } else if (aTag
== nsGkAtoms::sup
&& mStructs
&& !currentNodeIsConverted
) {
742 Write(NS_LITERAL_STRING("^"));
743 } else if (aTag
== nsGkAtoms::sub
&& mStructs
&& !currentNodeIsConverted
) {
744 Write(NS_LITERAL_STRING("_"));
745 } else if (aTag
== nsGkAtoms::code
&& mStructs
&& !currentNodeIsConverted
) {
746 Write(NS_LITERAL_STRING("|"));
747 } else if ((aTag
== nsGkAtoms::strong
|| aTag
== nsGkAtoms::b
) && mStructs
&&
748 !currentNodeIsConverted
) {
749 Write(NS_LITERAL_STRING("*"));
750 } else if ((aTag
== nsGkAtoms::em
|| aTag
== nsGkAtoms::i
) && mStructs
&&
751 !currentNodeIsConverted
) {
752 Write(NS_LITERAL_STRING("/"));
753 } else if (aTag
== nsGkAtoms::u
&& mStructs
&& !currentNodeIsConverted
) {
754 Write(NS_LITERAL_STRING("_"));
757 /* Container elements are always block elements, so we shouldn't
758 output any whitespace immediately after the container tag even if
759 there's extra whitespace there because the HTML is pretty-printed
760 or something. To ensure that happens, tell the serializer we're
761 already in whitespace so it won't output more. */
762 mInWhitespace
= true;
// Handles the closing of a container element whose tag is aTag; mElement is
// the element being closed. This is the counterpart of DoOpenContainer.
//
// In order, the visible code:
//  - decrements mIgnoredChildNodeLevel for the same three element kinds that
//    DoOpenContainer incremented it for;
//  - marks a preformatted block boundary when copying to the plain-text
//    clipboard;
//  - clears mIgnoreAboveIndex when the tag that set it is closed;
//  - undoes indentation and schedules line breaks per tag;
//  - for formatted output only, undoes header indentation and writes the
//    closing link / inline-emphasis markers.
767 nsresult
nsPlainTextSerializer::DoCloseContainer(nsAtom
* aTag
) {
768 if (ShouldReplaceContainerWithPlaceholder(mElement
->NodeInfo()->NameAtom())) {
769 mIgnoredChildNodeLevel
--;
772 if (IsIgnorableRubyAnnotation(aTag
)) {
773 mIgnoredChildNodeLevel
--;
776 if (IsIgnorableScriptOrStyle(mElement
)) {
777 mIgnoredChildNodeLevel
--;
781 if (mFlags
& nsIDocumentEncoder::OutputForPlainTextClipboardCopy
) {
782 if (DoOutput() && IsInPre() && IsElementBlock(mElement
)) {
783 // If we're closing a preformatted block element, output a line break
784 // when we find a new container.
785 mPreformattedBlockBoundary
= true;
789 if (mFlags
& nsIDocumentEncoder::OutputRaw
) {
790 // Raw means raw. Don't even think about doing anything fancy
791 // here like indenting, adding line breaks or any other
792 // characters such as list item bullets, quote characters
793 // around <q>, etc. I mean it! Don't make me smack you!
798 if (mTagStackIndex
> 0) {
802 if (mTagStackIndex
>= mIgnoreAboveIndex
) {
803 if (mTagStackIndex
== mIgnoreAboveIndex
) {
804 // We're dealing with the close tag whose matching
805 // open tag had set the mIgnoreAboveIndex value.
806 // Reset mIgnoreAboveIndex before discarding this tag.
807 mIgnoreAboveIndex
= (uint32_t)kNotFound
;
812 // End current line if we're ending a block level tag
813 if ((aTag
== nsGkAtoms::body
) || (aTag
== nsGkAtoms::html
)) {
814 // We want the output to end with a new line,
815 // but in preformatted areas like text fields,
816 // we can't emit newlines that weren't there.
817 // So add the newline only in the case of formatted output.
818 if (mFlags
& nsIDocumentEncoder::OutputFormatted
) {
819 EnsureVerticalSpace(0);
823 // We won't want to do anything with these in formatted mode either,
824 // so just return now:
828 // Keep this in sync with DoOpenContainer!
833 if (aTag
== nsGkAtoms::tr
) {
// Drop the per-row flag pushed when the <tr> was opened.
834 PopBool(mHasWrittenCellsForRow
);
835 // Should always end a line, but get no more whitespace
836 if (mFloatingLines
< 0) mFloatingLines
= 0;
837 mLineBreakDue
= true;
838 } else if (((aTag
== nsGkAtoms::li
) || (aTag
== nsGkAtoms::dt
)) &&
839 (mFlags
& nsIDocumentEncoder::OutputFormatted
)) {
840 // Items that should always end a line, but get no more whitespace
841 if (mFloatingLines
< 0) mFloatingLines
= 0;
842 mLineBreakDue
= true;
843 } else if (aTag
== nsGkAtoms::pre
) {
// No trailing empty line after a <pre> inside a cite blockquote, one
// otherwise.
844 mFloatingLines
= GetLastBool(mIsInCiteBlockquote
) ? 0 : 1;
845 mLineBreakDue
= true;
846 } else if (aTag
== nsGkAtoms::ul
) {
848 mIndent
-= kIndentSizeList
;
// Leaving the outermost list: no ul/ol remains open after this decrement.
849 if (--mULCount
+ mOLStackIndex
== 0) {
851 mLineBreakDue
= true;
853 } else if (aTag
== nsGkAtoms::ol
) {
854 FlushLine(); // Doing this after decreasing OLStackIndex would be wrong.
855 mIndent
-= kIndentSizeList
;
856 NS_ASSERTION(mOLStackIndex
, "Wrong OLStack level!");
858 if (mULCount
+ mOLStackIndex
== 0) {
860 mLineBreakDue
= true;
862 } else if (aTag
== nsGkAtoms::dl
) {
864 mLineBreakDue
= true;
865 } else if (aTag
== nsGkAtoms::dd
) {
867 mIndent
-= kIndentSizeDD
;
868 } else if (aTag
== nsGkAtoms::span
) {
869 NS_ASSERTION(mSpanLevel
, "Span level will be negative!");
871 } else if (aTag
== nsGkAtoms::div
) {
872 if (mFloatingLines
< 0) mFloatingLines
= 0;
873 mLineBreakDue
= true;
874 } else if (aTag
== nsGkAtoms::blockquote
) {
875 FlushLine(); // Is this needed?
878 bool isInCiteBlockquote
= PopBool(mIsInCiteBlockquote
);
880 if (isInCiteBlockquote
) {
881 NS_ASSERTION(mCiteQuoteLevel
, "CiteQuote level will be negative!");
884 mHasWrittenCiteBlockquote
= true;
889 mLineBreakDue
= true;
890 } else if (aTag
== nsGkAtoms::q
) {
891 Write(NS_LITERAL_STRING("\""));
892 } else if (IsElementBlock(mElement
)) {
893 // All other blocks get 1 vertical space after them
894 // in formatted mode, otherwise 0.
895 // This is hard. Sometimes 0 is a better number, but
897 if (mFlags
& nsIDocumentEncoder::OutputFormatted
)
898 EnsureVerticalSpace(1);
900 if (mFloatingLines
< 0) mFloatingLines
= 0;
901 mLineBreakDue
= true;
905 //////////////////////////////////////////////////////////////
906 if (!(mFlags
& nsIDocumentEncoder::OutputFormatted
)) {
909 //////////////////////////////////////////////////////////////
910 // The rest of this routine is formatted output stuff,
911 // which we should skip if we're not formatted:
912 //////////////////////////////////////////////////////////////
914 // Pop the currentConverted stack
915 bool currentNodeIsConverted
= IsCurrentNodeConverted();
917 if (aTag
== nsGkAtoms::h1
|| aTag
== nsGkAtoms::h2
|| aTag
== nsGkAtoms::h3
||
918 aTag
== nsGkAtoms::h4
|| aTag
== nsGkAtoms::h5
|| aTag
== nsGkAtoms::h6
) {
// Undo the indentation that DoOpenContainer added for this header.
919 if (mHeaderStrategy
) { /*numbered or indent increasingly*/
920 mIndent
-= kIndentSizeHeaders
;
922 if (mHeaderStrategy
== 1 /*indent increasingly*/) {
923 for (int32_t i
= HeaderLevel(aTag
); i
> 1; i
--) {
924 // for h(x), run x-1 times
925 mIndent
-= kIndentIncrementHeaders
;
928 EnsureVerticalSpace(1);
929 } else if (aTag
== nsGkAtoms::a
&& !currentNodeIsConverted
&&
// Builds the trailing link text starting with " <" and ending with ">".
932 temp
.AssignLiteral(" <");
934 temp
.Append(char16_t('>'));
937 } else if ((aTag
== nsGkAtoms::sup
|| aTag
== nsGkAtoms::sub
) && mStructs
&&
938 !currentNodeIsConverted
) {
940 } else if (aTag
== nsGkAtoms::code
&& mStructs
&& !currentNodeIsConverted
) {
941 Write(NS_LITERAL_STRING("|"));
942 } else if ((aTag
== nsGkAtoms::strong
|| aTag
== nsGkAtoms::b
) && mStructs
&&
943 !currentNodeIsConverted
) {
944 Write(NS_LITERAL_STRING("*"));
945 } else if ((aTag
== nsGkAtoms::em
|| aTag
== nsGkAtoms::i
) && mStructs
&&
946 !currentNodeIsConverted
) {
947 Write(NS_LITERAL_STRING("/"));
948 } else if (aTag
== nsGkAtoms::u
&& mStructs
&& !currentNodeIsConverted
) {
949 Write(NS_LITERAL_STRING("_"));
// Tests the conditions under which leaf content (text, <br>, <img>, ...)
// is suppressed: a positive mIgnoredChildNodeLevel, or a <select> as the top
// or second-from-top entry of the tag stack.
// NOTE(review): the return statements are not visible here.
955 bool nsPlainTextSerializer::MustSuppressLeaf() {
956 if (mIgnoredChildNodeLevel
> 0) {
960 if ((mTagStackIndex
> 1 &&
961 mTagStack
[mTagStackIndex
- 2] == nsGkAtoms::select
) ||
962 (mTagStackIndex
> 0 &&
963 mTagStack
[mTagStackIndex
- 1] == nsGkAtoms::select
)) {
964 // Don't output the contents of SELECT elements;
965 // Might be nice, eventually, to output just the selected element.
966 // Read more in bug 31994.
// Adds a run of text, or a line break when aIsLineBreak is true, to the
// output. AppendText() calls this once per text segment and once per break.
//
// Visible behavior: clears mHasWrittenCiteBlockquote, emits any pending
// vertical space, consults MustSuppressLeaf(), and picks between an
// EnsureVerticalSpace(mEmptyLines + 1) path and a path guarded by
// !mInWhitespace, depending on whether source whitespace is preserved.
// It also compares the text against mURL for links.
// NOTE(review): several statements, including the final write of aText,
// are not visible here.
973 void nsPlainTextSerializer::DoAddText(bool aIsLineBreak
,
974 const nsAString
& aText
) {
975 // If we don't want any output, just return
981 // Make sure to reset this, since it's no longer true.
982 mHasWrittenCiteBlockquote
= false;
985 if (mLineBreakDue
) EnsureVerticalSpace(mFloatingLines
);
987 if (MustSuppressLeaf()) {
992 // The only times we want to pass along whitespace from the original
993 // html source are if we're forced into preformatted mode via flags,
994 // or if we're prettyprinting and we're inside a <pre>.
995 // Otherwise, either we're collapsing to minimal text, or we're
996 // prettyprinting to mimic the html format, and in neither case
997 // does the formatting of the html source help us.
998 if ((mFlags
& nsIDocumentEncoder::OutputPreformatted
) ||
999 (mPreFormattedMail
&& !mWrapColumn
) || IsInPre()) {
1000 EnsureVerticalSpace(mEmptyLines
+ 1);
1001 } else if (!mInWhitespace
) {
1003 mInWhitespace
= true;
1008 /* Check, if we are in a link (symbolized with mURL containing the URL)
1009 and the text is equal to the URL. In that case we don't want to output
1010 the URL twice so we scrap the text in mURL. */
1011 if (!mURL
.IsEmpty() && mURL
.Equals(aText
)) {
// Serializes a leaf (void) element identified by aTag.
//
// Visible behavior:
//  - <br>: starts a new line unless it carries type="_moz";
//  - <hr> in formatted mode: a line of '-' as wide as mWrapColumn, or 25
//    when mWrapColumn is 0, on its own line;
//  - any other leaf with OutputNonTextContentAsPlaceholder set: U+FFFC;
//  - <img>: the alt attribute if present (even when empty), otherwise a
//    non-empty title wrapped as " [title] ".
// Clears mPreformattedBlockBoundary and emits pending vertical space first.
1017 nsresult
nsPlainTextSerializer::DoAddLeaf(nsAtom
* aTag
) {
1018 mPreformattedBlockBoundary
= false;
1020 // If we don't want any output, just return
1025 if (mLineBreakDue
) EnsureVerticalSpace(mFloatingLines
);
1027 if (MustSuppressLeaf()) {
1031 if (aTag
== nsGkAtoms::br
) {
1032 // Another egregious editor workaround, see bug 38194:
1033 // ignore the bogus br tags that the editor sticks here and there.
1034 nsAutoString tagAttr
;
1035 if (NS_FAILED(GetAttributeValue(nsGkAtoms::type
, tagAttr
)) ||
1036 !tagAttr
.EqualsLiteral("_moz")) {
1037 EnsureVerticalSpace(mEmptyLines
+ 1);
1039 } else if (aTag
== nsGkAtoms::hr
&&
1040 (mFlags
& nsIDocumentEncoder::OutputFormatted
)) {
1041 EnsureVerticalSpace(0);
1043 // Make a line of dashes as wide as the wrap width
1044 // XXX honoring percentage would be nice
1046 uint32_t width
= (mWrapColumn
> 0 ? mWrapColumn
: 25);
1047 while (line
.Length() < width
) {
1048 line
.Append(char16_t('-'));
1052 EnsureVerticalSpace(0);
1053 } else if (mFlags
& nsIDocumentEncoder::OutputNonTextContentAsPlaceholder
) {
1054 Write(NS_LITERAL_STRING(u
"\xFFFC"));
1055 } else if (aTag
== nsGkAtoms::img
) {
1056 /* Output (in decreasing order of preference)
1057 alt, title or nothing */
1058 // See <http://www.w3.org/TR/REC-html40/struct/objects.html#edef-IMG>
1059 nsAutoString imageDescription
;
1060 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::alt
, imageDescription
))) {
1061 // If the alt attribute has an empty value (|alt=""|), output nothing
1062 } else if (NS_SUCCEEDED(
1063 GetAttributeValue(nsGkAtoms::title
, imageDescription
)) &&
1064 !imageDescription
.IsEmpty()) {
1066 NS_LITERAL_STRING(" [") + imageDescription
+ NS_LITERAL_STRING("] ");
1069 Write(imageDescription
);
1076 * Adds as many newlines as necessary to get |noOfRows| empty lines
1078 * noOfRows = -1 : Being in the middle of some line of text
1079 * noOfRows = 0 : Being at the start of a line
1080 * noOfRows = n>0 : Having n empty lines before the current line.
// Ensures at least noOfRows empty lines precede the current position by
// looping while mEmptyLines < noOfRows. On exit it clears mLineBreakDue and
// resets mFloatingLines to -1.
// NOTE(review): the calls that actually end the line inside the two guarded
// sections are not visible here.
1082 void nsPlainTextSerializer::EnsureVerticalSpace(int32_t noOfRows
) {
1083 // If we have something in the indent we probably want to output
1084 // it and it's not included in the count for empty lines so we don't
1085 // realize that we should start a new line.
1086 if (noOfRows
>= 0 && !mInIndentString
.IsEmpty()) {
1088 mInWhitespace
= true;
1091 while (mEmptyLines
< noOfRows
) {
1093 mInWhitespace
= true;
1095 mLineBreakDue
= false;
1096 mFloatingLines
= -1;
1100 * This empties the current line cache without adding a NEWLINE.
1101 * Should not be used if line wrapping is of importance since
1102 * this function destroys the cache information.
1104 * It will also write indentation and quotes if we believe we are
1105 * at the start of the line.
// Writes the buffered mCurrentLine to the output without ending the line,
// then empties the buffer and zeroes mCurrentLineWidth. Does nothing to the
// output when the buffer is empty. Quotes and indentation are written first
// if we are still at the first column.
1107 void nsPlainTextSerializer::FlushLine() {
1108 if (!mCurrentLine
.IsEmpty()) {
1109 if (mAtFirstColumn
) {
1110 OutputQuotesAndIndent(); // XXX: Should we always do this? Bug?
1113 Output(mCurrentLine
);
// Stay "at first column" only if we already were and nothing was written.
1114 mAtFirstColumn
= mAtFirstColumn
&& mCurrentLine
.IsEmpty();
1115 mCurrentLine
.Truncate();
1116 mCurrentLineWidth
= 0;
1121 * Prints the text to output to our current output device (the string
1122 * mOutputString). The only logic here is to replace non breaking spaces with a
1123 * normal space since most (all?) receivers of the result won't understand the
1124 * nbsp and even be confused by it.
1126 void nsPlainTextSerializer::Output(nsString
& aString
) {
1127 if (!aString
.IsEmpty()) {
1128 mStartedOutput
= true;
1131 if (!(mFlags
& nsIDocumentEncoder::OutputPersistNBSP
)) {
1132 // First, replace all nbsp characters with spaces,
1133 // which the unicode encoder won't do for us.
1134 aString
.ReplaceChar(kNBSP
, kSPACE
);
1136 mOutputString
->Append(aString
);
1139 static bool IsSpaceStuffable(const char16_t
* s
) {
1140 if (s
[0] == '>' || s
[0] == ' ' || s
[0] == kNBSP
||
1141 NS_strncmp(s
, u
"From ", 5) == 0)
1148 * This function adds a piece of text to the current stored line. If we are
1149 * wrapping text and the stored line will become too long, a suitable
1150 * location to wrap will be found and the line that's complete will be
// Appends |aLineFragmentLength| UTF-16 code units starting at |aLineFragment|
// to mCurrentLine and keeps mCurrentLineWidth in step. Afterwards, for as long
// as the line plus its prefix is wider than the wrap column (plus a small
// bonus), a break position is searched for, the part before it is ended with
// a soft line break via EndLine(), and the remainder stays buffered.
1153 void nsPlainTextSerializer::AddToLine(const char16_t
* aLineFragment
,
1154 int32_t aLineFragmentLength
) {
// Columns taken up in front of the text: when quoting, one per cite level
// plus one more, and in all cases the current indentation.
1155 uint32_t prefixwidth
=
1156 (mCiteQuoteLevel
> 0 ? mCiteQuoteLevel
+ 1 : 0) + mIndent
;
// A line break was deferred earlier; produce the vertical space now.
1158 if (mLineBreakDue
) EnsureVerticalSpace(mFloatingLines
);
1160 int32_t linelength
= mCurrentLine
.Length();
1161 if (0 == linelength
) {
1162 if (0 == aLineFragmentLength
) {
1163 // Nothing at all. Are you kidding me?
1167 if (mFlags
& nsIDocumentEncoder::OutputFormatFlowed
) {
1168 if (IsSpaceStuffable(aLineFragment
) &&
1169 mCiteQuoteLevel
== 0 // We space-stuff quoted lines anyway
1171 // Space stuffing a la RFC 2646 (format=flowed).
1172 mCurrentLine
.Append(char16_t(' '));
1175 mCurrentLineWidth
+= GetUnicharWidth(' ');
1176 #ifdef DEBUG_wrapping
1177 NS_ASSERTION(GetUnicharStringWidth(mCurrentLine
.get(),
1178 mCurrentLine
.Length()) ==
1179 (int32_t)mCurrentLineWidth
,
1180 "mCurrentLineWidth and reality out of sync!");
// Buffer the new fragment and account for its display width.
1188 mCurrentLine
.Append(aLineFragment
, aLineFragmentLength
);
1190 mCurrentLineWidth
+=
1191 GetUnicharStringWidth(aLineFragment
, aLineFragmentLength
);
// NOTE(review): the assertion below spells the helper GetUnicharstringWidth
// (lower-case s), unlike the GetUnicharStringWidth spelling used just above.
// Confirm that it resolves when DEBUG_wrapping is defined.
1192 #ifdef DEBUG_wrapping
1194 GetUnicharstringWidth(mCurrentLine
.get(), mCurrentLine
.Length()) ==
1195 (int32_t)mCurrentLineWidth
,
1196 "mCurrentLineWidth and reality out of sync!");
1200 linelength
= mCurrentLine
.Length();
// NOTE(review): same lower-case spelling of GetUnicharstringWidth as above.
1204 #ifdef DEBUG_wrapping
1206 GetUnicharstringWidth(mCurrentLine
.get(), mCurrentLine
.Length()) ==
1207 (int32_t)mCurrentLineWidth
,
1208 "mCurrentLineWidth and reality out of sync!");
1211 // The "+4" is to avoid wrap lines that only would be a couple
1212 // of letters too long. We give this bonus only if the
1213 // wrapcolumn is more than 20.
1214 uint32_t bonuswidth
= (mWrapColumn
> 20) ? 4 : 0;
1216 // XXX: Should calculate prefixwidth with GetUnicharStringWidth
1217 while (mCurrentLineWidth
+ prefixwidth
> mWrapColumn
+ bonuswidth
) {
1218 // We go from the end removing one letter at a time until
1219 // we have a reasonable width
1220 int32_t goodSpace
= mCurrentLine
.Length();
1221 uint32_t width
= mCurrentLineWidth
;
1222 while (goodSpace
> 0 && (width
+ prefixwidth
> mWrapColumn
)) {
1224 width
-= GetUnicharWidth(mCurrentLine
[goodSpace
]);
// Ask the line breaker for a position relative to |goodSpace|; presumably
// Prev() yields the nearest break opportunity before it (TODO confirm).
1230 goodSpace
= mLineBreaker
->Prev(mCurrentLine
.get(),
1231 mCurrentLine
.Length(), goodSpace
);
1232 if (goodSpace
!= NS_LINEBREAKER_NEED_MORE_TEXT
&&
1233 nsCRT::IsAsciiSpace(mCurrentLine
.CharAt(goodSpace
- 1))) {
1234 --goodSpace
; // adjust the position since line breaker returns a
1235 // position next to space
1238 // fallback if the line breaker is unavailable or failed
1239 if (!mLineBreaker
) {
1240 if (mCurrentLine
.IsEmpty() || mWrapColumn
< prefixwidth
) {
1241 goodSpace
= NS_LINEBREAKER_NEED_MORE_TEXT
;
1244 std::min(mWrapColumn
- prefixwidth
, mCurrentLine
.Length() - 1);
1245 while (goodSpace
>= 0 &&
1246 !nsCRT::IsAsciiSpace(mCurrentLine
.CharAt(goodSpace
))) {
// Receives the text that stays buffered after the break.
1252 nsAutoString restOfLine
;
1253 if (goodSpace
== NS_LINEBREAKER_NEED_MORE_TEXT
) {
1254 // If we didn't find a good place to break, accept long line and
1255 // try to find another place to break
1257 (prefixwidth
> mWrapColumn
+ 1) ? 1 : mWrapColumn
- prefixwidth
+ 1;
1259 if ((uint32_t)goodSpace
< mCurrentLine
.Length())
1260 goodSpace
= mLineBreaker
->Next(mCurrentLine
.get(),
1261 mCurrentLine
.Length(), goodSpace
);
1262 if (goodSpace
== NS_LINEBREAKER_NEED_MORE_TEXT
)
1263 goodSpace
= mCurrentLine
.Length();
1265 // fallback if the line breaker is unavailable or failed
1266 if (!mLineBreaker
) {
1268 (prefixwidth
> mWrapColumn
) ? 1 : mWrapColumn
- prefixwidth
;
1269 while (goodSpace
< linelength
&&
1270 !nsCRT::IsAsciiSpace(mCurrentLine
.CharAt(goodSpace
))) {
1276 if ((goodSpace
< linelength
) && (goodSpace
> 0)) {
1277 // Found a place to break
1279 // -1 (trim a char at the break position)
1280 // only if the line break was a space.
1281 if (nsCRT::IsAsciiSpace(mCurrentLine
.CharAt(goodSpace
))) {
1282 mCurrentLine
.Right(restOfLine
, linelength
- goodSpace
- 1);
1284 mCurrentLine
.Right(restOfLine
, linelength
- goodSpace
);
1286 // if breaker was U+0020, it has to consider for delsp=yes support
1287 bool breakBySpace
= mCurrentLine
.CharAt(goodSpace
) == ' ';
// Keep only the part before the break, emit it as a soft-broken line, then
// rebuild the buffer from the remainder.
1288 mCurrentLine
.Truncate(goodSpace
);
1289 EndLine(true, breakBySpace
);
1290 mCurrentLine
.Truncate();
1291 // Space stuff new line?
1292 if (mFlags
& nsIDocumentEncoder::OutputFormatFlowed
) {
1293 if (!restOfLine
.IsEmpty() && IsSpaceStuffable(restOfLine
.get()) &&
1294 mCiteQuoteLevel
== 0 // We space-stuff quoted lines anyway
1296 // Space stuffing a la RFC 2646 (format=flowed).
1297 mCurrentLine
.Append(char16_t(' '));
1298 // XXX doesn't seem to work correctly for ' '
1301 mCurrentLine
.Append(restOfLine
);
1303 GetUnicharStringWidth(mCurrentLine
.get(), mCurrentLine
.Length());
1304 linelength
= mCurrentLine
.Length();
1307 // Nothing to do. Hopefully we get more data later
1308 // to use for a place to break line
1318 * Outputs the contents of mCurrentLine, and resets line specific
1319 * variables. Also adds an indentation and prefix if there is
1320 * one specified. Strips ending spaces from the line if it isn't
// Finishes the current line: optionally trims trailing spaces, adds the soft
// line-break marker for format=flowed output, writes quotes and indent when at
// the first column, appends mLineBreak, outputs the line and resets the
// per-line state.
//
// aSoftlinebreak - true when the break comes from wrapping (AddToLine passes
//                  true at its wrap point).
// aBreakBySpace  - true when the character at the wrap position was U+0020;
//                  only consulted together with the OutputFormatDelSp flag.
1323 void nsPlainTextSerializer::EndLine(bool aSoftlinebreak
, bool aBreakBySpace
) {
1324 uint32_t currentlinelength
= mCurrentLine
.Length();
// A soft break on an empty line is a special case.
1326 if (aSoftlinebreak
&& 0 == currentlinelength
) {
1331 /* In non-preformatted mode, remove spaces from the end of the line for
1332 * format=flowed compatibility. Don't do this for these special cases:
1333 * "-- ", the signature separator (RFC 2646) shouldn't be touched and
1334 * "- -- ", the OpenPGP dash-escaped signature separator in inline
1335 * signed messages according to the OpenPGP standard (RFC 2440).
1337 if (!(mFlags
& nsIDocumentEncoder::OutputPreformatted
) &&
1338 !(mFlags
& nsIDocumentEncoder::OutputDontRemoveLineEndingSpaces
) &&
1339 (aSoftlinebreak
|| !(mCurrentLine
.EqualsLiteral("-- ") ||
1340 mCurrentLine
.EqualsLiteral("- -- ")))) {
1341 // Remove spaces from the end of the line.
1342 while (currentlinelength
> 0 &&
1343 mCurrentLine
[currentlinelength
- 1] == ' ') {
1344 --currentlinelength
;
1346 mCurrentLine
.SetLength(currentlinelength
);
1349 if (aSoftlinebreak
&& (mFlags
& nsIDocumentEncoder::OutputFormatFlowed
) &&
1351 // Add the soft part of the soft linebreak (RFC 2646 4.1)
1352 // We only do this when there is no indentation since format=flowed
1353 // lines and indentation doesn't work well together.
1355 // If breaker character is ASCII space with RFC 3676 support (delsp=yes),
1357 if ((mFlags
& nsIDocumentEncoder::OutputFormatDelSp
) && aBreakBySpace
)
1358 mCurrentLine
.AppendLiteral(" ");
1360 mCurrentLine
.Append(char16_t(' '));
1363 if (aSoftlinebreak
) {
1367 if (!mCurrentLine
.IsEmpty() || !mInIndentString
.IsEmpty()) {
1374 if (mAtFirstColumn
) {
1375 // If we don't have anything "real" to output we have to
1376 // make sure the indent doesn't end in a space since that
1377 // would trick a format=flowed-aware receiver.
1378 bool stripTrailingSpaces
= mCurrentLine
.IsEmpty();
1379 OutputQuotesAndIndent(stripTrailingSpaces
);
// Terminate the line and hand it to the output.
1382 mCurrentLine
.Append(mLineBreak
);
1383 Output(mCurrentLine
);
// The next line starts from a clean state.
1384 mCurrentLine
.Truncate();
1385 mCurrentLineWidth
= 0;
1386 mAtFirstColumn
= true;
1387 mInWhitespace
= true;
1388 mLineBreakDue
= false;
1389 mFloatingLines
= -1;
1393 * Outputs the calculated and stored indent and text in the indentation. That is
1394 * quote chars and numbers for numbered lists and such. It will also reset any
1395 * stored text to put in the indentation after using it.
// Builds and outputs the prefix of a line, in this order:
//   1. one '>' per cite level, followed by a space if mCurrentLine is not
//      empty;
//   2. spaces padding the indentation up to mIndent;
//   3. the pending mInIndentString, which is cleared afterwards.
// mAtFirstColumn is set to false whenever one of these parts is produced.
//
// stripTrailingSpaces - when true, spaces at the end of the assembled prefix
//                       are removed before it is output.
1397 void nsPlainTextSerializer::OutputQuotesAndIndent(
1398 bool stripTrailingSpaces
/* = false */) {
1399 nsAutoString stringToOutput
;
1401 // Put the mail quote "> " chars in, if appropriate:
1402 if (mCiteQuoteLevel
> 0) {
1403 nsAutoString quotes
;
1404 for (int i
= 0; i
< mCiteQuoteLevel
; i
++) {
1405 quotes
.Append(char16_t('>'));
1407 if (!mCurrentLine
.IsEmpty()) {
1408 /* Better don't output a space here, if the line is empty,
1409 in case a receiving f=f-aware UA thinks, this were a flowed line,
1410 which it isn't - it's just empty.
1411 (Flowed lines may be joined with the following one,
1412 so the empty line may be lost completely.) */
1413 quotes
.Append(char16_t(' '));
1415 stringToOutput
= quotes
;
1416 mAtFirstColumn
= false;
1419 // Indent if necessary
// The pending in-indent text is appended further down, so only the remaining
// columns up to mIndent are padded with spaces here.
1420 int32_t indentwidth
= mIndent
- mInIndentString
.Length();
1421 if (indentwidth
> 0 && (!mCurrentLine
.IsEmpty() || !mInIndentString
.IsEmpty())
1422 // Don't make empty lines look flowed
1424 nsAutoString spaces
;
1425 for (int i
= 0; i
< indentwidth
; ++i
) spaces
.Append(char16_t(' '));
1426 stringToOutput
+= spaces
;
1427 mAtFirstColumn
= false;
// Use up the stored in-indent text; it is output only once.
1430 if (!mInIndentString
.IsEmpty()) {
1431 stringToOutput
+= mInIndentString
;
1432 mAtFirstColumn
= false;
1433 mInIndentString
.Truncate();
1436 if (stripTrailingSpaces
) {
1437 int32_t lineLength
= stringToOutput
.Length();
1438 while (lineLength
> 0 && ' ' == stringToOutput
[lineLength
- 1]) {
1441 stringToOutput
.SetLength(lineLength
);
1444 if (!stringToOutput
.IsEmpty()) {
1445 Output(stringToOutput
);
1450 * Write a string. This is the highlevel function to use to get text output.
1451 * By using AddToLine, Output, EndLine and other functions it handles quotation,
1452 * line wrapping, indentation, whitespace compression and other things.
// High-level entry point for text output. Works on a private copy of |aStr|
// and returns immediately when it is empty. The text then takes one of two
// paths: a non-wrapping path that assembles each line in mCurrentLine and
// sends it straight to Output(), or a wrapping path that feeds the text
// piece by piece through AddToLine().
1454 void nsPlainTextSerializer::Write(const nsAString
& aStr
) {
1455 // XXX Copy necessary to use nsString methods and gain
1456 // access to underlying buffer
1457 nsAutoString
str(aStr
);
1459 #ifdef DEBUG_wrapping
1460 printf("Write(%s): wrap col = %d\n", NS_ConvertUTF16toUTF8(str
).get(),
1467 int32_t totLen
= str
.Length();
1469 // If the string is empty, do nothing:
1470 if (totLen
<= 0) return;
1472 // For Flowed text change nbsp-ses to spaces at end of lines to allow them
1473 // to be cut off along with usual spaces if required. (bug #125928)
1474 if (mFlags
& nsIDocumentEncoder::OutputFormatFlowed
) {
// Walk backwards from the end of the string, stepping over trailing
// whitespace characters.
1475 for (int32_t i
= totLen
- 1; i
>= 0; i
--) {
1476 char16_t c
= str
[i
];
1477 if ('\n' == c
|| '\r' == c
|| ' ' == c
|| '\t' == c
) continue;
1479 str
.Replace(i
, 1, ' ');
1485 // We have two major codepaths here. One that does preformatted text and one
1486 // that does normal formatted text. The one for preformatted text calls
1487 // Output directly while the other code path goes through AddToLine.
1488 if ((mPreFormattedMail
&& !mWrapColumn
) ||
1489 (IsInPre() && !mPreFormattedMail
) ||
1490 (mSpanLevel
> 0 && mEmptyLines
>= 0 && IsQuotedLine(str
))) {
1491 // No intelligent wrapping.
1493 // This mustn't be mixed with intelligent wrapping without clearing
1494 // the mCurrentLine buffer before!!!
1495 NS_ASSERTION(mCurrentLine
.IsEmpty() || (IsInPre() && !mPreFormattedMail
),
1496 "Mixed wrapping data and nonwrapping data on the same line");
1497 if (!mCurrentLine
.IsEmpty()) {
1501 // Put the mail quote "> " chars in, if appropriate.
1502 // Have to put it in before every line.
// Emit the text one line at a time; |bol| is the offset in |str| at which the
// current line begins.
1503 while (bol
< totLen
) {
1504 bool outputQuotes
= mAtFirstColumn
;
1506 bool outputLineBreak
= false;
1507 bool spacesOnly
= true;
1509 // Find one of '\n' or '\r' using iterators since nsAString
1510 // doesn't have the old FindCharInSet function.
1511 nsAString::const_iterator iter
;
1512 str
.BeginReading(iter
);
1513 nsAString::const_iterator done_searching
;
1514 str
.EndReading(done_searching
);
1516 int32_t new_newline
= bol
;
1517 newline
= kNotFound
;
1518 while (iter
!= done_searching
) {
1519 if ('\n' == *iter
|| '\r' == *iter
) {
1520 newline
= new_newline
;
1523 if (' ' != *iter
) spacesOnly
= false;
// |stringpart| is the piece to output for this line: the rest of the string
// when no line break was found, otherwise the text up to the break.
1529 nsAutoString stringpart
;
1530 if (newline
== kNotFound
) {
1532 stringpart
.Assign(Substring(str
, bol
, totLen
- bol
));
1533 if (!stringpart
.IsEmpty()) {
1534 char16_t lastchar
= stringpart
[stringpart
.Length() - 1];
1535 if ((lastchar
== '\t') || (lastchar
== ' ') || (lastchar
== '\r') ||
1536 (lastchar
== '\n')) {
1537 mInWhitespace
= true;
1539 mInWhitespace
= false;
1543 atFirstColumn
= mAtFirstColumn
&& (totLen
- bol
) == 0;
1546 // There is a newline
1547 stringpart
.Assign(Substring(str
, bol
, newline
- bol
));
1548 mInWhitespace
= true;
1549 outputLineBreak
= true;
1551 atFirstColumn
= true;
1553 if ('\r' == *iter
&& bol
< totLen
&& '\n' == *++iter
) {
1554 // There was a CRLF in the input. This used to be illegal and
1555 // stripped by the parser. Apparently not anymore. Let's skip
1561 mCurrentLine
.Truncate();
1562 if (mFlags
& nsIDocumentEncoder::OutputFormatFlowed
) {
1563 if ((outputLineBreak
|| !spacesOnly
) && // bugs 261467,125928
1564 !IsQuotedLine(stringpart
) && !stringpart
.EqualsLiteral("-- ") &&
1565 !stringpart
.EqualsLiteral("- -- "))
1566 stringpart
.Trim(" ", false, true, true);
1567 if (IsSpaceStuffable(stringpart
.get()) && !IsQuotedLine(stringpart
))
1568 mCurrentLine
.Append(char16_t(' '));
1570 mCurrentLine
.Append(stringpart
);
1573 // Note: this call messes with mAtFirstColumn
1574 OutputQuotesAndIndent();
1577 Output(mCurrentLine
);
1578 if (outputLineBreak
) {
1581 mAtFirstColumn
= atFirstColumn
;
1584 // Reset mCurrentLine.
1585 mCurrentLine
.Truncate();
1587 #ifdef DEBUG_wrapping
1588 printf("No wrapping: newline is %d, totLen is %d\n", newline
, totLen
);
1593 // Intelligent handling of text
1594 // If needed, strip out all "end of lines"
1595 // and multiple whitespace between words
1597 const char16_t
* offsetIntoBuffer
= nullptr;
1599 while (bol
< totLen
) { // Loop over lines
1600 // Find a place where we may have to do whitespace compression
1601 nextpos
= str
.FindCharInSet(" \t\n\r", bol
);
1602 #ifdef DEBUG_wrapping
1603 nsAutoString remaining
;
1604 str
.Right(remaining
, totLen
- bol
);
1605 foo
= ToNewCString(remaining
);
1606 // printf("Next line: bol = %d, newlinepos = %d, totLen = %d, "
1607 // "string = '%s'\n", bol, nextpos, totLen, foo);
1611 if (nextpos
== kNotFound
) {
1612 // The rest of the string
1613 offsetIntoBuffer
= str
.get() + bol
;
1614 AddToLine(offsetIntoBuffer
, totLen
- bol
);
1616 mInWhitespace
= false;
1618 // There's still whitespace left in the string
1619 if (nextpos
!= 0 && (nextpos
+ 1) < totLen
) {
1620 offsetIntoBuffer
= str
.get() + nextpos
;
1621 // skip '\n' if it is between CJ chars
1622 if (offsetIntoBuffer
[0] == '\n' && IS_CJ_CHAR(offsetIntoBuffer
[-1]) &&
1623 IS_CJ_CHAR(offsetIntoBuffer
[1])) {
1624 offsetIntoBuffer
= str
.get() + bol
;
1625 AddToLine(offsetIntoBuffer
, nextpos
- bol
);
1630 // If we're already in whitespace and not preformatted, just skip it:
1631 if (mInWhitespace
&& (nextpos
== bol
) && !mPreFormattedMail
&&
1632 !(mFlags
& nsIDocumentEncoder::OutputPreformatted
)) {
1638 if (nextpos
== bol
) {
1639 // Note that we are in whitespace.
1640 mInWhitespace
= true;
1641 offsetIntoBuffer
= str
.get() + nextpos
;
1642 AddToLine(offsetIntoBuffer
, 1);
1647 mInWhitespace
= true;
1649 offsetIntoBuffer
= str
.get() + bol
;
1650 if (mPreFormattedMail
||
1651 (mFlags
& nsIDocumentEncoder::OutputPreformatted
)) {
1652 // Preserve the real whitespace character
1654 AddToLine(offsetIntoBuffer
, nextpos
- bol
);
1657 // Replace the whitespace with a space
1658 AddToLine(offsetIntoBuffer
, nextpos
- bol
);
1659 AddToLine(kSpace
.get(), 1);
1660 bol
= nextpos
+ 1; // Let's eat the whitespace
1663 } // Continue looping over the string
1667 * Gets the value of an attribute in a string. If the function returns
1668 * NS_ERROR_NOT_AVAILABLE, there was none such attribute specified.
// Looks up the attribute |aName| (in no namespace) on mElement and writes its
// value into |aValueRet|. When the lookup does not succeed the function falls
// through to NS_ERROR_NOT_AVAILABLE; callers test the result with
// NS_SUCCEEDED().
1670 nsresult
nsPlainTextSerializer::GetAttributeValue(nsAtom
* aName
,
1671 nsString
& aValueRet
) {
1673 if (mElement
->GetAttr(kNameSpaceID_None
, aName
, aValueRet
)) {
1678 return NS_ERROR_NOT_AVAILABLE
;
1682 * Returns true, if the element was inserted by Moz' TXT->HTML converter.
1683 * In this case, we should ignore it.
// Checks the class attribute of the current element: the result is true when
// the attribute is available and matches moz-txt, optionally preceded by a
// double quote, ignoring case. The counts 7 and 8 presumably limit the
// comparison to that many leading characters (TODO confirm against
// EqualsIgnoreCase).
1685 bool nsPlainTextSerializer::IsCurrentNodeConverted() {
1687 nsresult rv
= GetAttributeValue(nsGkAtoms::_class
, value
);
1688 return (NS_SUCCEEDED(rv
) && (value
.EqualsIgnoreCase("moz-txt", 7) ||
1689 value
.EqualsIgnoreCase("\"moz-txt", 8)));
// Returns the local-name atom of |aContent| when that atom is static, and
// nullptr when it is not. Content that is not an HTML element is handled
// separately by the IsHTMLElement() check at the top.
1693 nsAtom
* nsPlainTextSerializer::GetIdForContent(nsIContent
* aContent
) {
1694 if (!aContent
->IsHTMLElement()) {
1698 nsAtom
* localName
= aContent
->NodeInfo()->NameAtom();
// Only static atoms are handed out; any other atom is reported as nullptr.
1699 return localName
->IsStatic() ? localName
: nullptr;
1702 bool nsPlainTextSerializer::IsInPre() {
1703 return !mPreformatStack
.empty() && mPreformatStack
.top();
1706 bool nsPlainTextSerializer::IsElementPreformatted(Element
* aElement
) {
1707 RefPtr
<ComputedStyle
> computedStyle
=
1708 nsComputedDOMStyle::GetComputedStyleNoFlush(aElement
, nullptr);
1709 if (computedStyle
) {
1710 const nsStyleText
* textStyle
= computedStyle
->StyleText();
1711 return textStyle
->WhiteSpaceOrNewlineIsSignificant();
1713 // Fall back to looking at the tag, in case there is no style information.
1714 return GetIdForContent(aElement
) == nsGkAtoms::pre
;
1717 bool nsPlainTextSerializer::IsElementBlock(Element
* aElement
) {
1718 RefPtr
<ComputedStyle
> computedStyle
=
1719 nsComputedDOMStyle::GetComputedStyleNoFlush(aElement
, nullptr);
1720 if (computedStyle
) {
1721 const nsStyleDisplay
* displayStyle
= computedStyle
->StyleDisplay();
1722 return displayStyle
->IsBlockOutsideStyle();
1724 // Fall back to looking at the tag, in case there is no style information.
1725 return nsContentUtils::IsHTMLBlock(aElement
);
1729 * This method is required only to identify LI's inside OL.
1730 * Returns TRUE if we are inside an OL tag and FALSE otherwise.
// Inspects mTagStack, starting from mTagStackIndex, to decide whether the
// current LI belongs to an OL: an ol entry yields true, while a ul entry
// means the LI belongs to that UL instead.
1732 bool nsPlainTextSerializer::IsInOL() {
1733 int32_t i
= mTagStackIndex
;
1735 if (mTagStack
[i
] == nsGkAtoms::ol
) return true;
1736 if (mTagStack
[i
] == nsGkAtoms::ul
) {
1737 // If a UL is reached first, LI belongs the UL nested in OL.
1741 // We may reach here for orphan LI's.
1746 @return 0 = no header, 1 = h1, ..., 6 = h6
1748 int32_t HeaderLevel(nsAtom
* aTag
) {
1749 if (aTag
== nsGkAtoms::h1
) {
1752 if (aTag
== nsGkAtoms::h2
) {
1755 if (aTag
== nsGkAtoms::h3
) {
1758 if (aTag
== nsGkAtoms::h4
) {
1761 if (aTag
== nsGkAtoms::h5
) {
1764 if (aTag
== nsGkAtoms::h6
) {
1771 * This is an implementation of GetUnicharWidth() and
1772 * GetUnicharStringWidth() as defined in
1773 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
1774 * <http://www.UNIX-systems.org/online.html>
1776 * Markus Kuhn -- 2000-02-08 -- public domain
1778 * Minor alterations to fit Mozilla's data types by Daniel Bratell
1781 /* These functions define the column width of an ISO 10646 character
1784 * - The null character (U+0000) has a column width of 0.
1786 * - Other C0/C1 control characters and DEL will lead to a return
1789 * - Non-spacing and enclosing combining characters (general
1790 * category code Mn or Me in the Unicode database) have a
1791 * column width of 0.
1793 * - Spacing characters in the East Asian Wide (W) or East Asian
1794 * FullWidth (F) category as defined in Unicode Technical
1795 * Report #11 have a column width of 2.
1797 * - All remaining characters (including all printable
1798 * ISO 8859-1 and WGL4 characters, Unicode control characters,
1799 * etc.) have a column width of 1.
1801 * This implementation assumes that wchar_t characters are encoded
1812 struct CombiningComparator
{
1813 const char16_t mUcs
;
1814 explicit CombiningComparator(char16_t aUcs
) : mUcs(aUcs
) {}
1815 int operator()(const interval
& combining
) const {
1816 if (mUcs
> combining
.last
) return 1;
1817 if (mUcs
< combining
.first
) return -1;
1819 MOZ_ASSERT(combining
.first
<= mUcs
);
1820 MOZ_ASSERT(mUcs
<= combining
.last
);
// Returns the column width of the UTF-16 code unit |ucs|, following the rules
// in the comment block preceding this function: 0 for U+0000, -1 for C0/C1
// control characters and DEL, and 1 for everything below the first combining
// interval and for the remaining code units below U+1100. Code units found in
// the |combining| table and those in the wide ranges tested at the end are
// handled separately.
1827 int32_t GetUnicharWidth(char16_t ucs
) {
1828 /* sorted list of non-overlapping intervals of non-spacing characters */
1829 static const interval combining
[] = {
1830 {0x0300, 0x034E}, {0x0360, 0x0362}, {0x0483, 0x0486}, {0x0488, 0x0489},
1831 {0x0591, 0x05A1}, {0x05A3, 0x05B9}, {0x05BB, 0x05BD}, {0x05BF, 0x05BF},
1832 {0x05C1, 0x05C2}, {0x05C4, 0x05C4}, {0x064B, 0x0655}, {0x0670, 0x0670},
1833 {0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED}, {0x0711, 0x0711},
1834 {0x0730, 0x074A}, {0x07A6, 0x07B0}, {0x0901, 0x0902}, {0x093C, 0x093C},
1835 {0x0941, 0x0948}, {0x094D, 0x094D}, {0x0951, 0x0954}, {0x0962, 0x0963},
1836 {0x0981, 0x0981}, {0x09BC, 0x09BC}, {0x09C1, 0x09C4}, {0x09CD, 0x09CD},
1837 {0x09E2, 0x09E3}, {0x0A02, 0x0A02}, {0x0A3C, 0x0A3C}, {0x0A41, 0x0A42},
1838 {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A70, 0x0A71}, {0x0A81, 0x0A82},
1839 {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC5}, {0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD},
1840 {0x0B01, 0x0B01}, {0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43},
1841 {0x0B4D, 0x0B4D}, {0x0B56, 0x0B56}, {0x0B82, 0x0B82}, {0x0BC0, 0x0BC0},
1842 {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40}, {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D},
1843 {0x0C55, 0x0C56}, {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
1844 {0x0D41, 0x0D43}, {0x0D4D, 0x0D4D}, {0x0DCA, 0x0DCA}, {0x0DD2, 0x0DD4},
1845 {0x0DD6, 0x0DD6}, {0x0E31, 0x0E31}, {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E},
1846 {0x0EB1, 0x0EB1}, {0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC}, {0x0EC8, 0x0ECD},
1847 {0x0F18, 0x0F19}, {0x0F35, 0x0F35}, {0x0F37, 0x0F37}, {0x0F39, 0x0F39},
1848 {0x0F71, 0x0F7E}, {0x0F80, 0x0F84}, {0x0F86, 0x0F87}, {0x0F90, 0x0F97},
1849 {0x0F99, 0x0FBC}, {0x0FC6, 0x0FC6}, {0x102D, 0x1030}, {0x1032, 0x1032},
1850 {0x1036, 0x1037}, {0x1039, 0x1039}, {0x1058, 0x1059}, {0x17B7, 0x17BD},
1851 {0x17C6, 0x17C6}, {0x17C9, 0x17D3}, {0x18A9, 0x18A9}, {0x20D0, 0x20E3},
1852 {0x302A, 0x302F}, {0x3099, 0x309A}, {0xFB1E, 0xFB1E}, {0xFE20, 0xFE23}};
1854 /* test for 8-bit control characters */
1855 if (ucs
== 0) return 0;
1856 if (ucs
< 32 || (ucs
>= 0x7f && ucs
< 0xa0)) return -1;
1858 /* first quick check for Latin-1 etc. characters */
1859 if (ucs
< combining
[0].first
) return 1;
1861 /* binary search in table of non-spacing characters */
1863 if (BinarySearchIf(combining
, 0, ArrayLength(combining
),
1864 CombiningComparator(ucs
), &idx
)) {
1868 /* if we arrive here, ucs is not a combining or C0/C1 control character */
1870 /* fast test for majority of non-wide scripts */
1871 if (ucs
< 0x1100) return 1;
1874 ((ucs
>= 0x1100 && ucs
<= 0x115f) || /* Hangul Jamo */
// Masking with ~0x0011 clears bits 0 and 4, so the test below excludes
// exactly U+300A, U+300B, U+301A and U+301B from this range.
1875 (ucs
>= 0x2e80 && ucs
<= 0xa4cf && (ucs
& ~0x0011) != 0x300a &&
1876 ucs
!= 0x303f) || /* CJK ... Yi */
1877 (ucs
>= 0xac00 && ucs
<= 0xd7a3) || /* Hangul Syllables */
1878 (ucs
>= 0xf900 && ucs
<= 0xfaff) || /* CJK Compatibility Ideographs */
1879 (ucs
>= 0xfe30 && ucs
<= 0xfe6f) || /* CJK Compatibility Forms */
1880 (ucs
>= 0xff00 && ucs
<= 0xff5f) || /* Fullwidth Forms */
1881 (ucs
>= 0xffe0 && ucs
<= 0xffe6));
1884 int32_t GetUnicharStringWidth(const char16_t
* pwcs
, int32_t n
) {
1885 int32_t w
, width
= 0;
1887 for (; *pwcs
&& n
-- > 0; pwcs
++)
1888 if ((w
= GetUnicharWidth(*pwcs
)) < 0)
1889 ++width
; // Taking 1 as the width of non-printable character, for bug#