1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=2 sw=2 et tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
 * nsIContentSerializer implementation that can be used with an
 * nsIDocumentEncoder to convert an HTML (not XHTML!) DOM to an HTML
 * string that could be parsed into more or less the original DOM.
 */
13 #include "nsHTMLContentSerializer.h"
15 #include "nsIDOMElement.h"
16 #include "nsIContent.h"
17 #include "nsIDocument.h"
18 #include "nsNameSpaceManager.h"
20 #include "nsUnicharUtils.h"
21 #include "nsXPIDLString.h"
22 #include "nsIServiceManager.h"
23 #include "nsIDocumentEncoder.h"
24 #include "nsGkAtoms.h"
26 #include "nsNetUtil.h"
28 #include "nsITextToSubURI.h"
30 #include "nsIParserService.h"
31 #include "nsContentUtils.h"
32 #include "nsLWBrkCIID.h"
33 #include "nsIScriptElement.h"
34 #include "nsAttrName.h"
35 #include "nsIDocShell.h"
36 #include "nsIEditor.h"
37 #include "nsIHTMLEditor.h"
38 #include "mozilla/dom/Element.h"
39 #include "nsParserConstants.h"
41 using namespace mozilla::dom
;
43 nsresult
NS_NewHTMLContentSerializer(nsIContentSerializer
** aSerializer
)
45 nsHTMLContentSerializer
* it
= new nsHTMLContentSerializer();
47 return NS_ERROR_OUT_OF_MEMORY
;
50 return CallQueryInterface(it
, aSerializer
);
53 nsHTMLContentSerializer::nsHTMLContentSerializer()
55 mIsHTMLSerializer
= true;
// Destructor.
// NOTE(review): the body is not visible in this listing — confirm against
// the full file.
58 nsHTMLContentSerializer::~nsHTMLContentSerializer()
// Start of the AppendDocumentStart() definition; takes the document being
// serialized as its first parameter.
// NOTE(review): only the first parameter is visible in this listing; the
// remaining parameters and the body are not shown — confirm against the
// full file before editing.
64 nsHTMLContentSerializer::AppendDocumentStart(nsIDocument
*aDocument
,
// Serializes the attributes of aContent into aStr.
//
// Visible behavior: reads the attribute count, walks attribute indices
// starting from `count`, and for each attribute fetches its name, namespace
// and value, applies the editor / link / <meta> special cases commented
// below, and finally passes (prefix, name, value) to SerializeAttr().
//
// NOTE(review): this listing is lossy. aTagName, aNamespace and aStr are
// used below but their parameter declarations are not visible, and several
// local declarations (rv, absURI, header, prefix) and branch bodies are
// missing as well. Confirm against the full file before changing logic.
71 nsHTMLContentSerializer::SerializeHTMLAttributes(nsIContent
* aContent
,
72 nsIContent
*aOriginalElement
,
73 nsAString
& aTagPrefix
,
74 const nsAString
& aTagNamespaceURI
,
// Number of attributes on the element.
79 int32_t count
= aContent
->GetAttrCount();
// Buffer that receives each attribute's value in turn.
84 nsAutoString valueStr
;
85 NS_NAMED_LITERAL_STRING(_mozStr
, "_moz");
// The loop header has no step expression, so the index has to be adjusted
// inside the body (that statement is not visible in this listing).
87 for (int32_t index
= count
; index
> 0;) {
// Split the attribute's name into a namespace ID and a local-name atom.
89 const nsAttrName
* name
= aContent
->GetAttrNameAt(index
);
90 int32_t namespaceID
= name
->NamespaceID();
91 nsIAtom
* attrName
= name
->LocalName();
93 // Filter out any attribute starting with [-|_]moz
94 nsDependentAtomString
attrNameStr(attrName
);
95 if (StringBeginsWith(attrNameStr
, NS_LITERAL_STRING("_moz")) ||
96 StringBeginsWith(attrNameStr
, NS_LITERAL_STRING("-moz"))) {
// Fetch the attribute's current value into valueStr.
99 aContent
->GetAttr(namespaceID
, attrName
, valueStr
);
102 // Filter out special case of <br type="_moz"> or <br _moz*>,
103 // used by the editor. Bug 16988. Yuck.
105 if (aTagName
== nsGkAtoms::br
&& aNamespace
== kNameSpaceID_XHTML
&&
106 attrName
== nsGkAtoms::type
&& namespaceID
== kNameSpaceID_None
&&
107 StringBeginsWith(valueStr
, _mozStr
)) {
// While copying, the un-namespaced value attribute of an XHTML <li> is
// special-cased when mIsFirstChildOfOL is set.
111 if (mIsCopying
&& mIsFirstChildOfOL
&&
112 aTagName
== nsGkAtoms::li
&& aNamespace
== kNameSpaceID_XHTML
&&
113 attrName
== nsGkAtoms::value
&& namespaceID
== kNameSpaceID_None
){
114 // This is handled separately in SerializeLIValueAttribute()
// isJS gates the URI escaping below and is passed, negated, as the last
// SerializeAttr() argument.
117 bool isJS
= IsJavaScript(aContent
, attrName
, namespaceID
, valueStr
);
// href (no namespace or XLink) and src (no namespace) get the link
// handling below.
119 if (((attrName
== nsGkAtoms::href
&&
120 (namespaceID
== kNameSpaceID_None
||
121 namespaceID
== kNameSpaceID_XLink
)) ||
122 (attrName
== nsGkAtoms::src
&& namespaceID
== kNameSpaceID_None
))) {
123 // Make all links absolute when converting only the selection:
124 if (mFlags
& nsIDocumentEncoder::OutputAbsoluteLinks
) {
125 // Would be nice to handle OBJECT and APPLET tags,
126 // but that gets more complicated since we have to
127 // search the tag list for CODEBASE as well.
128 // For now, just leave them relative.
// Resolve the value against the element's base URI.
129 nsCOMPtr
<nsIURI
> uri
= aContent
->GetBaseURI();
132 rv
= NS_MakeAbsoluteURI(absURI
, valueStr
, uri
);
133 if (NS_SUCCEEDED(rv
)) {
138 // Need to escape URI.
// EscapeURI() is called with a copy of the value plus valueStr itself;
// the call is skipped for JavaScript values.
139 nsAutoString
tempURI(valueStr
);
140 if (!isJS
&& NS_FAILED(EscapeURI(aContent
, tempURI
, valueStr
)))
144 if (mRewriteEncodingDeclaration
&& aTagName
== nsGkAtoms::meta
&&
145 aNamespace
== kNameSpaceID_XHTML
&& attrName
== nsGkAtoms::content
146 && namespaceID
== kNameSpaceID_None
) {
147 // If we're serializing a <meta http-equiv="content-type">,
148 // use the proper value, rather than what's in the document.
150 aContent
->GetAttr(kNameSpaceID_None
, nsGkAtoms::httpEquiv
, header
);
151 if (header
.LowerCaseEqualsLiteral("content-type")) {
152 valueStr
= NS_LITERAL_STRING("text/html; charset=") +
153 NS_ConvertASCIItoUTF16(mCharset
);
// Attribute name as a string, as passed to SerializeAttr().
157 nsDependentAtomString
nameStr(attrName
);
// Only the XML and XLink namespaces get an explicit prefix here.
159 if (namespaceID
== kNameSpaceID_XML
) {
160 prefix
.AssignLiteral(MOZ_UTF16("xml"));
161 } else if (namespaceID
== kNameSpaceID_XLink
) {
162 prefix
.AssignLiteral(MOZ_UTF16("xlink"));
165 // Expand shorthand attribute.
166 if (aNamespace
== kNameSpaceID_XHTML
&&
167 namespaceID
== kNameSpaceID_None
&&
168 IsShorthandAttr(attrName
, aTagName
) &&
169 valueStr
.IsEmpty()) {
// Emit the attribute; the last argument is false for JavaScript values.
172 SerializeAttr(prefix
, nameStr
, valueStr
, aStr
, !isJS
);
// Writes the start tag of aElement into aStr.
//
// Visible steps, in order: argument check, CheckElementStart(), optional
// leading newline / indentation / space, kLessThan plus the tag name,
// MaybeEnterInPreContent(), indentation bookkeeping, OL/LI bookkeeping while
// copying, the attributes via SerializeHTMLAttributes(), kGreaterThan, the
// entity-encoding disable counter, an optional trailing newline, and
// AfterElementStart().
//
// NOTE(review): this listing is lossy. aStr is used but its parameter
// declaration is not visible, and some declarations (start, rv), branch
// bodies and return statements are missing. Confirm against the full file
// before changing logic.
177 nsHTMLContentSerializer::AppendElementStart(Element
* aElement
,
178 Element
* aOriginalElement
,
// Argument check on aElement.
181 NS_ENSURE_ARG(aElement
);
183 nsIContent
* content
= aElement
;
// forceFormat starts out false and is handed to CheckElementStart();
// presumably that call can set it (its signature is not visible here).
185 bool forceFormat
= false;
186 if (!CheckElementStart(content
, forceFormat
, aStr
)) {
// Tag atom and namespace ID drive the special cases below.
190 nsIAtom
*name
= content
->Tag();
191 int32_t ns
= content
->GetNameSpaceID();
193 bool lineBreakBeforeOpen
= LineBreakBeforeOpen(ns
, name
);
// Formatting path: taken when formatting is on (or forced), output is not
// raw, and PreLevel() is zero.
195 if ((mDoFormat
|| forceFormat
) && !mDoRaw
&& !PreLevel()) {
196 if (mColPos
&& lineBreakBeforeOpen
) {
197 AppendNewLineToString(aStr
);
200 MaybeAddNewlineForRootNode(aStr
);
203 AppendIndentation(aStr
);
205 else if (mAddSpace
) {
206 AppendToString(char16_t(' '), aStr
);
210 else if (mAddSpace
) {
211 AppendToString(char16_t(' '), aStr
);
215 MaybeAddNewlineForRootNode(aStr
);
217 // Always reset to avoid false newlines in case MaybeAddNewlineForRootNode wasn't called
219 mAddNewlineForRootNode
= false;
// Emit kLessThan followed by the tag name.
221 AppendToString(kLessThan
, aStr
);
223 AppendToString(nsDependentAtomString(name
), aStr
);
225 MaybeEnterInPreContent(content
);
227 // for block elements, we increase the indentation
228 if ((mDoFormat
|| forceFormat
) && !mDoRaw
&& !PreLevel())
229 IncrIndentation(name
);
231 // Need to keep track of OL and LI elements in order to get ordinal number
233 if (mIsCopying
&& name
== nsGkAtoms::ol
&& ns
== kNameSpaceID_XHTML
){
234 // We are copying and current node is an OL;
235 // Store its start attribute value in olState->startVal.
237 int32_t startAttrVal
= 0;
239 aElement
->GetAttr(kNameSpaceID_None
, nsGkAtoms::start
, start
);
240 if (!start
.IsEmpty()){
242 startAttrVal
= start
.ToInteger(&rv
);
243 //If OL has "start" attribute, first LI element has to start with that value
244 //Therefore subtracting 1 as all the LI elements are incrementing it before using it;
245 //In failure of ToInteger(), default StartAttrValue to 0.
246 if (NS_SUCCEEDED(rv
))
// Push the state for this <ol>; AppendElementEnd() removes the last entry
// again when it sees the matching <ol> end.
251 mOLStateStack
.AppendElement(olState(startAttrVal
, true));
254 if (mIsCopying
&& name
== nsGkAtoms::li
&& ns
== kNameSpaceID_XHTML
) {
255 mIsFirstChildOfOL
= IsFirstChildOfOL(aOriginalElement
);
256 if (mIsFirstChildOfOL
){
257 // If OL is parent of this LI, serialize attributes in different manner.
258 SerializeLIValueAttribute(aElement
, aStr
);
262 // Even LI passed above have to go through this
263 // for serializing attributes other than "value".
264 nsAutoString dummyPrefix
;
265 SerializeHTMLAttributes(content
,
// Close the start tag.
273 AppendToString(kGreaterThan
, aStr
);
// Entering script, style, noscript or noframes bumps the counter that makes
// AppendAndTranslateEntities() append text verbatim; AppendElementEnd()
// decrements it again for the same four tags.
275 if (ns
== kNameSpaceID_XHTML
&&
276 (name
== nsGkAtoms::script
||
277 name
== nsGkAtoms::style
||
278 name
== nsGkAtoms::noscript
||
279 name
== nsGkAtoms::noframes
)) {
280 ++mDisableEntityEncoding
;
// Optional newline after the start tag, under the same formatting
// conditions as above.
283 if ((mDoFormat
|| forceFormat
) && !mDoRaw
&& !PreLevel() &&
284 LineBreakAfterOpen(ns
, name
)) {
285 AppendNewLineToString(aStr
);
288 AfterElementStart(content
, aOriginalElement
, aStr
);
// Writes the end tag of aElement into aStr.
//
// Visible steps, in order: argument check, decrement of the entity-encoding
// disable counter for script/style/noscript/noframes, indentation
// bookkeeping, the malformed-script and <ol> special cases, a container
// check through the parser service, optional leading newline / indentation /
// space, kEndTag + tag name + kGreaterThan, MaybeLeaveFromPreContent(), an
// optional trailing newline, and MaybeFlagNewlineForRootNode().
//
// NOTE(review): this listing is lossy. aStr is used but its parameter
// declaration is not visible, and several branch bodies and return
// statements are missing. Confirm against the full file before changing
// logic.
294 nsHTMLContentSerializer::AppendElementEnd(Element
* aElement
,
// Argument check on aElement.
297 NS_ENSURE_ARG(aElement
);
299 nsIContent
* content
= aElement
;
301 nsIAtom
*name
= content
->Tag();
302 int32_t ns
= content
->GetNameSpaceID();
// Mirror of the increment in AppendElementStart() for the same four tags.
304 if (ns
== kNameSpaceID_XHTML
&&
305 (name
== nsGkAtoms::script
||
306 name
== nsGkAtoms::style
||
307 name
== nsGkAtoms::noscript
||
308 name
== nsGkAtoms::noframes
)) {
309 --mDisableEntityEncoding
;
// Formatting is forced for elements carrying the mozdirty attribute unless
// the OutputIgnoreMozDirty flag is set.
312 bool forceFormat
= !(mFlags
& nsIDocumentEncoder::OutputIgnoreMozDirty
) &&
313 content
->HasAttr(kNameSpaceID_None
, nsGkAtoms::mozdirty
);
315 if ((mDoFormat
|| forceFormat
) && !mDoRaw
&& !PreLevel()) {
316 DecrIndentation(name
);
319 if (name
== nsGkAtoms::script
) {
320 nsCOMPtr
<nsIScriptElement
> script
= do_QueryInterface(aElement
);
322 if (ShouldMaintainPreLevel() && script
&& script
->IsMalformed()) {
323 // We're looking at a malformed script tag. This means that the end tag
324 // was missing in the source. Imitate that here by not serializing the end tag.
330 else if (mIsCopying
&& name
== nsGkAtoms::ol
&& ns
== kNameSpaceID_XHTML
) {
331 NS_ASSERTION((!mOLStateStack
.IsEmpty()), "Cannot have an empty OL Stack");
332 /* Though at this point we must always have a state to be deleted as all
333 the OL opening tags are supposed to push an olState object to the stack*/
334 if (!mOLStateStack
.IsEmpty()) {
335 mOLStateStack
.RemoveElementAt(mOLStateStack
.Length() -1);
// NOTE(review): the condition built around IsContainer() is only partly
// visible in this listing.
339 if (ns
== kNameSpaceID_XHTML
) {
340 nsIParserService
* parserService
= nsContentUtils::GetParserService();
346 IsContainer(parserService
->HTMLCaseSensitiveAtomTagToId(name
),
// Formatting path: optional newline and indentation before the end tag.
354 if ((mDoFormat
|| forceFormat
) && !mDoRaw
&& !PreLevel()) {
356 bool lineBreakBeforeClose
= LineBreakBeforeClose(ns
, name
);
358 if (mColPos
&& lineBreakBeforeClose
) {
359 AppendNewLineToString(aStr
);
362 AppendIndentation(aStr
);
364 else if (mAddSpace
) {
365 AppendToString(char16_t(' '), aStr
);
369 else if (mAddSpace
) {
370 AppendToString(char16_t(' '), aStr
);
// Emit kEndTag, the tag name and kGreaterThan.
374 AppendToString(kEndTag
, aStr
);
375 AppendToString(nsDependentAtomString(name
), aStr
);
376 AppendToString(kGreaterThan
, aStr
);
378 MaybeLeaveFromPreContent(content
);
// Optional newline after the end tag.
380 if ((mDoFormat
|| forceFormat
)&& !mDoRaw
&& !PreLevel()
381 && LineBreakAfterClose(ns
, name
)) {
382 AppendNewLineToString(aStr
);
385 MaybeFlagNewlineForRootNode(aElement
);
// NOTE(review): the body of this XHTML <body> check is not visible in this
// listing.
388 if (name
== nsGkAtoms::body
&& ns
== kNameSpaceID_XHTML
) {
// Highest code point covered by the lookup tables below (160, U+00A0, the
// no-break space). Lookups are guarded with `val <= kValNBSP`, so every
// table must hold exactly kValNBSP + 1 entries (indices 0..160).
static const uint16_t kValNBSP = 160;

// Entity replacements for text in element content, indexed by code point.
// A nullptr entry means the character is emitted unchanged. Each non-null
// entry is a complete entity reference (leading '&', trailing ';') because
// the serializer appends table entries verbatim.
static const char* kEntities[] = {
  // 0 - 9
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 10 - 19
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 20 - 29
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 30 - 39 (38 is '&')
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "&amp;", nullptr,
  // 40 - 49
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 50 - 59
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 60 - 69 (60 is '<', 62 is '>')
  "&lt;", nullptr, "&gt;", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 70 - 79
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 80 - 89
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 90 - 99
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 100 - 109
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 110 - 119
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 120 - 129
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 130 - 139
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 140 - 149
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 150 - 159
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 160 (kValNBSP)
  "&nbsp;"
};
static_assert(sizeof(kEntities) / sizeof(kEntities[0]) == kValNBSP + 1,
              "kEntities must cover code points 0..kValNBSP");

// Entity replacements for text inside attribute values, indexed by code
// point. Same layout as kEntities, plus an entry for the double quote (34)
// so that a quoted attribute value cannot be terminated early.
static const char* kAttrEntities[] = {
  // 0 - 9
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 10 - 19
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 20 - 29
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 30 - 39 (34 is '"', 38 is '&')
  nullptr, nullptr, nullptr, nullptr, "&quot;", nullptr, nullptr, nullptr, "&amp;", nullptr,
  // 40 - 49
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 50 - 59
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 60 - 69 (60 is '<', 62 is '>')
  "&lt;", nullptr, "&gt;", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 70 - 79
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 80 - 89
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 90 - 99
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 100 - 109
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 110 - 119
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 120 - 129
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 130 - 139
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 140 - 149
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 150 - 159
  nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
  // 160 (kValNBSP)
  "&nbsp;"
};
static_assert(sizeof(kAttrEntities) / sizeof(kAttrEntities[0]) == kValNBSP + 1,
              "kAttrEntities must cover code points 0..kValNBSP");
436 uint32_t FindNextBasicEntity(const nsAString
& aStr
,
439 const char** aEntityTable
,
440 const char** aEntity
)
442 for (; aIndex
< aLen
; ++aIndex
) {
443 // for each character in this chunk, check if it
444 // needs to be replaced
445 char16_t val
= aStr
[aIndex
];
446 if (val
<= kValNBSP
&& aEntityTable
[val
]) {
447 *aEntity
= aEntityTable
[val
];
// Appends aStr to aOutputStr, replacing characters with entity references
// as selected by mFlags.
//
// Visible paths:
//  - mDisableEntityEncoding is non-zero: aStr is appended verbatim;
//  - OutputEncodeBasicEntities without any Latin1/HTML/W3C entity flag:
//    characters with an entry in kEntities (or kAttrEntities when
//    mInAttribute is set) are replaced, located via FindNextBasicEntity();
//  - any Latin1/HTML/W3C entity flag: the same tables are consulted first,
//    then the parser service or mEntityConverter for characters above 127;
//  - the last visible statement defers to
//    nsXMLContentSerializer::AppendAndTranslateEntities().
//
// NOTE(review): this listing is lossy. Several declarations (start, val),
// branch bodies, returns and closing braces are missing. Confirm against the
// full file before changing logic.
455 nsHTMLContentSerializer::AppendAndTranslateEntities(const nsAString
& aStr
,
456 nsAString
& aOutputStr
)
458 if (mBodyOnly
&& !mInBody
) {
// Entity encoding is switched off (counter maintained by
// AppendElementStart() / AppendElementEnd()): copy the text through.
462 if (mDisableEntityEncoding
) {
463 aOutputStr
.Append(aStr
);
// True when any of the Latin1 / HTML / W3C entity flags is set.
467 bool nonBasicEntities
=
468 !!(mFlags
& (nsIDocumentEncoder::OutputEncodeLatin1Entities
|
469 nsIDocumentEncoder::OutputEncodeHTMLEntities
|
470 nsIDocumentEncoder::OutputEncodeW3CEntities
));
// Basic-entities-only path.
472 if (!nonBasicEntities
&&
473 (mFlags
& (nsIDocumentEncoder::OutputEncodeBasicEntities
))) {
// Attribute values use the table that also has an entry for '"'.
474 const char **entityTable
= mInAttribute
? kAttrEntities
: kEntities
;
476 const uint32_t len
= aStr
.Length();
477 for (uint32_t i
= 0; i
< len
; ++i
) {
478 const char* entity
= nullptr;
// Jump to the next character that has a table entry (or to len).
479 i
= FindNextBasicEntity(aStr
, len
, i
, entityTable
, &entity
);
// Length of the run of text between `start` and the character found.
480 uint32_t normalTextLen
= i
- start
;
482 aOutputStr
.Append(Substring(aStr
, start
, normalTextLen
));
// Table entries are appended as-is, with no '&' or ';' added here.
485 aOutputStr
.AppendASCII(entity
);
// Path taken when any Latin1 / HTML / W3C entity flag is set.
490 } else if (nonBasicEntities
) {
491 nsIParserService
* parserService
= nsContentUtils::GetParserService();
493 if (!parserService
) {
494 NS_ERROR("Can't get parser service");
498 nsReadingIterator
<char16_t
> done_reading
;
499 aStr
.EndReading(done_reading
);
501 // for each chunk of |aString|...
502 uint32_t advanceLength
= 0;
503 nsReadingIterator
<char16_t
> iter
;
505 const char **entityTable
= mInAttribute
? kAttrEntities
: kEntities
;
// Buffer that receives the parser service's entity name for a character.
506 nsAutoCString entityReplacement
;
508 for (aStr
.BeginReading(iter
);
509 iter
!= done_reading
;
510 iter
.advance(int32_t(advanceLength
))) {
511 uint32_t fragmentLength
= iter
.size_forward();
512 uint32_t lengthReplaced
= 0; // the number of UTF-16 codepoints
513 // replaced by a particular entity
514 const char16_t
* c
= iter
.get();
515 const char16_t
* fragmentStart
= c
;
516 const char16_t
* fragmentEnd
= c
+ fragmentLength
;
// Three kinds of replacement, told apart when appending below:
//  entityText          - appended wrapped in '&' and ';';
//  fullConstEntityText - table entry, appended as-is;
//  fullEntityText      - from mEntityConverter, appended as-is and then
//                        released with nsMemory::Free().
517 const char* entityText
= nullptr;
518 const char* fullConstEntityText
= nullptr;
519 char* fullEntityText
= nullptr;
522 // for each character in this chunk, check if it
523 // needs to be replaced
524 for (; c
< fragmentEnd
; c
++, advanceLength
++) {
// First choice: the basic table for this context.
526 if (val
<= kValNBSP
&& entityTable
[val
]) {
527 fullConstEntityText
= entityTable
[val
];
// Characters above 127 with the Latin1 or HTML entity flag set: ask the
// parser service for an entity name.
529 } else if (val
> 127 &&
531 mFlags
& nsIDocumentEncoder::OutputEncodeLatin1Entities
) ||
532 mFlags
& nsIDocumentEncoder::OutputEncodeHTMLEntities
)) {
533 entityReplacement
.Truncate();
534 parserService
->HTMLConvertUnicodeToEntity(val
, entityReplacement
);
536 if (!entityReplacement
.IsEmpty()) {
537 entityText
= entityReplacement
.get();
// Characters above 127 with the W3C entity flag set: use mEntityConverter.
541 else if (val
> 127 &&
542 mFlags
& nsIDocumentEncoder::OutputEncodeW3CEntities
&&
// A high surrogate followed by a low surrogate inside this fragment is
// combined into one UTF-32 value; note that `c` is advanced by the ++c.
544 if (NS_IS_HIGH_SURROGATE(val
) &&
545 c
+ 1 < fragmentEnd
&&
546 NS_IS_LOW_SURROGATE(*(c
+ 1))) {
547 uint32_t valUTF32
= SURROGATE_TO_UCS4(val
, *(++c
));
548 if (NS_SUCCEEDED(mEntityConverter
->ConvertUTF32ToEntity(valUTF32
,
549 nsIEntityConverter::entityW3C
, &fullEntityText
))) {
557 else if (NS_SUCCEEDED(mEntityConverter
->ConvertToEntity(val
,
558 nsIEntityConverter::entityW3C
,
// Flush the unmodified text scanned so far in this fragment.
566 aOutputStr
.Append(fragmentStart
, advanceLength
);
568 aOutputStr
.Append(char16_t('&'));
569 AppendASCIItoUTF16(entityText
, aOutputStr
);
570 aOutputStr
.Append(char16_t(';'));
573 else if (fullConstEntityText
) {
574 aOutputStr
.AppendASCII(fullConstEntityText
);
577 // if it comes from nsIEntityConverter, it already has '&' and ';'
578 else if (fullEntityText
) {
579 AppendASCIItoUTF16(fullEntityText
, aOutputStr
);
580 nsMemory::Free(fullEntityText
);
// Also step over the source characters that the entity replaced.
581 advanceLength
+= lengthReplaced
;
// Hand the text to the base-class implementation.
585 nsXMLContentSerializer::AppendAndTranslateEntities(aStr
, aOutputStr
);