Bumping manifests a=b2g-bump
[gecko.git] / parser / html / nsHtml5StreamParser.cpp
blob9c58ef10e025a084a8af834d889516770afba75f
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set sw=2 ts=2 et tw=79: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #include "mozilla/DebugOnly.h"
9 #include "nsHtml5StreamParser.h"
10 #include "nsContentUtils.h"
11 #include "nsHtml5Tokenizer.h"
12 #include "nsIHttpChannel.h"
13 #include "nsHtml5Parser.h"
14 #include "nsHtml5TreeBuilder.h"
15 #include "nsHtml5AtomTable.h"
16 #include "nsHtml5Module.h"
17 #include "nsHtml5RefPtr.h"
18 #include "nsIScriptError.h"
19 #include "mozilla/Preferences.h"
20 #include "nsHtml5Highlighter.h"
21 #include "expat_config.h"
22 #include "expat.h"
23 #include "nsINestedURI.h"
24 #include "nsCharsetSource.h"
25 #include "nsIWyciwygChannel.h"
26 #include "nsIThreadRetargetableRequest.h"
27 #include "nsPrintfCString.h"
28 #include "nsNetUtil.h"
29 #include "nsXULAppAPI.h"
31 #include "mozilla/dom/EncodingUtils.h"
33 using namespace mozilla;
34 using mozilla::dom::EncodingUtils;
// Default values for the flush timer delays. InitializeStatics() hands both
// variables to Preferences::AddIntVarCache for the
// "html5.flushtimer.initialdelay" and "html5.flushtimer.subsequentdelay"
// preferences.
// NOTE(review): the unit is presumably milliseconds -- confirm against the
// code that arms the flush timer.
36 int32_t nsHtml5StreamParser::sTimerInitialDelay = 120;
37 int32_t nsHtml5StreamParser::sTimerSubsequentDelay = 120;
39 // static
40 void
41 nsHtml5StreamParser::InitializeStatics()
43 Preferences::AddIntVarCache(&sTimerInitialDelay,
44 "html5.flushtimer.initialdelay");
45 Preferences::AddIntVarCache(&sTimerSubsequentDelay,
46 "html5.flushtimer.subsequentdelay");
50 * Note that nsHtml5StreamParser implements cycle collecting AddRef and
51 * Release. Therefore, nsHtml5StreamParser must never be refcounted from
52 * the parser thread!
54 * To work around this limitation, runnables posted by the main thread to the
55 * parser thread hold their reference to the stream parser in an
56 * nsHtml5RefPtr. Upon creation, nsHtml5RefPtr addrefs the object it holds
57 * just like a regular nsRefPtr. This is OK, since the creation of the
58 * runnable and the nsHtml5RefPtr happens on the main thread.
60 * When the runnable is done on the parser thread, the destructor of
61 * nsHtml5RefPtr runs there. It doesn't call Release on the held object
62 * directly. Instead, it posts another runnable back to the main thread where
63 * that runnable calls Release on the wrapped object.
65 * When posting runnables in the other direction, the runnables have to be
66 * created on the main thread when nsHtml5StreamParser is instantiated and
67 * held for the lifetime of the nsHtml5StreamParser. This works, because the
68 * same runnable can be dispatched multiple times and currently runnables
69 * posted from the parser thread to main thread don't need to wrap any
70 * runnable-specific data. (In the other direction, the runnables most notably
71 * wrap the byte data of the stream.)
// Reference counting for nsHtml5StreamParser uses the cycle-collecting
// AddRef/Release implementations.
73 NS_IMPL_CYCLE_COLLECTING_ADDREF(nsHtml5StreamParser)
74 NS_IMPL_CYCLE_COLLECTING_RELEASE(nsHtml5StreamParser)
// Interface table: nsICharsetDetectionObserver is the only interface listed
// before handing over to the cycle collection entries.
76 NS_INTERFACE_TABLE_HEAD(nsHtml5StreamParser)
77 NS_INTERFACE_TABLE(nsHtml5StreamParser,
78 nsICharsetDetectionObserver)
79 NS_INTERFACE_TABLE_TO_MAP_SEGUE_CYCLE_COLLECTION(nsHtml5StreamParser)
80 NS_INTERFACE_MAP_END
82 NS_IMPL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)
// Unlink: drop the flush timer first, then let go of the observer, request,
// owner, both flusher runnables, the executor and the charset detector.
84 NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsHtml5StreamParser)
85 tmp->DropTimer();
86 NS_IMPL_CYCLE_COLLECTION_UNLINK(mObserver)
87 NS_IMPL_CYCLE_COLLECTION_UNLINK(mRequest)
88 NS_IMPL_CYCLE_COLLECTION_UNLINK(mOwner)
89 tmp->mExecutorFlusher = nullptr;
90 tmp->mLoadFlusher = nullptr;
91 tmp->mExecutor = nullptr;
92 NS_IMPL_CYCLE_COLLECTION_UNLINK(mChardet)
93 NS_IMPL_CYCLE_COLLECTION_UNLINK_END
// Traverse: besides the directly held members, the executor is reported once
// per existing flusher runnable (each runnable holds its own reference to
// it), and this object is reported as a child of itself when mChardet exists.
95 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsHtml5StreamParser)
96 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mObserver)
97 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mRequest)
98 NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mOwner)
99 // hack: count the strongly owned edge wrapped in the runnable
100 if (tmp->mExecutorFlusher) {
101 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mExecutorFlusher->mExecutor");
102 cb.NoteXPCOMChild(static_cast<nsIContentSink*> (tmp->mExecutor));
104 // hack: count the strongly owned edge wrapped in the runnable
105 if (tmp->mLoadFlusher) {
106 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mLoadFlusher->mExecutor");
107 cb.NoteXPCOMChild(static_cast<nsIContentSink*> (tmp->mExecutor));
109 // hack: count self if held by mChardet
110 if (tmp->mChardet) {
111 NS_CYCLE_COLLECTION_NOTE_EDGE_NAME(cb, "mChardet->mObserver");
112 cb.NoteXPCOMChild(static_cast<nsICharsetDetectionObserver*>(tmp));
114 NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
116 class nsHtml5ExecutorFlusher : public nsRunnable
118 private:
119 nsRefPtr<nsHtml5TreeOpExecutor> mExecutor;
120 public:
121 explicit nsHtml5ExecutorFlusher(nsHtml5TreeOpExecutor* aExecutor)
122 : mExecutor(aExecutor)
124 NS_IMETHODIMP Run()
126 if (!mExecutor->isInList()) {
127 mExecutor->RunFlushLoop();
129 return NS_OK;
133 class nsHtml5LoadFlusher : public nsRunnable
135 private:
136 nsRefPtr<nsHtml5TreeOpExecutor> mExecutor;
137 public:
138 explicit nsHtml5LoadFlusher(nsHtml5TreeOpExecutor* aExecutor)
139 : mExecutor(aExecutor)
141 NS_IMETHODIMP Run()
143 mExecutor->FlushSpeculativeLoads();
144 return NS_OK;
148 nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
149 nsHtml5Parser* aOwner,
150 eParserMode aMode)
151 : mFirstBuffer(nullptr) // Will be filled when starting
152 , mLastBuffer(nullptr) // Will be filled when starting
153 , mExecutor(aExecutor)
154 , mTreeBuilder(new nsHtml5TreeBuilder((aMode == VIEW_SOURCE_HTML ||
155 aMode == VIEW_SOURCE_XML) ?
156 nullptr : mExecutor->GetStage(),
157 aMode == NORMAL ?
158 mExecutor->GetStage() : nullptr))
159 , mTokenizer(new nsHtml5Tokenizer(mTreeBuilder, aMode == VIEW_SOURCE_XML))
160 , mTokenizerMutex("nsHtml5StreamParser mTokenizerMutex")
161 , mOwner(aOwner)
162 , mSpeculationMutex("nsHtml5StreamParser mSpeculationMutex")
163 , mTerminatedMutex("nsHtml5StreamParser mTerminatedMutex")
164 , mThread(nsHtml5Module::GetStreamParserThread())
165 , mExecutorFlusher(new nsHtml5ExecutorFlusher(aExecutor))
166 , mLoadFlusher(new nsHtml5LoadFlusher(aExecutor))
167 , mFlushTimer(do_CreateInstance("@mozilla.org/timer;1"))
168 , mMode(aMode)
170 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
171 mFlushTimer->SetTarget(mThread);
172 #ifdef DEBUG
173 mAtomTable.SetPermittedLookupThread(mThread);
174 #endif
175 mTokenizer->setInterner(&mAtomTable);
176 mTokenizer->setEncodingDeclarationHandler(this);
178 if (aMode == VIEW_SOURCE_HTML || aMode == VIEW_SOURCE_XML) {
179 nsHtml5Highlighter* highlighter =
180 new nsHtml5Highlighter(mExecutor->GetStage());
181 mTokenizer->EnableViewSource(highlighter); // takes ownership
182 mTreeBuilder->EnableViewSource(highlighter); // doesn't own
185 // Chardet instantiation adapted from File.
186 // Chardet is initialized here even if it turns out to be useless
187 // to make the chardet refcount its observer (nsHtml5StreamParser)
188 // on the main thread.
189 const nsAdoptingCString& detectorName =
190 Preferences::GetLocalizedCString("intl.charset.detector");
191 if (!detectorName.IsEmpty()) {
192 nsAutoCString detectorContractID;
193 detectorContractID.AssignLiteral(NS_CHARSET_DETECTOR_CONTRACTID_BASE);
194 detectorContractID += detectorName;
195 if ((mChardet = do_CreateInstance(detectorContractID.get()))) {
196 (void) mChardet->Init(this);
197 mFeedChardet = true;
201 // There's a zeroing operator new for everything else
// Destructor; asserted to run on the main thread. Ends the tokenizer and
// asserts that the flush timer was already dropped. In DEBUG builds the
// listed members are additionally cleared explicitly, in the order below.
204 nsHtml5StreamParser::~nsHtml5StreamParser()
206 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
207 mTokenizer->end();
208 NS_ASSERTION(!mFlushTimer, "Flush timer was not dropped before dtor!");
209 #ifdef DEBUG
210 mRequest = nullptr;
211 mObserver = nullptr;
212 mUnicodeDecoder = nullptr;
213 mSniffingBuffer = nullptr;
214 mMetaScanner = nullptr;
215 mFirstBuffer = nullptr;
216 mExecutor = nullptr;
217 mTreeBuilder = nullptr;
218 mTokenizer = nullptr;
219 mOwner = nullptr;
220 #endif
223 nsresult
224 nsHtml5StreamParser::GetChannel(nsIChannel** aChannel)
226 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
227 return mRequest ? CallQueryInterface(mRequest, aChannel) :
228 NS_ERROR_NOT_AVAILABLE;
// Charset detector callback (this object is passed to mChardet->Init() as the
// observer); asserted to run on the parser thread.
// Only eBestAnswer and eSureAnswer results are acted upon, and labels that
// EncodingUtils::FindEncodingForLabelNoReplacement() rejects are ignored.
// - Decoder already exists and the detected encoding equals mCharset: only
//   the charset source is raised to kCharsetFromAutoDetection.
// - Decoder already exists and the encoding differs: a charset switch is
//   requested from the tree builder, tree ops are flushed and parsing is
//   interrupted.
// - No decoder yet: the detected encoding is stored in mCharset.
// Always returns NS_OK.
231 NS_IMETHODIMP
232 nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf)
234 NS_ASSERTION(IsParserThread(), "Wrong thread!");
235 if (aConf == eBestAnswer || aConf == eSureAnswer) {
236 mFeedChardet = false; // just in case
237 nsAutoCString encoding;
238 if (!EncodingUtils::FindEncodingForLabelNoReplacement(
239 nsDependentCString(aCharset), encoding)) {
240 return NS_OK;
242 if (HasDecoder()) {
243 if (mCharset.Equals(encoding)) {
244 NS_ASSERTION(mCharsetSource < kCharsetFromAutoDetection,
245 "Why are we running chardet at all?");
246 mCharsetSource = kCharsetFromAutoDetection;
247 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
248 } else {
249 // We've already committed to a decoder. Request a reload from the
250 // docshell.
// NOTE(review): the last argument line of the call below appears to be
// missing from this copy of the file -- restore it from the upstream source.
251 mTreeBuilder->NeedsCharsetSwitchTo(encoding,
252 kCharsetFromAutoDetection,
254 FlushTreeOpsAndDisarmTimer();
255 Interrupt();
257 } else {
258 // Got a confident answer from the sniffing buffer. That code will
259 // take care of setting up the decoder.
260 mCharset.Assign(encoding);
261 mCharsetSource = kCharsetFromAutoDetection;
262 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
265 return NS_OK;
268 void
269 nsHtml5StreamParser::SetViewSourceTitle(nsIURI* aURL)
271 if (aURL) {
272 nsCOMPtr<nsIURI> temp;
273 bool isViewSource;
274 aURL->SchemeIs("view-source", &isViewSource);
275 if (isViewSource) {
276 nsCOMPtr<nsINestedURI> nested = do_QueryInterface(aURL);
277 nested->GetInnerURI(getter_AddRefs(temp));
278 } else {
279 temp = aURL;
281 bool isData;
282 temp->SchemeIs("data", &isData);
283 if (isData) {
284 // Avoid showing potentially huge data: URLs. The three last bytes are
285 // UTF-8 for an ellipsis.
286 mViewSourceTitle.AssignLiteral("data:\xE2\x80\xA6");
287 } else {
288 temp->GetSpec(mViewSourceTitle);
293 nsresult
294 nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment, // can be null
295 uint32_t aCount,
296 uint32_t* aWriteCount)
298 NS_ASSERTION(IsParserThread(), "Wrong thread!");
299 nsresult rv = NS_OK;
300 mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
301 if (mSniffingBuffer) {
302 uint32_t writeCount;
303 rv = WriteStreamBytes(mSniffingBuffer, mSniffingLength, &writeCount);
304 NS_ENSURE_SUCCESS(rv, rv);
305 mSniffingBuffer = nullptr;
307 mMetaScanner = nullptr;
308 if (aFromSegment) {
309 rv = WriteStreamBytes(aFromSegment, aCount, aWriteCount);
311 return rv;
314 nsresult
315 nsHtml5StreamParser::SetupDecodingFromBom(const char* aDecoderCharsetName)
317 NS_ASSERTION(IsParserThread(), "Wrong thread!");
318 mCharset.Assign(aDecoderCharsetName);
319 mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
320 mCharsetSource = kCharsetFromByteOrderMark;
321 mFeedChardet = false;
322 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
323 mSniffingBuffer = nullptr;
324 mMetaScanner = nullptr;
325 mBomState = BOM_SNIFFING_OVER;
326 return NS_OK;
// Heuristic for BOMless UTF-16 with Basic Latin content. Scans mSniffingBuffer
// (if present) followed by the first aCountToSniffingLimit bytes of
// aFromSegment and bails out as soon as a zero byte and a non-zero byte have
// been seen at the same byte parity. If the scan completes, mCharset becomes
// UTF-16LE when non-zero bytes were seen at even offsets and UTF-16BE
// otherwise, the source becomes kCharsetFromIrreversibleAutoDetection, the
// tree builder is informed and chardet feeding is switched off.
// Does nothing in LOAD_AS_DATA mode or when fewer than 30 bytes are available.
329 void
330 nsHtml5StreamParser::SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
331 uint32_t aCountToSniffingLimit)
333 // Avoid underspecified heuristic craziness for XHR
334 if (mMode == LOAD_AS_DATA) {
335 return;
337 // Make sure there's enough data. Require room for "<title></title>"
338 if (mSniffingLength + aCountToSniffingLimit < 30) {
339 return;
341 // even-numbered bytes tracked at 0, odd-numbered bytes tracked at 1
342 bool byteZero[2] = { false, false };
343 bool byteNonZero[2] = { false, false };
344 uint32_t i = 0;
345 if (mSniffingBuffer) {
346 for (; i < mSniffingLength; ++i) {
347 if (mSniffingBuffer[i]) {
348 if (byteNonZero[1 - (i % 2)]) {
349 return;
351 byteNonZero[i % 2] = true;
352 } else {
353 if (byteZero[1 - (i % 2)]) {
354 return;
356 byteZero[i % 2] = true;
// The parity of the bytes in aFromSegment continues from where the sniffing
// buffer ended, hence the (i + j) offsets below.
360 if (aFromSegment) {
361 for (uint32_t j = 0; j < aCountToSniffingLimit; ++j) {
362 if (aFromSegment[j]) {
363 if (byteNonZero[1 - ((i + j) % 2)]) {
364 return;
366 byteNonZero[(i + j) % 2] = true;
367 } else {
368 if (byteZero[1 - ((i + j) % 2)]) {
369 return;
371 byteZero[(i + j) % 2] = true;
376 if (byteNonZero[0]) {
377 mCharset.AssignLiteral("UTF-16LE");
378 } else {
379 mCharset.AssignLiteral("UTF-16BE");
381 mCharsetSource = kCharsetFromIrreversibleAutoDetection;
382 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
383 mFeedChardet = false;
// NOTE(review): the last argument line of the call below appears to be
// missing from this copy of the file -- restore it from the upstream source.
384 mTreeBuilder->MaybeComplainAboutCharset("EncBomlessUtf16",
385 true,
390 void
391 nsHtml5StreamParser::SetEncodingFromExpat(const char16_t* aEncoding)
393 if (aEncoding) {
394 nsDependentString utf16(aEncoding);
395 nsAutoCString utf8;
396 CopyUTF16toUTF8(utf16, utf8);
397 if (PreferredForInternalEncodingDecl(utf8)) {
398 mCharset.Assign(utf8);
399 mCharsetSource = kCharsetFromMetaTag; // closest for XML
400 return;
402 // else the page declared an encoding Gecko doesn't support and we'd
403 // end up defaulting to UTF-8 anyway. Might as well fall through here
404 // right away and let the encoding be set to UTF-8 which we'd default to
405 // anyway.
407 mCharset.AssignLiteral("UTF-8"); // XML defaults to UTF-8 without a BOM
408 mCharsetSource = kCharsetFromMetaTag; // means confident
411 // A separate user data struct is used instead of passing the
412 // nsHtml5StreamParser instance as user data in order to avoid including
413 // expat.h in nsHtml5StreamParser.h. Doing that would cause naming conflicts.
414 // Using a separate user data struct also avoids bloating nsHtml5StreamParser
415 // by one pointer.
416 struct UserData {
// The expat parser the callbacks stop via XML_StopParser().
417 XML_Parser mExpat;
// The stream parser that receives the encoding from the XML declaration.
418 nsHtml5StreamParser* mStreamParser;
421 // Using no-namespace handler callbacks to avoid including expat.h in
422 // nsHtml5StreamParser.h, since doing so would cause naming conclicts.
423 static void
424 HandleXMLDeclaration(void* aUserData,
425 const XML_Char* aVersion,
426 const XML_Char* aEncoding,
427 int aStandalone)
429 UserData* ud = static_cast<UserData*>(aUserData);
430 ud->mStreamParser->SetEncodingFromExpat(
431 reinterpret_cast<const char16_t*>(aEncoding));
432 XML_StopParser(ud->mExpat, false);
435 static void
436 HandleStartElement(void* aUserData,
437 const XML_Char* aName,
438 const XML_Char **aAtts)
440 UserData* ud = static_cast<UserData*>(aUserData);
441 XML_StopParser(ud->mExpat, false);
444 static void
445 HandleEndElement(void* aUserData,
446 const XML_Char* aName)
448 UserData* ud = static_cast<UserData*>(aUserData);
449 XML_StopParser(ud->mExpat, false);
452 static void
453 HandleComment(void* aUserData,
454 const XML_Char* aName)
456 UserData* ud = static_cast<UserData*>(aUserData);
457 XML_StopParser(ud->mExpat, false);
460 static void
461 HandleProcessingInstruction(void* aUserData,
462 const XML_Char* aTarget,
463 const XML_Char* aData)
465 UserData* ud = static_cast<UserData*>(aUserData);
466 XML_StopParser(ud->mExpat, false);
// Concludes encoding sniffing and hands over to
// SetupDecodingAndWriteSniffingBufferAndCurrentSegment(); asserted to run on
// the parser thread and not to be used with a forced charset.
// - VIEW_SOURCE_XML mode: expat parses the buffered bytes (as ISO-8859-1) so
//   that the XML declaration handler can pick up the encoding; UTF-8 is used
//   if that yields nothing.
// - Otherwise: a charset whose source is at least kCharsetFromHintPrevDoc is
//   kept as is. Failing that, BOMless UTF-16 sniffing and then chardet are
//   tried, with windows-1252 as the fallback when the source is still
//   uninitialized.
// aFromSegment: bytes of the current network event; can be null.
// aCount: number of bytes in aFromSegment.
// aWriteCount: passed through to the write step.
// aCountToSniffingLimit: number of bytes of aFromSegment that still belong to
// the sniffed prefix of the stream.
469 nsresult
470 nsHtml5StreamParser::FinalizeSniffing(const uint8_t* aFromSegment, // can be null
471 uint32_t aCount,
472 uint32_t* aWriteCount,
473 uint32_t aCountToSniffingLimit)
475 NS_ASSERTION(IsParserThread(), "Wrong thread!");
476 NS_ASSERTION(mCharsetSource < kCharsetFromParentForced,
477 "Should not finalize sniffing when using forced charset.");
478 if (mMode == VIEW_SOURCE_XML) {
// Memory handling functions given to expat for this parser instance.
479 static const XML_Memory_Handling_Suite memsuite =
481 (void *(*)(size_t))moz_xmalloc,
482 (void *(*)(void *, size_t))moz_xrealloc,
483 moz_free
486 static const char16_t kExpatSeparator[] = { 0xFFFF, '\0' };
488 static const char16_t kISO88591[] =
489 { 'I', 'S', 'O', '-', '8', '8', '5', '9', '-', '1', '\0' };
491 UserData ud;
492 ud.mStreamParser = this;
494 // If we got this far, the stream didn't have a BOM. UTF-16-encoded XML
495 // documents MUST begin with a BOM. We don't support EBCDIC and such.
496 // Thus, at this point, what we have is garbage or something encoded using
497 // a rough ASCII superset. ISO-8859-1 allows us to decode ASCII bytes
498 // without throwing errors when bytes have the most significant bit set
499 // and without triggering expat's unknown encoding code paths. This is
500 // enough to be able to use expat to parse the XML declaration in order
501 // to extract the encoding name from it.
502 ud.mExpat = XML_ParserCreate_MM(kISO88591, &memsuite, kExpatSeparator);
// Every handler stops the parser, so expat does not go past the first
// construct it reports.
503 XML_SetXmlDeclHandler(ud.mExpat, HandleXMLDeclaration);
504 XML_SetElementHandler(ud.mExpat, HandleStartElement, HandleEndElement);
505 XML_SetCommentHandler(ud.mExpat, HandleComment);
506 XML_SetProcessingInstructionHandler(ud.mExpat, HandleProcessingInstruction);
507 XML_SetUserData(ud.mExpat, static_cast<void*>(&ud));
509 XML_Status status = XML_STATUS_OK;
511 // aFromSegment points to the data obtained from the current network
512 // event. mSniffingBuffer (if it exists) contains the data obtained before
513 // the current event. Thus, mSniffingLength bytes of mSniffingBuffer
514 // followed by aCountToSniffingLimit bytes from aFromSegment are the
515 // first 1024 bytes of the file (or the file as a whole if the file is
516 // 1024 bytes long or shorter). Thus, we parse both buffers, but if the
517 // first call succeeds already, we skip parsing the second buffer.
518 if (mSniffingBuffer) {
519 status = XML_Parse(ud.mExpat,
520 reinterpret_cast<const char*>(mSniffingBuffer.get()),
521 mSniffingLength,
522 false);
524 if (status == XML_STATUS_OK &&
525 mCharsetSource < kCharsetFromMetaTag &&
526 aFromSegment) {
527 status = XML_Parse(ud.mExpat,
528 reinterpret_cast<const char*>(aFromSegment),
529 aCountToSniffingLimit,
530 false);
532 XML_ParserFree(ud.mExpat);
534 if (mCharsetSource < kCharsetFromMetaTag) {
535 // Failed to get an encoding from the XML declaration. XML defaults
536 // confidently to UTF-8 in this case.
537 // It is also possible that the document has an XML declaration that is
538 // longer than 1024 bytes, but that case is not worth worrying about.
539 mCharset.AssignLiteral("UTF-8");
540 mCharsetSource = kCharsetFromMetaTag; // means confident
543 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
544 aCount,
545 aWriteCount);
548 // meta scan failed.
549 if (mCharsetSource >= kCharsetFromHintPrevDoc) {
550 mFeedChardet = false;
551 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
553 // Check for BOMless UTF-16 with Basic
554 // Latin content for compat with IE. See bug 631751.
555 SniffBOMlessUTF16BasicLatin(aFromSegment, aCountToSniffingLimit);
556 // the charset may have been set now
557 // maybe try chardet now;
558 if (mFeedChardet) {
559 bool dontFeed;
560 nsresult rv;
// Feed the previously buffered bytes first, then the current segment, and
// stop feeding as soon as the detector asks for no more data.
561 if (mSniffingBuffer) {
562 rv = mChardet->DoIt((const char*)mSniffingBuffer.get(), mSniffingLength, &dontFeed);
563 mFeedChardet = !dontFeed;
564 NS_ENSURE_SUCCESS(rv, rv);
566 if (mFeedChardet && aFromSegment) {
567 rv = mChardet->DoIt((const char*)aFromSegment,
568 // Avoid buffer boundary-dependent behavior when
569 // reparsing is forbidden. If reparse is forbidden,
570 // act as if we only saw the first 1024 bytes.
571 // When reparsing isn't forbidden, buffer boundaries
572 // can have an effect on whether the page is loaded
573 // once or twice. :-(
574 mReparseForbidden ? aCountToSniffingLimit : aCount,
575 &dontFeed);
576 mFeedChardet = !dontFeed;
577 NS_ENSURE_SUCCESS(rv, rv);
579 if (mFeedChardet && (!aFromSegment || mReparseForbidden)) {
580 // mReparseForbidden is checked so that we get to use the sniffing
581 // buffer with the best guess so far if we aren't allowed to guess
582 // better later.
583 mFeedChardet = false;
584 rv = mChardet->Done();
585 NS_ENSURE_SUCCESS(rv, rv);
587 // fall thru; callback may have changed charset
589 if (mCharsetSource == kCharsetUninitialized) {
590 // Hopefully this case is never needed, but dealing with it anyway
591 mCharset.AssignLiteral("windows-1252");
592 mCharsetSource = kCharsetFromFallback;
593 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
594 } else if (mMode == LOAD_AS_DATA &&
595 mCharsetSource == kCharsetFromFallback) {
596 NS_ASSERTION(mReparseForbidden, "Reparse should be forbidden for XHR");
597 NS_ASSERTION(!mFeedChardet, "Should not feed chardet for XHR");
598 NS_ASSERTION(mCharset.EqualsLiteral("UTF-8"),
599 "XHR should default to UTF-8");
600 // Now mark charset source as non-weak to signal that we have a decision
601 mCharsetSource = kCharsetFromDocTypeDefault;
602 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
604 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
// Examines incoming bytes before a decoder exists; asserted to run on the
// parser thread. The steps, in order:
// 1. Advance the BOM state machine over aFromSegment. A complete UTF-16LE,
//    UTF-16BE or UTF-8 BOM sets up decoding from the BOM and writes the bytes
//    that follow it.
// 2. With BOM sniffing over and a charset that came from the channel, start
//    decoding with that charset.
// 3. In NORMAL, VIEW_SOURCE_HTML and LOAD_AS_DATA modes, run the meta scanner
//    over the bytes; a found encoding is used unless a forced charset applies
//    and the found encoding is ASCII-compatible.
// 4. Once NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE bytes have been seen,
//    finish via FinalizeSniffing() (or honor a forced charset); until then
//    append the bytes to mSniffingBuffer.
// aFromSegment: bytes of the current segment.
// aCount: number of bytes in aFromSegment.
// aWriteCount: receives the number of bytes accounted for.
// Returns NS_ERROR_OUT_OF_MEMORY if the sniffing buffer cannot be allocated.
607 nsresult
608 nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
609 uint32_t aCount,
610 uint32_t* aWriteCount)
612 NS_ASSERTION(IsParserThread(), "Wrong thread!");
613 nsresult rv = NS_OK;
614 uint32_t writeCount;
616 // mCharset and mCharsetSource potentially have come from channel or higher
617 // by now. If we find a BOM, SetupDecodingFromBom() will overwrite them.
618 // If we don't find a BOM, the previously set values of mCharset and
619 // mCharsetSource are not modified by the BOM sniffing here.
620 for (uint32_t i = 0; i < aCount && mBomState != BOM_SNIFFING_OVER; i++) {
621 switch (mBomState) {
622 case BOM_SNIFFING_NOT_STARTED:
623 NS_ASSERTION(i == 0, "Bad BOM sniffing state.");
624 switch (*aFromSegment) {
625 case 0xEF:
626 mBomState = SEEN_UTF_8_FIRST_BYTE;
627 break;
628 case 0xFF:
629 mBomState = SEEN_UTF_16_LE_FIRST_BYTE;
630 break;
631 case 0xFE:
632 mBomState = SEEN_UTF_16_BE_FIRST_BYTE;
633 break;
634 default:
635 mBomState = BOM_SNIFFING_OVER;
636 break;
638 break;
639 case SEEN_UTF_16_LE_FIRST_BYTE:
640 if (aFromSegment[i] == 0xFE) {
641 rv = SetupDecodingFromBom("UTF-16LE"); // upper case is the raw form
642 NS_ENSURE_SUCCESS(rv, rv);
// Only the bytes after the BOM are decoded; the BOM bytes in this segment
// are still counted as written.
643 uint32_t count = aCount - (i + 1);
644 rv = WriteStreamBytes(aFromSegment + (i + 1), count, &writeCount);
645 NS_ENSURE_SUCCESS(rv, rv);
646 *aWriteCount = writeCount + (i + 1);
647 return rv;
649 mBomState = BOM_SNIFFING_OVER;
650 break;
651 case SEEN_UTF_16_BE_FIRST_BYTE:
652 if (aFromSegment[i] == 0xFF) {
653 rv = SetupDecodingFromBom("UTF-16BE"); // upper case is the raw form
654 NS_ENSURE_SUCCESS(rv, rv);
655 uint32_t count = aCount - (i + 1);
656 rv = WriteStreamBytes(aFromSegment + (i + 1), count, &writeCount);
657 NS_ENSURE_SUCCESS(rv, rv);
658 *aWriteCount = writeCount + (i + 1);
659 return rv;
661 mBomState = BOM_SNIFFING_OVER;
662 break;
663 case SEEN_UTF_8_FIRST_BYTE:
664 if (aFromSegment[i] == 0xBB) {
665 mBomState = SEEN_UTF_8_SECOND_BYTE;
666 } else {
667 mBomState = BOM_SNIFFING_OVER;
669 break;
670 case SEEN_UTF_8_SECOND_BYTE:
671 if (aFromSegment[i] == 0xBF) {
672 rv = SetupDecodingFromBom("UTF-8"); // upper case is the raw form
673 NS_ENSURE_SUCCESS(rv, rv);
674 uint32_t count = aCount - (i + 1);
675 rv = WriteStreamBytes(aFromSegment + (i + 1), count, &writeCount);
676 NS_ENSURE_SUCCESS(rv, rv);
677 *aWriteCount = writeCount + (i + 1);
678 return rv;
680 mBomState = BOM_SNIFFING_OVER;
681 break;
682 default:
683 mBomState = BOM_SNIFFING_OVER;
684 break;
687 // if we get here, there either was no BOM or the BOM sniffing isn't complete
688 // yet
690 MOZ_ASSERT(mCharsetSource != kCharsetFromByteOrderMark,
691 "Should not come here if BOM was found.");
692 MOZ_ASSERT(mCharsetSource != kCharsetFromOtherComponent,
693 "kCharsetFromOtherComponent is for XSLT.");
695 if (mBomState == BOM_SNIFFING_OVER &&
696 mCharsetSource == kCharsetFromChannel) {
697 // There was no BOM and the charset came from channel. mCharset
698 // still contains the charset from the channel as set by an
699 // earlier call to SetDocumentCharset(), since we didn't find a BOM and
700 // overwrite mCharset. (Note that if the user has overridden the charset,
701 // we don't come here but check <meta> for XSS-dangerous charsets first.)
702 mFeedChardet = false;
703 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
704 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
705 aCount, aWriteCount);
// The meta scanner is created lazily, and only for the modes that use it in
// the branches below.
708 if (!mMetaScanner && (mMode == NORMAL ||
709 mMode == VIEW_SOURCE_HTML ||
710 mMode == LOAD_AS_DATA)) {
711 mMetaScanner = new nsHtml5MetaScanner();
714 if (mSniffingLength + aCount >= NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE) {
715 // this is the last buffer
716 uint32_t countToSniffingLimit =
717 NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
718 if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
719 nsHtml5ByteReadable readable(aFromSegment, aFromSegment +
720 countToSniffingLimit);
721 nsAutoCString encoding;
722 mMetaScanner->sniff(&readable, encoding);
723 if (!encoding.IsEmpty()) {
724 // meta scan successful; honor overrides unless meta is XSS-dangerous
725 if ((mCharsetSource == kCharsetFromParentForced ||
726 mCharsetSource == kCharsetFromUserForced) &&
727 EncodingUtils::IsAsciiCompatible(encoding)) {
728 // Honor override
729 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
730 aFromSegment, aCount, aWriteCount);
732 mCharset.Assign(encoding);
733 mCharsetSource = kCharsetFromMetaPrescan;
734 mFeedChardet = false;
735 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
736 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
737 aFromSegment, aCount, aWriteCount);
740 if (mCharsetSource == kCharsetFromParentForced ||
741 mCharsetSource == kCharsetFromUserForced) {
742 // meta not found, honor override
743 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
744 aFromSegment, aCount, aWriteCount);
746 return FinalizeSniffing(aFromSegment, aCount, aWriteCount,
747 countToSniffingLimit);
750 // not the last buffer
751 if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
752 nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
753 nsAutoCString encoding;
754 mMetaScanner->sniff(&readable, encoding);
755 if (!encoding.IsEmpty()) {
756 // meta scan successful; honor overrides unless meta is XSS-dangerous
757 if ((mCharsetSource == kCharsetFromParentForced ||
758 mCharsetSource == kCharsetFromUserForced) &&
759 EncodingUtils::IsAsciiCompatible(encoding)) {
760 // Honor override
761 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
762 aCount, aWriteCount);
764 mCharset.Assign(encoding);
765 mCharsetSource = kCharsetFromMetaPrescan;
766 mFeedChardet = false;
767 mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
768 return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
769 aCount, aWriteCount);
// No decision yet: stash the bytes in the (fallibly allocated) sniffing
// buffer and report all of them as consumed.
773 if (!mSniffingBuffer) {
774 const mozilla::fallible_t fallible = mozilla::fallible_t();
775 mSniffingBuffer = new (fallible)
776 uint8_t[NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE];
777 if (!mSniffingBuffer) {
778 return NS_ERROR_OUT_OF_MEMORY;
781 memcpy(mSniffingBuffer + mSniffingLength, aFromSegment, aCount);
782 mSniffingLength += aCount;
783 *aWriteCount = aCount;
784 return NS_OK;
// Decodes aCount bytes from aFromSegment through mUnicodeDecoder and appends
// the UTF-16 output to the buffer chain ending at mLastBuffer; asserted to
// run on the parser thread. A new buffer of
// NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE is chained on whenever the last one
// is full or the decoder reports NS_PARTIAL_MORE_OUTPUT.
// aWriteCount: receives the total number of input bytes consumed.
// Returns NS_ERROR_NULL_POINTER (after marking the parser as broken) when
// mLastBuffer is null, NS_ERROR_OUT_OF_MEMORY when a buffer cannot be
// allocated, and NS_OK otherwise.
787 nsresult
788 nsHtml5StreamParser::WriteStreamBytes(const uint8_t* aFromSegment,
789 uint32_t aCount,
790 uint32_t* aWriteCount)
792 NS_ASSERTION(IsParserThread(), "Wrong thread!");
793 // mLastBuffer should always point to a buffer of the size
794 // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE.
795 if (!mLastBuffer) {
796 NS_WARNING("mLastBuffer should not be null!");
797 MarkAsBroken(NS_ERROR_NULL_POINTER);
798 return NS_ERROR_NULL_POINTER;
// Start with a fresh buffer if the current one has no free space left.
800 if (mLastBuffer->getEnd() == NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE) {
801 nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
802 nsHtml5OwningUTF16Buffer::FalliblyCreate(
803 NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
804 if (!newBuf) {
805 return NS_ERROR_OUT_OF_MEMORY;
807 mLastBuffer = (mLastBuffer->next = newBuf.forget());
809 int32_t totalByteCount = 0;
810 for (;;) {
// byteCount and utf16Count are in/out: on entry the bytes still to convert
// and the free space in the buffer, after Convert() the amounts actually
// consumed and written (as used by the bookkeeping below).
811 int32_t end = mLastBuffer->getEnd();
812 int32_t byteCount = aCount - totalByteCount;
813 int32_t utf16Count = NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE - end;
815 NS_ASSERTION(utf16Count, "Trying to convert into a buffer with no free space!");
816 // byteCount may be zero to force the decoder to output a pending surrogate
817 // pair.
819 nsresult convResult = mUnicodeDecoder->Convert((const char*)aFromSegment, &byteCount, mLastBuffer->getBuffer() + end, &utf16Count);
820 MOZ_ASSERT(NS_SUCCEEDED(convResult));
822 end += utf16Count;
823 mLastBuffer->setEnd(end);
824 totalByteCount += byteCount;
825 aFromSegment += byteCount;
827 NS_ASSERTION(end <= NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE,
828 "The Unicode decoder wrote too much data.");
829 NS_ASSERTION(byteCount >= -1, "The decoder consumed fewer than -1 bytes.");
831 if (convResult == NS_PARTIAL_MORE_OUTPUT) {
832 nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
833 nsHtml5OwningUTF16Buffer::FalliblyCreate(
834 NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
835 if (!newBuf) {
836 return NS_ERROR_OUT_OF_MEMORY;
838 mLastBuffer = (mLastBuffer->next = newBuf.forget());
839 // All input may have been consumed if there is a pending surrogate pair
840 // that doesn't fit in the output buffer. Loop back to push a zero-length
841 // input to the decoder in that case.
842 } else {
843 NS_ASSERTION(totalByteCount == (int32_t)aCount,
844 "The Unicode decoder consumed the wrong number of bytes.");
845 *aWriteCount = (uint32_t)totalByteCount;
846 return NS_OK;
// Start-of-stream notification; asserted to run on the main thread and only
// once per stream. The steps, in order:
// - forwards the notification to mObserver (if any) and remembers the request;
// - starts the tokenizer, tree builder and executor in the configuration that
//   matches mMode (view source, plain text, scripting enabled or not);
// - allocates the first UTF-16 buffer, marking the executor as broken if that
//   fails;
// - decides whether reparsing is forbidden (modes other than NORMAL and
//   PLAIN_TEXT, non-GET HTTP requests, wyciwyg channels);
// - tries to retarget data delivery to the parser thread;
// - for wyciwyg channels, instantiates the decoder right away.
// Returns a failure from WillBuildModel() or MarkAsBroken(), otherwise NS_OK.
851 nsresult
852 nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest, nsISupports* aContext)
854 NS_PRECONDITION(STREAM_NOT_STARTED == mStreamState,
855 "Got OnStartRequest when the stream had already started.");
856 NS_PRECONDITION(!mExecutor->HasStarted(),
857 "Got OnStartRequest at the wrong stage in the executor life cycle.");
858 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
859 if (mObserver) {
860 mObserver->OnStartRequest(aRequest, aContext);
862 mRequest = aRequest;
864 mStreamState = STREAM_BEING_READ;
866 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
867 mTokenizer->StartViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle));
870 // For View Source, the parser should run with scripts "enabled" if a normal
871 // load would have scripts enabled.
872 bool scriptingEnabled = mMode == LOAD_AS_DATA ?
873 false : mExecutor->IsScriptEnabled();
874 mOwner->StartTokenizer(scriptingEnabled);
876 bool isSrcdoc = false;
877 nsCOMPtr<nsIChannel> channel;
878 nsresult rv = GetChannel(getter_AddRefs(channel));
879 if (NS_SUCCEEDED(rv)) {
880 isSrcdoc = NS_IsSrcdocChannel(channel);
882 mTreeBuilder->setIsSrcdocDocument(isSrcdoc);
883 mTreeBuilder->setScriptingEnabled(scriptingEnabled);
// Script execution is prevented in every mode except NORMAL with scripting
// enabled.
884 mTreeBuilder->SetPreventScriptExecution(!((mMode == NORMAL) &&
885 scriptingEnabled));
886 mTokenizer->start();
887 mExecutor->Start();
888 mExecutor->StartReadingFromStage();
890 if (mMode == PLAIN_TEXT) {
891 mTreeBuilder->StartPlainText();
892 mTokenizer->StartPlainText();
893 } else if (mMode == VIEW_SOURCE_PLAIN) {
894 mTreeBuilder->StartPlainTextViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle));
895 mTokenizer->StartPlainText();
899 * If you move the following line, be very careful not to cause
900 * WillBuildModel to be called before the document has had its
901 * script global object set.
903 rv = mExecutor->WillBuildModel(eDTDMode_unknown);
904 NS_ENSURE_SUCCESS(rv, rv);
906 nsRefPtr<nsHtml5OwningUTF16Buffer> newBuf =
907 nsHtml5OwningUTF16Buffer::FalliblyCreate(
908 NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
909 if (!newBuf) {
910 // marks this stream parser as terminated,
911 // which prevents entry to code paths that
912 // would use mFirstBuffer or mLastBuffer.
913 return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
915 NS_ASSERTION(!mFirstBuffer, "How come we have the first buffer set?");
916 NS_ASSERTION(!mLastBuffer, "How come we have the last buffer set?");
917 mFirstBuffer = mLastBuffer = newBuf;
919 rv = NS_OK;
921 // The line below means that the encoding can end up being wrong if
922 // a view-source URL is loaded without having the encoding hint from a
923 // previous normal load in the history.
924 mReparseForbidden = !(mMode == NORMAL || mMode == PLAIN_TEXT);
926 nsCOMPtr<nsIHttpChannel> httpChannel(do_QueryInterface(mRequest, &rv));
927 if (NS_SUCCEEDED(rv)) {
928 nsAutoCString method;
929 httpChannel->GetRequestMethod(method);
930 // XXX does Necko have a way to renavigate POST, etc. without hitting
931 // the network?
932 if (!method.EqualsLiteral("GET")) {
933 // This is the old Gecko behavior but the HTML5 spec disagrees.
934 // Don't reparse on POST.
935 mReparseForbidden = true;
936 mFeedChardet = false; // can't restart anyway
940 // Attempt to retarget delivery of data (via OnDataAvailable) to the parser
941 // thread, rather than through the main thread.
942 nsCOMPtr<nsIThreadRetargetableRequest> threadRetargetableRequest =
943 do_QueryInterface(mRequest, &rv);
944 if (threadRetargetableRequest) {
945 rv = threadRetargetableRequest->RetargetDeliveryTo(mThread);
// A retargeting failure is not fatal: it is only reported as a warning and
// start-up continues.
948 if (NS_FAILED(rv)) {
949 // for now skip warning if we're on child process, since we don't support
950 // off-main thread delivery there yet. This will change with bug 1015466
951 if (XRE_GetProcessType() != GeckoProcessType_Content) {
952 NS_WARNING("Failed to retarget HTML data delivery to the parser thread.");
956 if (mCharsetSource == kCharsetFromParentFrame) {
957 // Remember this in case chardet overwrites mCharsetSource
958 mInitialEncodingWasFromParentFrame = true;
961 if (mCharsetSource >= kCharsetFromAutoDetection) {
962 mFeedChardet = false;
965 nsCOMPtr<nsIWyciwygChannel> wyciwygChannel(do_QueryInterface(mRequest));
966 if (!wyciwygChannel) {
967 // we aren't ready to commit to an encoding yet
968 // leave converter uninstantiated for now
969 return NS_OK;
972 // We are reloading a document.open()ed doc.
973 mReparseForbidden = true;
974 mFeedChardet = false;
976 // Instantiate the converter here to avoid BOM sniffing.
977 mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset);
978 return NS_OK;
981 nsresult
982 nsHtml5StreamParser::CheckListenerChain()
984 NS_ASSERTION(NS_IsMainThread(), "Should be on the main thread!");
985 if (!mObserver) {
986 return NS_OK;
988 nsresult rv;
989 nsCOMPtr<nsIThreadRetargetableStreamListener> retargetable =
990 do_QueryInterface(mObserver, &rv);
991 if (NS_SUCCEEDED(rv) && retargetable) {
992 rv = retargetable->CheckListenerChain();
994 return rv;
997 void
998 nsHtml5StreamParser::DoStopRequest()
1000 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1001 NS_PRECONDITION(STREAM_BEING_READ == mStreamState,
1002 "Stream ended without being open.");
1003 mTokenizerMutex.AssertCurrentThreadOwns();
1005 if (IsTerminated()) {
1006 return;
1009 mStreamState = STREAM_ENDED;
1011 if (!mUnicodeDecoder) {
1012 uint32_t writeCount;
1013 nsresult rv;
1014 if (NS_FAILED(rv = FinalizeSniffing(nullptr, 0, &writeCount, 0))) {
1015 MarkAsBroken(rv);
1016 return;
1018 } else if (mFeedChardet) {
1019 mChardet->Done();
1022 if (IsTerminatedOrInterrupted()) {
1023 return;
1026 ParseAvailableData();
1029 class nsHtml5RequestStopper : public nsRunnable
1031 private:
1032 nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1033 public:
1034 explicit nsHtml5RequestStopper(nsHtml5StreamParser* aStreamParser)
1035 : mStreamParser(aStreamParser)
1037 NS_IMETHODIMP Run()
1039 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1040 mStreamParser->DoStopRequest();
1041 return NS_OK;
1045 nsresult
1046 nsHtml5StreamParser::OnStopRequest(nsIRequest* aRequest,
1047 nsISupports* aContext,
1048 nsresult status)
1050 NS_ASSERTION(mRequest == aRequest, "Got Stop on wrong stream.");
1051 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1052 if (mObserver) {
1053 mObserver->OnStopRequest(aRequest, aContext, status);
1055 nsCOMPtr<nsIRunnable> stopper = new nsHtml5RequestStopper(this);
1056 if (NS_FAILED(mThread->Dispatch(stopper, nsIThread::DISPATCH_NORMAL))) {
1057 NS_WARNING("Dispatching StopRequest event failed.");
1059 return NS_OK;
1062 void
1063 nsHtml5StreamParser::DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength)
1065 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1066 NS_PRECONDITION(STREAM_BEING_READ == mStreamState,
1067 "DoDataAvailable called when stream not open.");
1068 mTokenizerMutex.AssertCurrentThreadOwns();
1070 if (IsTerminated()) {
1071 return;
1074 uint32_t writeCount;
1075 nsresult rv;
1076 if (HasDecoder()) {
1077 if (mFeedChardet) {
1078 bool dontFeed;
1079 mChardet->DoIt((const char*)aBuffer, aLength, &dontFeed);
1080 mFeedChardet = !dontFeed;
1082 rv = WriteStreamBytes(aBuffer, aLength, &writeCount);
1083 } else {
1084 rv = SniffStreamBytes(aBuffer, aLength, &writeCount);
1086 if (NS_FAILED(rv)) {
1087 MarkAsBroken(rv);
1088 return;
1090 NS_ASSERTION(writeCount == aLength, "Wrong number of stream bytes written/sniffed.");
1092 if (IsTerminatedOrInterrupted()) {
1093 return;
1096 ParseAvailableData();
1098 if (mFlushTimerArmed || mSpeculating) {
1099 return;
1102 mFlushTimer->InitWithFuncCallback(nsHtml5StreamParser::TimerCallback,
1103 static_cast<void*> (this),
1104 mFlushTimerEverFired ?
1105 sTimerInitialDelay :
1106 sTimerSubsequentDelay,
1107 nsITimer::TYPE_ONE_SHOT);
1108 mFlushTimerArmed = true;
1111 class nsHtml5DataAvailable : public nsRunnable
1113 private:
1114 nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1115 nsAutoArrayPtr<uint8_t> mData;
1116 uint32_t mLength;
1117 public:
1118 nsHtml5DataAvailable(nsHtml5StreamParser* aStreamParser,
1119 uint8_t* aData,
1120 uint32_t aLength)
1121 : mStreamParser(aStreamParser)
1122 , mData(aData)
1123 , mLength(aLength)
1125 NS_IMETHODIMP Run()
1127 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1128 mStreamParser->DoDataAvailable(mData, mLength);
1129 return NS_OK;
1133 nsresult
1134 nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest,
1135 nsISupports* aContext,
1136 nsIInputStream* aInStream,
1137 uint64_t aSourceOffset,
1138 uint32_t aLength)
1140 nsresult rv;
1141 if (NS_FAILED(rv = mExecutor->IsBroken())) {
1142 return rv;
1145 NS_ASSERTION(mRequest == aRequest, "Got data on wrong stream.");
1146 uint32_t totalRead;
1147 // Main thread to parser thread dispatch requires copying to buffer first.
1148 if (NS_IsMainThread()) {
1149 const mozilla::fallible_t fallible = mozilla::fallible_t();
1150 nsAutoArrayPtr<uint8_t> data(new (fallible) uint8_t[aLength]);
1151 if (!data) {
1152 return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
1154 rv = aInStream->Read(reinterpret_cast<char*>(data.get()),
1155 aLength, &totalRead);
1156 NS_ENSURE_SUCCESS(rv, rv);
1157 NS_ASSERTION(totalRead <= aLength, "Read more bytes than were available?");
1159 nsCOMPtr<nsIRunnable> dataAvailable = new nsHtml5DataAvailable(this,
1160 data.forget(),
1161 totalRead);
1162 if (NS_FAILED(mThread->Dispatch(dataAvailable, nsIThread::DISPATCH_NORMAL))) {
1163 NS_WARNING("Dispatching DataAvailable event failed.");
1165 return rv;
1166 } else {
1167 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1168 mozilla::MutexAutoLock autoLock(mTokenizerMutex);
1170 // Read directly from response buffer.
1171 rv = aInStream->ReadSegments(CopySegmentsToParser, this, aLength,
1172 &totalRead);
1173 if (NS_FAILED(rv)) {
1174 NS_WARNING("Failed reading response data to parser");
1175 return rv;
1177 return NS_OK;
1181 /* static */
1182 NS_METHOD
1183 nsHtml5StreamParser::CopySegmentsToParser(nsIInputStream *aInStream,
1184 void *aClosure,
1185 const char *aFromSegment,
1186 uint32_t aToOffset,
1187 uint32_t aCount,
1188 uint32_t *aWriteCount)
1190 nsHtml5StreamParser* parser = static_cast<nsHtml5StreamParser*>(aClosure);
1192 parser->DoDataAvailable((const uint8_t*)aFromSegment, aCount);
1193 // Assume DoDataAvailable consumed all available bytes.
1194 *aWriteCount = aCount;
1195 return NS_OK;
1198 bool
1199 nsHtml5StreamParser::PreferredForInternalEncodingDecl(nsACString& aEncoding)
1201 nsAutoCString newEncoding;
1202 if (!EncodingUtils::FindEncodingForLabel(aEncoding, newEncoding)) {
1203 // the encoding name is bogus
1204 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUnsupported",
1205 true,
1206 mTokenizer->getLineNumber());
1207 return false;
1210 if (newEncoding.EqualsLiteral("UTF-16BE") ||
1211 newEncoding.EqualsLiteral("UTF-16LE")) {
1212 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUtf16",
1213 true,
1214 mTokenizer->getLineNumber());
1215 newEncoding.AssignLiteral("UTF-8");
1218 if (newEncoding.EqualsLiteral("x-user-defined")) {
1219 // WebKit/Blink hack for Indian and Armenian legacy sites
1220 mTreeBuilder->MaybeComplainAboutCharset("EncMetaUserDefined",
1221 true,
1222 mTokenizer->getLineNumber());
1223 newEncoding.AssignLiteral("windows-1252");
1226 if (newEncoding.Equals(mCharset)) {
1227 if (mCharsetSource < kCharsetFromMetaPrescan) {
1228 if (mInitialEncodingWasFromParentFrame) {
1229 mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaFrame",
1230 false,
1231 mTokenizer->getLineNumber());
1232 } else {
1233 mTreeBuilder->MaybeComplainAboutCharset("EncLateMeta",
1234 false,
1235 mTokenizer->getLineNumber());
1238 mCharsetSource = kCharsetFromMetaTag; // become confident
1239 mFeedChardet = false; // don't feed chardet when confident
1240 return false;
1243 aEncoding.Assign(newEncoding);
1244 return true;
1247 bool
1248 nsHtml5StreamParser::internalEncodingDeclaration(nsString* aEncoding)
1250 // This code needs to stay in sync with
1251 // nsHtml5MetaScanner::tryCharset. Unfortunately, the
1252 // trickery with member fields there leads to some copy-paste reuse. :-(
1253 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1254 if (mCharsetSource >= kCharsetFromMetaTag) { // this threshold corresponds to "confident" in the HTML5 spec
1255 return false;
1258 nsAutoCString newEncoding;
1259 CopyUTF16toUTF8(*aEncoding, newEncoding);
1261 if (!PreferredForInternalEncodingDecl(newEncoding)) {
1262 return false;
1265 if (mReparseForbidden) {
1266 // This mReparseForbidden check happens after the call to
1267 // PreferredForInternalEncodingDecl so that if that method calls
1268 // MaybeComplainAboutCharset, its charset complaint wins over the one
1269 // below.
1270 mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaTooLate",
1271 true,
1272 mTokenizer->getLineNumber());
1273 return false; // not reparsing even if we wanted to
1276 // Avoid having the chardet ask for another restart after this restart
1277 // request.
1278 mFeedChardet = false;
1279 mTreeBuilder->NeedsCharsetSwitchTo(newEncoding,
1280 kCharsetFromMetaTag,
1281 mTokenizer->getLineNumber());
1282 FlushTreeOpsAndDisarmTimer();
1283 Interrupt();
1284 // the tree op executor will cause the stream parser to terminate
1285 // if the charset switch request is accepted or it'll uninterrupt
1286 // if the request failed. Note that if the restart request fails,
1287 // we don't bother trying to make chardet resume. Might as well
1288 // assume that chardet-requested restarts would fail, too.
1289 return true;
1292 void
1293 nsHtml5StreamParser::FlushTreeOpsAndDisarmTimer()
1295 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1296 if (mFlushTimerArmed) {
1297 // avoid calling Cancel if the flush timer isn't armed to avoid acquiring
1298 // a mutex
1299 mFlushTimer->Cancel();
1300 mFlushTimerArmed = false;
1302 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1303 mTokenizer->FlushViewSource();
1305 mTreeBuilder->Flush();
1306 if (NS_FAILED(NS_DispatchToMainThread(mExecutorFlusher))) {
1307 NS_WARNING("failed to dispatch executor flush event");
// Tokenizes all data currently held in the buffer chain.
//
// Must run on the parser thread with mTokenizerMutex held (both are
// asserted below). Returns immediately if the parser is terminated or
// interrupted. Otherwise loops until either the buffers are exhausted or
// the parser becomes terminated/interrupted:
//  - exhausted while the stream is still being read: pending loads are
//    flushed and mLoadFlusher is posted to the main thread;
//  - exhausted after the stream has ended: EOF is processed once (guarded
//    by mAtEOF) and the tree ops are flushed;
//  - a script seen by the tree builder starts a new speculation.
//
// NOTE(review): this listing appears to have lost lines that contain no
// letters (brace-only lines and, e.g., the final argument of the three
// MaybeComplainAboutCharset calls in the STREAM_ENDED case) -- confirm
// against the canonical file before editing the code.
1311 void
1312 nsHtml5StreamParser::ParseAvailableData()
1314 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1315 mTokenizerMutex.AssertCurrentThreadOwns();
1317 if (IsTerminatedOrInterrupted()) {
1318 return;
1321 for (;;) {
// The current buffer has been fully consumed.
1322 if (!mFirstBuffer->hasMore()) {
// ...and it is also the last buffer in the chain: what happens next
// depends on whether more stream data is expected.
1323 if (mFirstBuffer == mLastBuffer) {
1324 switch (mStreamState) {
1325 case STREAM_BEING_READ:
1326 // never release the last buffer.
1327 if (!mSpeculating) {
1328 // reuse buffer space if not speculating
1329 mFirstBuffer->setStart(0);
1330 mFirstBuffer->setEnd(0);
1332 mTreeBuilder->FlushLoads();
1333 // Dispatch this runnable unconditionally, because the loads
1334 // that need flushing may have been flushed earlier even if the
1335 // flush right above here did nothing.
1336 if (NS_FAILED(NS_DispatchToMainThread(mLoadFlusher))) {
1337 NS_WARNING("failed to dispatch load flush event");
1339 return; // no more data for now but expecting more
1340 case STREAM_ENDED:
// mAtEOF makes the end-of-stream processing below run only once.
1341 if (mAtEOF) {
1342 return;
1344 mAtEOF = true;
// The encoding never became "confident": complain about the missing
// declaration, with a message chosen by frame origin and mode.
1345 if (mCharsetSource < kCharsetFromMetaTag) {
1346 if (mInitialEncodingWasFromParentFrame) {
1347 // Unfortunately, this check doesn't take effect for
1348 // cross-origin frames, so cross-origin ad frames that have
1349 // no text and only an image or a Flash embed get the more
1350 // severe message from the next if block. The message is
1351 // technically accurate, though.
1352 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclarationFrame",
1353 false,
1355 } else if (mMode == NORMAL) {
1356 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclaration",
1357 true,
1359 } else if (mMode == PLAIN_TEXT) {
1360 mTreeBuilder->MaybeComplainAboutCharset("EncNoDeclarationPlain",
1361 true,
1365 mTokenizer->eof();
1366 mTreeBuilder->StreamEnded();
1367 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1368 mTokenizer->EndViewSource();
1370 FlushTreeOpsAndDisarmTimer();
1371 return; // no more data and not expecting more
1372 default:
1373 NS_NOTREACHED("It should be impossible to reach this.");
1374 return;
// Not the last buffer: advance to the next one in the chain and re-check.
1377 mFirstBuffer = mFirstBuffer->next;
1378 continue;
1381 // now we have a non-empty buffer
// adjust() is given the CR state carried over from the previous
// tokenizeBuffer() call; the buffer may have nothing left afterwards,
// hence the second hasMore() check.
1382 mFirstBuffer->adjust(mLastWasCR);
1383 mLastWasCR = false;
1384 if (mFirstBuffer->hasMore()) {
1385 mLastWasCR = mTokenizer->tokenizeBuffer(mFirstBuffer);
1386 // At this point, internalEncodingDeclaration() may have called
1387 // Terminate, but that never happens together with script.
1388 // Can't assert that here, though, because it's possible that the main
1389 // thread has called Terminate() while this thread was parsing.
1390 if (mTreeBuilder->HasScript()) {
1391 // HasScript() cannot return true if the tree builder is preventing
1392 // script execution.
1393 MOZ_ASSERT(mMode == NORMAL);
// Start a speculation: record the current buffer position, line number
// and a tree builder snapshot, flush what has been built so far, then
// redirect further tree ops into the speculation.
1394 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
1395 nsHtml5Speculation* speculation =
1396 new nsHtml5Speculation(mFirstBuffer,
1397 mFirstBuffer->getStart(),
1398 mTokenizer->getLineNumber(),
1399 mTreeBuilder->newSnapshot());
1400 mTreeBuilder->AddSnapshotToScript(speculation->GetSnapshot(),
1401 speculation->GetStartLineNumber());
1402 FlushTreeOpsAndDisarmTimer();
1403 mTreeBuilder->SetOpSink(speculation);
1404 mSpeculations.AppendElement(speculation); // adopts the pointer
1405 mSpeculating = true;
1407 if (IsTerminatedOrInterrupted()) {
1408 return;
1411 continue;
1415 class nsHtml5StreamParserContinuation : public nsRunnable
1417 private:
1418 nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1419 public:
1420 explicit nsHtml5StreamParserContinuation(nsHtml5StreamParser* aStreamParser)
1421 : mStreamParser(aStreamParser)
1423 NS_IMETHODIMP Run()
1425 mozilla::MutexAutoLock autoLock(mStreamParser->mTokenizerMutex);
1426 mStreamParser->Uninterrupt();
1427 mStreamParser->ParseAvailableData();
1428 return NS_OK;
// Main-thread method that resolves the oldest speculation (mSpeculations[0])
// and then posts an nsHtml5StreamParserContinuation to the parser thread.
//
// The speculation fails if aLastWasCR is set, if aTokenizer is not in the
// data state, or if aTreeBuilder does not match the speculation's snapshot.
// On failure the stream is rewound to where the speculation started, all
// speculations and speculative ops are discarded, and the state of
// aTokenizer/aTreeBuilder is loaded into this parser's tokenizer and tree
// builder. On success the speculation's ops are flushed to the executor.
//
// aTokenizer   - tokenizer whose state is checked and, on failure, loaded
// aTreeBuilder - tree builder compared against the snapshot and, on
//                failure, loaded
// aLastWasCR   - if true the speculation fails; on failure the value is
//                copied to mLastWasCR
//
// Does nothing if the executor is broken.
//
// NOTE(review): brace-only lines appear to be missing from this listing,
// so the exact extents of the speculation-mutex and tokenizer-mutex scopes
// are not visible here -- confirm against the canonical file.
1432 void
1433 nsHtml5StreamParser::ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
1434 nsHtml5TreeBuilder* aTreeBuilder,
1435 bool aLastWasCR)
1437 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1438 NS_ASSERTION(!(mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML),
1439 "ContinueAfterScripts called in view source mode!");
1440 if (NS_FAILED(mExecutor->IsBroken())) {
1441 return;
1443 #ifdef DEBUG
1444 mExecutor->AssertStageEmpty();
1445 #endif
1446 bool speculationFailed = false;
// First phase, under the speculation mutex: decide whether the oldest
// speculation succeeded.
1448 mozilla::MutexAutoLock speculationAutoLock(mSpeculationMutex);
1449 if (mSpeculations.IsEmpty()) {
1450 NS_NOTREACHED("ContinueAfterScripts called without speculations.");
1451 return;
1453 nsHtml5Speculation* speculation = mSpeculations.ElementAt(0);
1454 if (aLastWasCR ||
1455 !aTokenizer->isInDataState() ||
1456 !aTreeBuilder->snapshotMatches(speculation->GetSnapshot())) {
1457 speculationFailed = true;
1458 // We've got a failed speculation :-(
1459 Interrupt(); // Make the parser thread release the tokenizer mutex sooner
1460 // now fall out of the speculationAutoLock into the tokenizerAutoLock block
1461 } else {
1462 // We've got a successful speculation!
1463 if (mSpeculations.Length() > 1) {
1464 // the first speculation isn't the current speculation, so there's
1465 // no need to bother the parser thread.
1466 speculation->FlushToSink(mExecutor);
1467 NS_ASSERTION(!mExecutor->IsScriptExecuting(),
1468 "ParseUntilBlocked() was supposed to ensure we don't come "
1469 "here when scripts are executing.");
1470 NS_ASSERTION(mExecutor->IsInFlushLoop(), "How are we here if "
1471 "RunFlushLoop() didn't call ParseUntilBlocked() which is the "
1472 "only caller of this method?");
1473 mSpeculations.RemoveElementAt(0);
1474 return;
1476 // else
1477 Interrupt(); // Make the parser thread release the tokenizer mutex sooner
1479 // now fall through
1480 // the first speculation is the current speculation. Need to
1481 // release the speculation mutex and acquire the tokenizer
1482 // mutex. (Just acquiring the other mutex here would deadlock)
// Second phase, under the tokenizer mutex: either rewind (failure) or
// flush the speculation (success), then schedule the continuation.
1486 mozilla::MutexAutoLock tokenizerAutoLock(mTokenizerMutex);
1487 #ifdef DEBUG
// Debug builds: let the main thread look up atoms in mAtomTable while it
// manipulates parser state here; the parser thread is permitted again at
// the end of this method.
1489 nsCOMPtr<nsIThread> mainThread;
1490 NS_GetMainThread(getter_AddRefs(mainThread));
1491 mAtomTable.SetPermittedLookupThread(mainThread);
1493 #endif
1494 // In principle, the speculation mutex should be acquired here,
1495 // but there's no point, because the parser thread only acquires it
1496 // when it has also acquired the tokenizer mutex and we are already
1497 // holding the tokenizer mutex.
1498 if (speculationFailed) {
1499 // Rewind the stream
1500 mAtEOF = false;
1501 nsHtml5Speculation* speculation = mSpeculations.ElementAt(0);
1502 mFirstBuffer = speculation->GetBuffer();
1503 mFirstBuffer->setStart(speculation->GetStart());
1504 mTokenizer->setLineNumber(speculation->GetStartLineNumber());
1506 nsContentUtils::ReportToConsole(nsIScriptError::warningFlag,
1507 NS_LITERAL_CSTRING("DOM Events"),
1508 mExecutor->GetDocument(),
1509 nsContentUtils::eDOM_PROPERTIES,
1510 "SpeculationFailed",
1511 nullptr, 0,
1512 nullptr,
1513 EmptyString(),
1514 speculation->GetStartLineNumber());
// Every buffer after the rewind point is reset to its beginning so that it
// is tokenized again from the start.
1516 nsHtml5OwningUTF16Buffer* buffer = mFirstBuffer->next;
1517 while (buffer) {
1518 buffer->setStart(0);
1519 buffer = buffer->next;
1522 mSpeculations.Clear(); // potentially a huge number of destructors
1523 // run here synchronously on the main thread...
1525 mTreeBuilder->flushCharacters(); // empty the pending buffer
1526 mTreeBuilder->ClearOps(); // now get rid of the failed ops
1528 mTreeBuilder->SetOpSink(mExecutor->GetStage());
1529 mExecutor->StartReadingFromStage();
1530 mSpeculating = false;
1532 // Copy state over
1533 mLastWasCR = aLastWasCR;
1534 mTokenizer->loadState(aTokenizer);
1535 mTreeBuilder->loadState(aTreeBuilder, &mAtomTable);
1536 } else {
1537 // We've got a successful speculation and at least a moment ago it was
1538 // the current speculation
1539 mSpeculations.ElementAt(0)->FlushToSink(mExecutor);
1540 NS_ASSERTION(!mExecutor->IsScriptExecuting(),
1541 "ParseUntilBlocked() was supposed to ensure we don't come "
1542 "here when scripts are executing.");
1543 NS_ASSERTION(mExecutor->IsInFlushLoop(), "How are we here if "
1544 "RunFlushLoop() didn't call ParseUntilBlocked() which is the "
1545 "only caller of this method?");
1546 mSpeculations.RemoveElementAt(0);
1547 if (mSpeculations.IsEmpty()) {
1548 // yes, it was still the only speculation. Now stop speculating
1549 // However, before telling the executor to read from stage, flush
1550 // any pending ops straight to the executor, because otherwise
1551 // they remain unflushed until we get more data from the network.
1552 mTreeBuilder->SetOpSink(mExecutor);
1553 mTreeBuilder->Flush(true);
1554 mTreeBuilder->SetOpSink(mExecutor->GetStage());
1555 mExecutor->StartReadingFromStage();
1556 mSpeculating = false;
// Resume parsing on the parser thread; the continuation uninterrupts the
// parser and calls ParseAvailableData().
1559 nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this);
1560 if (NS_FAILED(mThread->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
1561 NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation");
1563 // A stream event might run before this event runs, but that's harmless.
1564 #ifdef DEBUG
1565 mAtomTable.SetPermittedLookupThread(mThread);
1566 #endif
1570 void
1571 nsHtml5StreamParser::ContinueAfterFailedCharsetSwitch()
1573 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1574 nsCOMPtr<nsIRunnable> event = new nsHtml5StreamParserContinuation(this);
1575 if (NS_FAILED(mThread->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
1576 NS_WARNING("Failed to dispatch nsHtml5StreamParserContinuation");
1580 class nsHtml5TimerKungFu : public nsRunnable
1582 private:
1583 nsHtml5RefPtr<nsHtml5StreamParser> mStreamParser;
1584 public:
1585 explicit nsHtml5TimerKungFu(nsHtml5StreamParser* aStreamParser)
1586 : mStreamParser(aStreamParser)
1588 NS_IMETHODIMP Run()
1590 if (mStreamParser->mFlushTimer) {
1591 mStreamParser->mFlushTimer->Cancel();
1592 mStreamParser->mFlushTimer = nullptr;
1594 return NS_OK;
1598 void
1599 nsHtml5StreamParser::DropTimer()
1601 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
1603 * Simply nulling out the timer wouldn't work, because if the timer is
1604 * armed, it needs to be canceled first. Simply canceling it first wouldn't
1605 * work, because nsTimerImpl::Cancel is not safe for calling from outside
1606 * the thread where nsTimerImpl::Fire would run. It's not safe to
1607 * dispatch a runnable to cancel the timer from the destructor of this
1608 * class, because the timer has a weak (void*) pointer back to this instance
1609 * of the stream parser and having the timer fire before the runnable
1610 * cancels it would make the timer access a deleted object.
1612 * This DropTimer method addresses these issues. This method must be called
1613 * on the main thread before the destructor of this class is reached.
1614 * The nsHtml5TimerKungFu object has an nsHtml5RefPtr that addrefs this
1615 * stream parser object to keep it alive until the runnable is done.
1616 * The runnable cancels the timer on the parser thread, drops the timer
1617 * and lets nsHtml5RefPtr send a runnable back to the main thread to
1618 * release the stream parser.
1620 if (mFlushTimer) {
1621 nsCOMPtr<nsIRunnable> event = new nsHtml5TimerKungFu(this);
1622 if (NS_FAILED(mThread->Dispatch(event, nsIThread::DISPATCH_NORMAL))) {
1623 NS_WARNING("Failed to dispatch TimerKungFu event");
1628 // Using a static, because the method name Notify is taken by the chardet
1629 // callback.
1630 void
1631 nsHtml5StreamParser::TimerCallback(nsITimer* aTimer, void* aClosure)
1633 (static_cast<nsHtml5StreamParser*> (aClosure))->TimerFlush();
1636 void
1637 nsHtml5StreamParser::TimerFlush()
1639 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1640 mozilla::MutexAutoLock autoLock(mTokenizerMutex);
1642 NS_ASSERTION(!mSpeculating, "Flush timer fired while speculating.");
1644 // The timer fired if we got here. No need to cancel it. Mark it as
1645 // not armed, though.
1646 mFlushTimerArmed = false;
1648 mFlushTimerEverFired = true;
1650 if (IsTerminatedOrInterrupted()) {
1651 return;
1654 if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
1655 mTreeBuilder->Flush(); // delete useless ops
1656 if (mTokenizer->FlushViewSource()) {
1657 if (NS_FAILED(NS_DispatchToMainThread(mExecutorFlusher))) {
1658 NS_WARNING("failed to dispatch executor flush event");
1661 } else {
1662 // we aren't speculating and we don't know when new data is
1663 // going to arrive. Send data to the main thread.
1664 if (mTreeBuilder->Flush(true)) {
1665 if (NS_FAILED(NS_DispatchToMainThread(mExecutorFlusher))) {
1666 NS_WARNING("failed to dispatch executor flush event");
1672 void
1673 nsHtml5StreamParser::MarkAsBroken(nsresult aRv)
1675 NS_ASSERTION(IsParserThread(), "Wrong thread!");
1676 mTokenizerMutex.AssertCurrentThreadOwns();
1678 Terminate();
1679 mTreeBuilder->MarkAsBroken(aRv);
1680 mozilla::DebugOnly<bool> hadOps = mTreeBuilder->Flush(false);
1681 NS_ASSERTION(hadOps, "Should have had the markAsBroken op!");
1682 if (NS_FAILED(NS_DispatchToMainThread(mExecutorFlusher))) {
1683 NS_WARNING("failed to dispatch executor flush event");