1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef nsHtml5StreamParser_h
7 #define nsHtml5StreamParser_h
11 #include "MainThreadUtils.h"
12 #include "mozilla/AlreadyAddRefed.h"
13 #include "mozilla/Assertions.h"
14 #include "mozilla/Encoding.h"
15 #include "mozilla/Mutex.h"
16 #include "mozilla/NotNull.h"
17 #include "mozilla/RefPtr.h"
18 #include "mozilla/Span.h"
19 #include "mozilla/UniquePtr.h"
20 #include "nsCharsetSource.h"
22 #include "nsCycleCollectionParticipant.h"
24 #include "nsHtml5AtomTable.h"
25 #include "nsIRequestObserver.h"
26 #include "nsISerialEventTarget.h"
27 #include "nsISupports.h"
28 #include "nsStringFwd.h"
32 class nsCycleCollectionTraversalCallback
;
33 class nsHtml5OwningUTF16Buffer
;
35 class nsHtml5Speculation
;
37 class nsHtml5Tokenizer
;
38 class nsHtml5TreeBuilder
;
39 class nsHtml5TreeOpExecutor
;
48 class EncodingDetector
;
55 } // namespace mozilla
59 * Parse a document normally as HTML.
64 * View document as HTML source.
69 * View document as XML source
74 * View document as plain text source
79 * View document as plain text
91 * BOM sniffing hasn't started.
93 BOM_SNIFFING_NOT_STARTED
,
96 * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been
99 SEEN_UTF_16_LE_FIRST_BYTE
,
102 * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been
105 SEEN_UTF_16_BE_FIRST_BYTE
,
108 * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been
111 SEEN_UTF_8_FIRST_BYTE
,
114 * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM
117 SEEN_UTF_8_SECOND_BYTE
,
120 * Seen \x00 in UTF-16BE bogo-XML declaration.
122 SEEN_UTF_16_BE_XML_FIRST
,
125 * Seen \x00< in UTF-16BE bogo-XML declaration.
127 SEEN_UTF_16_BE_XML_SECOND
,
130 * Seen \x00<\x00 in UTF-16BE bogo-XML declaration.
132 SEEN_UTF_16_BE_XML_THIRD
,
135 * Seen \x00<\x00? in UTF-16BE bogo-XML declaration.
137 SEEN_UTF_16_BE_XML_FOURTH
,
140 * Seen \x00<\x00?\x00 in UTF-16BE bogo-XML declaration.
142 SEEN_UTF_16_BE_XML_FIFTH
,
145 * Seen < in UTF-16LE bogo-XML declaration.
147 SEEN_UTF_16_LE_XML_FIRST
,
150 * Seen <\x00 in UTF-16LE bogo-XML declaration.
152 SEEN_UTF_16_LE_XML_SECOND
,
155 * Seen <\x00? in UTF-16LE bogo-XML declaration.
157 SEEN_UTF_16_LE_XML_THIRD
,
160 * Seen <\x00?\x00 in UTF-16LE bogo-XML declaration.
162 SEEN_UTF_16_LE_XML_FOURTH
,
165 * Seen <\x00?\x00x in UTF-16LE bogo-XML declaration.
167 SEEN_UTF_16_LE_XML_FIFTH
,
170 * BOM sniffing was started but is now over for whatever reason.
175 enum eHtml5StreamState
{
176 STREAM_NOT_STARTED
= 0,
177 STREAM_BEING_READ
= 1,
181 class nsHtml5StreamParser final
: public nsISupports
{
182 template <typename T
>
183 using NotNull
= mozilla::NotNull
<T
>;
184 using Encoding
= mozilla::Encoding
;
// Sizing constants used by the stream parser. They are compile-time values,
// so they are declared `static constexpr`: one definition is shared by every
// instance instead of each parser object carrying its own `const` copies.
static constexpr uint32_t UNCONDITIONAL_META_SCAN_BOUNDARY = 1024;
static constexpr uint32_t READ_BUFFER_SIZE = 1024;
static constexpr uint32_t LOCAL_FILE_UTF_8_BUFFER_SIZE =
    1024 * 1024 * 4;  // 4 MB
190 friend class nsHtml5RequestStopper
;
191 friend class nsHtml5DataAvailable
;
192 friend class nsHtml5StreamParserContinuation
;
193 friend class nsHtml5TimerKungFu
;
194 friend class nsHtml5StreamParserPtr
;
195 friend class nsHtml5StreamListener
;
198 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
199 NS_DECL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser
)
201 nsHtml5StreamParser(nsHtml5TreeOpExecutor
* aExecutor
, nsHtml5Parser
* aOwner
,
204 nsresult
OnStartRequest(nsIRequest
* aRequest
);
206 nsresult
OnDataAvailable(nsIRequest
* aRequest
, nsIInputStream
* aInStream
,
207 uint64_t aSourceOffset
, uint32_t aLength
);
209 nsresult
OnStopRequest(nsIRequest
* aRequest
, nsresult status
);
211 // EncodingDeclarationHandler
212 // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
214 * Tree builder uses this to report a late <meta charset>
216 bool internalEncodingDeclaration(nsHtml5String aEncoding
);
218 bool TemplatePushedOrHeadPopped();
220 void RememberGt(int32_t aPos
);
222 // Not from an external interface
225 * Post a runnable to the main thread to perform the speculative load
226 * operations without performing the tree operations.
228 * This should be called at the end of each data available or stop
229 * request runnable running on the parser thread.
231 void PostLoadFlusher();
234 * Pass a buffer to chardetng.
236 void FeedDetector(mozilla::Span
<const uint8_t> aBuffer
);
239 * Report EOF to chardetng.
244 * Call this method once you've created a parser, and want to instruct it
245 * about what charset to load
247 * @param aEncoding the charset of a document
248 * @param aSource the source of the charset
// Stores the caller-supplied encoding, its source and the force-detection
// flag on this parser. The assertions below require that the stream has not
// started yet and that the call is made on the main thread.
250 inline void SetDocumentCharset(NotNull
<const Encoding
*> aEncoding
,
251 nsCharsetSource aSource
,
252 bool aForceAutoDetection
) {
253 MOZ_ASSERT(mStreamState
== STREAM_NOT_STARTED
,
254 "SetDocumentCharset called too late.");
255 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");
// Forcing auto-detection is only accepted together with a source that ranks
// below kCharsetFromOtherComponent.
256 MOZ_ASSERT(!(aForceAutoDetection
&& aSource
>= kCharsetFromOtherComponent
),
257 "Can't force with high-ranking source.");
258 mEncoding
= aEncoding
;
259 mCharsetSource
= aSource
;
260 mForceAutoDetection
= aForceAutoDetection
;
// Remember whether the charset was supplied by the channel.
261 mChannelHadCharset
= (aSource
== kCharsetFromChannel
);
264 nsresult
GetChannel(nsIChannel
** aChannel
);
267 * The owner parser must call this after script execution
268 * when no scripts are executing and the document.written
269 * buffer has been exhausted.
271 * If the first two arguments are nullptr, instead of
272 * continuing after scripts, this method commits to an
273 * internally-discovered encoding.
275 void ContinueAfterScriptsOrEncodingCommitment(
276 nsHtml5Tokenizer
* aTokenizer
, nsHtml5TreeBuilder
* aTreeBuilder
,
280 * Continues the stream parser if the charset switch failed.
282 void ContinueAfterFailedCharsetSwitch();
284 void Terminate() { mTerminated
= true; }
289 * Sets the URL for View Source title in case this parser ends up being
290 * used for View Source. If aURL is a view-source: URL, takes the inner
291 * URL. data: URLs are shown with an ellipsis instead of the actual data.
293 void SetViewSourceTitle(nsIURI
* aURL
);
296 virtual ~nsHtml5StreamParser();
299 bool IsParserThread() { return mEventTarget
->IsOnCurrentThread(); }
302 void MarkAsBroken(nsresult aRv
);
305 * Marks the stream parser as interrupted. If you ever add calls to this
306 * method, be sure to review Uninterrupt usage very, very carefully to
307 * avoid having a previous in-flight runnable cancel your Interrupt()
308 * call on the other thread too soon.
311 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");
// Clears the interrupted flag. Asserts that the caller is on the parser
// thread and that the current thread owns mTokenizerMutex.
// NOTE(review): MOZ_NO_THREAD_SAFETY_ANALYSIS presumably opts this method out
// of static lock analysis; ownership is asserted at runtime instead.
315 void Uninterrupt() MOZ_NO_THREAD_SAFETY_ANALYSIS
{
316 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
317 mTokenizerMutex
.AssertCurrentThreadOwns();
318 mInterrupted
= false;
322 * Flushes the tree ops from the tree builder and disarms the flush
325 void FlushTreeOpsAndDisarmTimer();
327 void SwitchDecoderIfAsciiSoFar(NotNull
<const Encoding
*> aEncoding
)
328 MOZ_REQUIRES(mTokenizerMutex
);
333 void DiscardMetaSpeculation();
335 bool ProcessLookingForMetaCharset(bool aEof
) MOZ_REQUIRES(mTokenizerMutex
);
337 void ParseAvailableData();
339 void DoStopRequest();
341 void DoDataAvailableBuffer(mozilla::Buffer
<uint8_t>&& aBuffer
)
342 MOZ_REQUIRES(mTokenizerMutex
);
344 void DoDataAvailable(mozilla::Span
<const uint8_t> aBuffer
)
345 MOZ_REQUIRES(mTokenizerMutex
);
347 static nsresult
CopySegmentsToParser(nsIInputStream
* aInStream
,
348 void* aClosure
, const char* aFromSegment
,
349 uint32_t aToOffset
, uint32_t aCount
,
350 uint32_t* aWriteCount
)
351 MOZ_REQUIRES(mTokenizerMutex
);
353 bool IsTerminatedOrInterrupted() { return mTerminated
|| mInterrupted
; }
355 bool IsTerminated() { return mTerminated
; }
358 * True when there is a Unicode decoder already
360 inline bool HasDecoder() { return !!mUnicodeDecoder
; }
363 * Returns 0 if 1) there aren't at least 2 buffers in mBufferedBytes
364 * or 2) there is no byte '>' in the second buffer.
365 * Otherwise, returns the length of the prefix of the second buffer
366 * that is long enough to contain the first byte '>' in the second
367 * buffer (including the '>' byte).
369 size_t LengthOfLtContainingPrefixInSecondBuffer();
372 * Push bytes from network when there is no Unicode decoder yet
374 nsresult
SniffStreamBytes(mozilla::Span
<const uint8_t> aFromSegment
,
375 bool aEof
) MOZ_REQUIRES(mTokenizerMutex
);
378 * Push bytes from network when there is a Unicode decoder already
380 nsresult
WriteStreamBytes(mozilla::Span
<const uint8_t> aFromSegment
)
381 MOZ_REQUIRES(mTokenizerMutex
);
384 * Set up the Unicode decoder and write the sniffing buffer into it
385 * followed by the current network buffer.
387 * @param aPrefix the part of the stream that has already been seen
388 * prior to aFromSegment. In practice, these are the
389 * bytes that are baked into the state of the BOM
390 * and UTF-16 XML declaration-like sniffing state
392 * @param aFromSegment The current network buffer
394 nsresult
SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
395 mozilla::Span
<const uint8_t> aPrefix
,
396 mozilla::Span
<const uint8_t> aFromSegment
) MOZ_REQUIRES(mTokenizerMutex
);
399 * Initialize the Unicode decoder, mark the BOM as the source and
402 * @param aEncoding The encoding for the decoder
403 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
406 void SetupDecodingFromBom(NotNull
<const Encoding
*> aEncoding
);
408 void SetupDecodingFromUtf16BogoXml(NotNull
<const Encoding
*> aEncoding
);
411 * When speculatively decoding from file: URL as UTF-8, commit
412 * to UTF-8 as the non-speculative encoding and start processing
415 [[nodiscard
]] nsresult
CommitLocalFileToEncoding();
418 * When speculatively decoding from file: URL as UTF-8, redecode
419 * using fallback and then continue normally with the fallback.
421 [[nodiscard
]] nsresult
ReDecodeLocalFile() MOZ_REQUIRES(mTokenizerMutex
);
424 * Potentially guess the encoding using mozilla::EncodingDetector.
425 * Returns the guessed encoding and a telemetry-appropriate source.
427 std::tuple
<NotNull
<const Encoding
*>, nsCharsetSource
> GuessEncoding(
431 * Become confident or resolve an encoding name to its preferred form.
432 * @param aEncoding the value of an internal encoding decl. Acts as an
433 * out param, too, when the method returns true.
434 * @return true if the parser needs to start using the new value of
435 * aEncoding and false if the parser became confident or if
436 * the encoding name did not specify a usable encoding
438 const Encoding
* PreferredForInternalEncodingDecl(const nsAString
& aEncoding
);
441 * Callback for mFlushTimer.
443 static void TimerCallback(nsITimer
* aTimer
, void* aClosure
);
446 * Parser thread entry point for (maybe) flushing the ops and posting
447 * a flush runnable back on the main thread.
452 * Called when speculation fails.
454 void MaybeDisableFutureSpeculation() { mSpeculationFailureCount
++; }
457 * Used to check whether we're getting too many speculation failures and
458 * should just stop trying. The 100 is picked pretty randomly to be not too
459 * small (so most pages are not affected) but small enough that we don't end
460 * up with failed speculations over and over in pathological cases.
462 bool IsSpeculationEnabled() { return mSpeculationFailureCount
< 100; }
465 * Dispatch an event to a Quantum DOM main thread-ish thread.
466 * (Not the parser thread.)
468 nsresult
DispatchToMain(already_AddRefed
<nsIRunnable
>&& aRunnable
);
471 * Notify any devtools listeners about content newly received for parsing.
473 inline void OnNewContent(mozilla::Span
<const char16_t
> aData
);
476 * Notify any devtools listeners after all parse content has been received.
478 inline void OnContentComplete();
480 nsCOMPtr
<nsIRequest
> mRequest
;
483 * The document title to use if this turns out to be a View Source parser.
485 nsCString mViewSourceTitle
;
488 * The Unicode decoder
490 mozilla::UniquePtr
<mozilla::Decoder
> mUnicodeDecoder
;
497 // encoding-related stuff
499 * The source (confidence) of the character encoding in use
501 nsCharsetSource mCharsetSource
;
503 nsCharsetSource mEncodingSwitchSource
;
506 * The character encoding in use
508 NotNull
<const Encoding
*> mEncoding
;
510 const Encoding
* mNeedsEncodingSwitchTo
;
512 bool mSeenEligibleMetaCharset
;
518 bool mStartedFeedingDetector
;
520 bool mStartedFeedingDevTools
;
525 * Whether reparse is forbidden
527 bool mReparseForbidden
;
530 * Whether the Repair Text Encoding menu item was invoked
532 bool mForceAutoDetection
;
535 * Whether there was a valid charset parameter on the HTTP layer.
537 bool mChannelHadCharset
;
540 * We are in the process of looking for <meta charset>
542 bool mLookingForMetaCharset
;
545 * Whether the byte stream started with ASCII <?
547 bool mStartsWithLtQuestion
;
550 * If we are viewing XML source and are waiting for a '>' from the network.
552 bool mLookingForXmlDeclarationForXmlViewSource
;
555 * Whether template has been pushed or head popped within the first 1024
558 bool mTemplatePushedOrHeadPopped
;
560 // Portable parser objects
562 * The first buffer in the pending UTF-16 buffer queue
564 RefPtr
<nsHtml5OwningUTF16Buffer
> mFirstBuffer
;
567 * Non-owning pointer to the most recent buffer that contains the most recent
568 * remembered greater-than sign. Used only while mLookingForMetaCharset is
569 * true. While mLookingForMetaCharset is true, mFirstBuffer is not changed and
570 * keeps the whole linked list of buffers alive. This pointer is non-owning to
571 * avoid frequent refcounting.
573 nsHtml5OwningUTF16Buffer
* mGtBuffer
;
578 * The last buffer in the pending UTF-16 buffer queue
580 nsHtml5OwningUTF16Buffer
*
581 mLastBuffer
; // weak ref; always points to
582 // a buffer of the size
583 // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
586 * The first buffer of the document if looking for <meta charset> or
587 * nullptr afterwards.
589 RefPtr
<nsHtml5OwningUTF16Buffer
> mFirstBufferOfMetaScan
;
592 * The tree operation executor
594 nsHtml5TreeOpExecutor
* mExecutor
;
597 * Network event target for mExecutor->mDocument
599 nsCOMPtr
<nsISerialEventTarget
> mNetworkEventTarget
;
602 * The HTML5 tree builder
604 mozilla::UniquePtr
<nsHtml5TreeBuilder
> mTreeBuilder
;
607 * The HTML5 tokenizer
609 mozilla::UniquePtr
<nsHtml5Tokenizer
> mTokenizer
;
612 * Makes sure the main thread can't mess the tokenizer state while it's
613 * tokenizing. This mutex also protects the current speculation.
615 mozilla::Mutex mTokenizerMutex
;
618 * The scoped atom table
620 nsHtml5AtomTable mAtomTable
;
625 RefPtr
<nsHtml5Parser
> mOwner
;
628 * Whether the last character tokenized was a carriage return (for CRLF)
633 * For tracking stream life cycle
635 eHtml5StreamState mStreamState
;
638 * Whether we are speculating.
643 * Whether the tokenizer has reached EOF. (Reset when stream is rewound.)
648 * The speculations. The mutex protects the nsTArray itself.
649 * To access the queue of current speculation, mTokenizerMutex must be
651 * The current speculation is the last element
653 nsTArray
<mozilla::UniquePtr
<nsHtml5Speculation
>> mSpeculations
;
654 mozilla::Mutex mSpeculationMutex
;
657 * Number of times speculation has failed for this parser.
659 mozilla::Atomic
<uint32_t> mSpeculationFailureCount
;
662 * Number of bytes already buffered into mBufferedBytes.
664 uint32_t mNumBytesBuffered
;
666 nsTArray
<mozilla::Buffer
<uint8_t>> mBufferedBytes
;
669 * True to terminate early.
671 mozilla::Atomic
<bool> mTerminated
;
674 * True to release mTokenizerMutex early.
676 mozilla::Atomic
<bool> mInterrupted
;
679 * The thread this stream parser runs on.
681 nsCOMPtr
<nsISerialEventTarget
> mEventTarget
;
683 nsCOMPtr
<nsIRunnable
> mExecutorFlusher
;
685 nsCOMPtr
<nsIRunnable
> mLoadFlusher
;
688 * This runnable is distinct from the regular flushers to
689 * signal the intent of encoding commitment without having to
690 * protect mPendingEncodingCommitment in the executor with a
693 nsCOMPtr
<nsIRunnable
> mEncodingCommitter
;
696 * The generic detector.
698 mozilla::UniquePtr
<mozilla::EncodingDetector
> mDetector
;
701 * The TLD we're loading from or empty if unknown.
706 * Whether the initial charset source was kCharsetFromParentFrame
708 bool mInitialEncodingWasFromParentFrame
;
712 bool mDetectorHasSeenNonAscii
;
715 * If true, we are decoding a local file that lacks an encoding
716 * declaration and we are not tokenizing yet.
718 bool mDecodingLocalFileWithoutTokenizing
;
721 * Whether we are keeping the incoming bytes.
723 bool mBufferingBytes
;
726 * Timer for flushing tree ops once in a while when not speculating.
728 nsCOMPtr
<nsITimer
> mFlushTimer
;
731 * Mutex for protecting access to mFlushTimer (but not for the two
732 * mFlushTimerFoo booleans below).
734 mozilla::Mutex mFlushTimerMutex
;
737 * Keeps track whether mFlushTimer has been armed. Unfortunately,
738 * nsITimer doesn't enable querying this from the timer itself.
740 bool mFlushTimerArmed
;
743 * False initially and true after the timer has fired at least once.
745 bool mFlushTimerEverFired
;
748 * Whether the parser is doing a normal parse, view source or plain text.
753 * If the associated docshell is being watched by the devtools, this is
754 * set to the URI associated with the parse. All parse data is sent to the
755 * devtools, along with this URI. This URI is cleared out after the parse has
756 * been marked as completed.
758 nsCOMPtr
<nsIURI
> mURIToSendToDevtools
;
761 * If content is being sent to the devtools, an encoded UUID for the parser.
763 nsString mUUIDForDevtools
;
766 #endif // nsHtml5StreamParser_h