1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef nsHtml5StreamParser_h
7 #define nsHtml5StreamParser_h
11 #include "MainThreadUtils.h"
12 #include "mozilla/AlreadyAddRefed.h"
13 #include "mozilla/Assertions.h"
14 #include "mozilla/Atomics.h"
15 #include "mozilla/Encoding.h"
16 #include "mozilla/Mutex.h"
17 #include "mozilla/NotNull.h"
18 #include "mozilla/ReentrantMonitor.h"
19 #include "mozilla/RefPtr.h"
20 #include "mozilla/Span.h"
21 #include "mozilla/TimeStamp.h"
22 #include "mozilla/UniquePtr.h"
23 #include "nsCharsetSource.h"
25 #include "nsCycleCollectionParticipant.h"
27 #include "nsHtml5AtomTable.h"
28 #include "nsIRequestObserver.h"
29 #include "nsISerialEventTarget.h"
30 #include "nsISupports.h"
31 #include "nsStringFwd.h"
35 class nsCycleCollectionTraversalCallback
;
36 class nsHtml5OwningUTF16Buffer
;
38 class nsHtml5Speculation
;
40 class nsHtml5Tokenizer
;
41 class nsHtml5TreeBuilder
;
42 class nsHtml5TreeOpExecutor
;
51 class EncodingDetector
;
58 } // namespace mozilla
62 * Parse a document normally as HTML.
67 * View document as HTML source.
72 * View document as XML source
77 * View document as plain text source
82 * View document as plain text
94 * BOM sniffing hasn't started.
96 BOM_SNIFFING_NOT_STARTED
,
99 * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been
102 SEEN_UTF_16_LE_FIRST_BYTE
,
105 * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been
108 SEEN_UTF_16_BE_FIRST_BYTE
,
111 * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been
114 SEEN_UTF_8_FIRST_BYTE
,
117 * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM
120 SEEN_UTF_8_SECOND_BYTE
,
123 * Seen \x00 in UTF-16BE bogo-XML declaration.
125 SEEN_UTF_16_BE_XML_FIRST
,
128 * Seen \x00< in UTF-16BE bogo-XML declaration.
130 SEEN_UTF_16_BE_XML_SECOND
,
133 * Seen \x00<\x00 in UTF-16BE bogo-XML declaration.
135 SEEN_UTF_16_BE_XML_THIRD
,
138 * Seen \x00<\x00? in UTF-16BE bogo-XML declaration.
140 SEEN_UTF_16_BE_XML_FOURTH
,
143 * Seen \x00<\x00?\x00 in UTF-16BE bogo-XML declaration.
145 SEEN_UTF_16_BE_XML_FIFTH
,
148 * Seen < in UTF-16LE bogo-XML declaration.
150 SEEN_UTF_16_LE_XML_FIRST
,
153 * Seen <\x00 in UTF-16LE bogo-XML declaration.
155 SEEN_UTF_16_LE_XML_SECOND
,
158 * Seen <\x00? in UTF-16LE bogo-XML declaration.
160 SEEN_UTF_16_LE_XML_THIRD
,
163 * Seen <\x00?\x00 in UTF-16LE bogo-XML declaration.
165 SEEN_UTF_16_LE_XML_FOURTH
,
168 * Seen <\x00?\x00x in UTF-16LE bogo-XML declaration.
170 SEEN_UTF_16_LE_XML_FIFTH
,
173 * BOM sniffing was started but is now over for whatever reason.
178 enum eHtml5StreamState
{
179 STREAM_NOT_STARTED
= 0,
180 STREAM_BEING_READ
= 1,
184 class nsHtml5StreamParser final
: public nsISupports
{
185 template <typename T
>
186 using NotNull
= mozilla::NotNull
<T
>;
187 using Encoding
= mozilla::Encoding
;
// Size constants for the stream parser. They are compile-time values, so
// they are declared static constexpr: one shared definition instead of a
// copy stored inside every parser instance.
// NOTE(review): the names suggest the meta-scan limit, the read buffer size
// and the local-file UTF-8 buffering limit respectively -- confirm against
// the uses in the implementation file.
static constexpr uint32_t UNCONDITIONAL_META_SCAN_BOUNDARY = 1024;
static constexpr uint32_t READ_BUFFER_SIZE = 1024;
static constexpr uint32_t LOCAL_FILE_UTF_8_BUFFER_SIZE = 1024 * 1024 * 4;  // 4 MB
193 friend class nsHtml5RequestStopper
;
194 friend class nsHtml5DataAvailable
;
195 friend class nsHtml5StreamParserContinuation
;
196 friend class nsHtml5TimerKungFu
;
197 friend class nsHtml5StreamParserPtr
;
198 friend class nsHtml5StreamListener
;
201 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
202 NS_DECL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser
)
204 nsHtml5StreamParser(nsHtml5TreeOpExecutor
* aExecutor
, nsHtml5Parser
* aOwner
,
207 nsresult
OnStartRequest(nsIRequest
* aRequest
);
209 nsresult
OnDataAvailable(nsIRequest
* aRequest
, nsIInputStream
* aInStream
,
210 uint64_t aSourceOffset
, uint32_t aLength
);
212 * ReentrantMonitorAutoEnter is used for protecting access to
213 * nsHtml5StreamParser::mOnStopCalled and should be obtained from
214 * nsHtml5StreamListener::mDelegateMonitor
216 nsresult
OnStopRequest(
217 nsIRequest
* aRequest
, nsresult status
,
218 const mozilla::ReentrantMonitorAutoEnter
& aProofOfLock
);
220 // EncodingDeclarationHandler
221 // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
223 * Tree builder uses this to report a late <meta charset>
225 bool internalEncodingDeclaration(nsHtml5String aEncoding
);
227 bool TemplatePushedOrHeadPopped();
229 void RememberGt(int32_t aPos
);
231 // Not from an external interface
234 * Post a runnable to the main thread to perform the speculative load
235 * operations without performing the tree operations.
237 * This should be called at the end of each data available or stop
238 * request runnable running on the parser thread.
240 void PostLoadFlusher();
243 * Pass a buffer to chardetng.
245 void FeedDetector(mozilla::Span
<const uint8_t> aBuffer
);
248 * Report EOF to chardetng.
253 * Call this method once you've created a parser, and want to instruct it
254 * about what charset to load
256 * @param aEncoding the charset of a document
257 * @param aSource the source of the charset
259 inline void SetDocumentCharset(NotNull
<const Encoding
*> aEncoding
,
260 nsCharsetSource aSource
,
261 bool aForceAutoDetection
) {
262 MOZ_ASSERT(mStreamState
== STREAM_NOT_STARTED
,
263 "SetDocumentCharset called too late.");
264 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");
265 MOZ_ASSERT(!(aForceAutoDetection
&& aSource
>= kCharsetFromOtherComponent
),
266 "Can't force with high-ranking source.");
267 mEncoding
= aEncoding
;
268 mCharsetSource
= aSource
;
269 mForceAutoDetection
= aForceAutoDetection
;
270 mChannelHadCharset
= (aSource
== kCharsetFromChannel
);
273 nsresult
GetChannel(nsIChannel
** aChannel
);
276 * The owner parser must call this after script execution
277 * when no scripts are executing and the document.written
278 * buffer has been exhausted.
280 * If the first two arguments are nullptr, instead of
281 * continuing after scripts, this method commits to an
282 * internally-discovered encoding.
284 void ContinueAfterScriptsOrEncodingCommitment(
285 nsHtml5Tokenizer
* aTokenizer
, nsHtml5TreeBuilder
* aTreeBuilder
,
289 * Continues the stream parser if the charset switch failed.
291 void ContinueAfterFailedCharsetSwitch();
293 void Terminate() { mTerminated
= true; }
298 * Sets the URL for View Source title in case this parser ends up being
299 * used for View Source. If aURL is a view-source: URL, takes the inner
300 * URL. data: URLs are shown with an ellipsis instead of the actual data.
302 void SetViewSourceTitle(nsIURI
* aURL
);
305 virtual ~nsHtml5StreamParser();
308 bool IsParserThread() { return mEventTarget
->IsOnCurrentThread(); }
311 void MarkAsBroken(nsresult aRv
);
314 * Marks the stream parser as interrupted. If you ever add calls to this
315 * method, be sure to review Uninterrupt usage very, very carefully to
316 * avoid having a previous in-flight runnable cancel your Interrupt()
317 * call on the other thread too soon.
320 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");
324 void Uninterrupt() MOZ_NO_THREAD_SAFETY_ANALYSIS
{
325 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
326 mTokenizerMutex
.AssertCurrentThreadOwns();
327 mInterrupted
= false;
331 * Flushes the tree ops from the tree builder and disarms the flush
334 void FlushTreeOpsAndDisarmTimer();
336 void SwitchDecoderIfAsciiSoFar(NotNull
<const Encoding
*> aEncoding
)
337 MOZ_REQUIRES(mTokenizerMutex
);
342 void DiscardMetaSpeculation();
344 bool ProcessLookingForMetaCharset(bool aEof
) MOZ_REQUIRES(mTokenizerMutex
);
346 void ParseAvailableData();
348 void DoStopRequest();
350 void DoDataAvailableBuffer(mozilla::Buffer
<uint8_t>&& aBuffer
)
351 MOZ_REQUIRES(mTokenizerMutex
);
353 void DoDataAvailable(mozilla::Span
<const uint8_t> aBuffer
)
354 MOZ_REQUIRES(mTokenizerMutex
);
356 static nsresult
CopySegmentsToParser(nsIInputStream
* aInStream
,
357 void* aClosure
, const char* aFromSegment
,
358 uint32_t aToOffset
, uint32_t aCount
,
359 uint32_t* aWriteCount
)
360 MOZ_REQUIRES(mTokenizerMutex
);
362 bool IsTerminatedOrInterrupted() { return mTerminated
|| mInterrupted
; }
364 bool IsTerminated() { return mTerminated
; }
367 * True when there is a Unicode decoder already
369 inline bool HasDecoder() { return !!mUnicodeDecoder
; }
372 * Returns 0 if 1) there aren't at least 2 buffers in mBufferedBytes
373 * or 2) there is no byte '>' in the second buffer.
374 * Otherwise, returns the length of the prefix of the second buffer
375 * that is long enough to contain the first byte '>' in the second
376 * buffer (including the '>' byte).
378 size_t LengthOfLtContainingPrefixInSecondBuffer();
381 * Push bytes from network when there is no Unicode decoder yet
383 nsresult
SniffStreamBytes(mozilla::Span
<const uint8_t> aFromSegment
,
384 bool aEof
) MOZ_REQUIRES(mTokenizerMutex
);
387 * Push bytes from network when there is a Unicode decoder already
389 nsresult
WriteStreamBytes(mozilla::Span
<const uint8_t> aFromSegment
)
390 MOZ_REQUIRES(mTokenizerMutex
);
393 * Set up the Unicode decoder and write the sniffing buffer into it
394 * followed by the current network buffer.
396 * @param aPrefix the part of the stream that has already been seen
397 * prior to aFromSegment. In practice, these are the
398 * bytes that are baked into the state of the BOM
399 * and UTF-16 XML declaration-like sniffing state
401 * @param aFromSegment The current network buffer
403 nsresult
SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
404 mozilla::Span
<const uint8_t> aPrefix
,
405 mozilla::Span
<const uint8_t> aFromSegment
) MOZ_REQUIRES(mTokenizerMutex
);
408 * Initialize the Unicode decoder, mark the BOM as the source and
411 * @param aEncoding The encoding for the decoder
412 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
415 void SetupDecodingFromBom(NotNull
<const Encoding
*> aEncoding
);
417 void SetupDecodingFromUtf16BogoXml(NotNull
<const Encoding
*> aEncoding
);
420 * When speculatively decoding from file: URL as UTF-8, commit
421 * to UTF-8 as the non-speculative encoding and start processing
424 [[nodiscard
]] nsresult
CommitLocalFileToEncoding();
427 * When speculatively decoding from file: URL as UTF-8, redecode
428 * using fallback and then continue normally with the fallback.
430 [[nodiscard
]] nsresult
ReDecodeLocalFile() MOZ_REQUIRES(mTokenizerMutex
);
433 * Potentially guess the encoding using mozilla::EncodingDetector.
434 * Returns the guessed encoding and a telemetry-appropriate source.
436 std::tuple
<NotNull
<const Encoding
*>, nsCharsetSource
> GuessEncoding(
440 * Become confident or resolve an encoding name to its preferred form.
441 * @param aEncoding the value of an internal encoding decl. Acts as an
442 * out param, too, when the method returns true.
443 * @return true if the parser needs to start using the new value of
444 * aEncoding and false if the parser became confident or if
445 * the encoding name did not specify a usable encoding
447 const Encoding
* PreferredForInternalEncodingDecl(const nsAString
& aEncoding
);
450 * Callback for mFlushTimer.
452 static void TimerCallback(nsITimer
* aTimer
, void* aClosure
);
455 * Parser thread entry point for (maybe) flushing the ops and posting
456 * a flush runnable back on the main thread.
461 * Called when speculation fails.
463 void MaybeDisableFutureSpeculation() { mSpeculationFailureCount
++; }
466 * Used to check whether we're getting too many speculation failures and
467 * should just stop trying. The 100 is picked pretty randomly to be not too
468 * small (so most pages are not affected) but small enough that we don't end
469 * up with failed speculations over and over in pathological cases.
471 bool IsSpeculationEnabled() { return mSpeculationFailureCount
< 100; }
474 * Dispatch an event to a Quantum DOM main thread-ish thread.
475 * (Not the parser thread.)
477 nsresult
DispatchToMain(already_AddRefed
<nsIRunnable
>&& aRunnable
);
480 * Notify any devtools listeners about content newly received for parsing.
482 inline void OnNewContent(mozilla::Span
<const char16_t
> aData
);
485 * Notify any devtools listeners after all parse content has been received.
487 inline void OnContentComplete();
489 nsCOMPtr
<nsIRequest
> mRequest
;
492 * The document title to use if this turns out to be a View Source parser.
494 nsCString mViewSourceTitle
;
497 * The Unicode decoder
499 mozilla::UniquePtr
<mozilla::Decoder
> mUnicodeDecoder
;
506 // encoding-related stuff
508 * The source (confidence) of the character encoding in use
510 nsCharsetSource mCharsetSource
;
512 nsCharsetSource mEncodingSwitchSource
;
515 * The character encoding in use
517 NotNull
<const Encoding
*> mEncoding
;
519 const Encoding
* mNeedsEncodingSwitchTo
;
521 bool mSeenEligibleMetaCharset
;
527 bool mStartedFeedingDetector
;
529 bool mStartedFeedingDevTools
;
534 * Whether reparse is forbidden
536 bool mReparseForbidden
;
539 * Whether the Repair Text Encoding menu item was invoked
541 bool mForceAutoDetection
;
544 * Whether there was a valid charset parameter on the HTTP layer.
546 bool mChannelHadCharset
;
549 * We are in the process of looking for <meta charset>
551 bool mLookingForMetaCharset
;
554 * Whether the byte stream started with ASCII <?
556 bool mStartsWithLtQuestion
;
559 * If we are viewing XML source and are waiting for a '>' from the network.
561 bool mLookingForXmlDeclarationForXmlViewSource
;
564 * Whether template has been pushed or head popped within the first 1024
567 bool mTemplatePushedOrHeadPopped
;
569 // Portable parser objects
571 * The first buffer in the pending UTF-16 buffer queue
573 RefPtr
<nsHtml5OwningUTF16Buffer
> mFirstBuffer
;
576 * Non-owning pointer to the most recent buffer that contains the most recent
577 * remembered greater-than sign. Used only while mLookingForMetaCharset is
578 * true. While mLookingForMetaCharset is true, mFirstBuffer is not changed and
579 * keeps the whole linked list of buffers alive. This pointer is non-owning to
580 * avoid frequent refcounting.
582 nsHtml5OwningUTF16Buffer
* mGtBuffer
;
587 * The last buffer in the pending UTF-16 buffer queue
589 nsHtml5OwningUTF16Buffer
*
590 mLastBuffer
; // weak ref; always points to
591 // a buffer of the size
592 // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
595 * The first buffer of the document if looking for <meta charset> or
596 * nullptr afterwards.
598 RefPtr
<nsHtml5OwningUTF16Buffer
> mFirstBufferOfMetaScan
;
601 * The tree operation executor
603 nsHtml5TreeOpExecutor
* mExecutor
;
606 * The HTML5 tree builder
608 mozilla::UniquePtr
<nsHtml5TreeBuilder
> mTreeBuilder
;
611 * The HTML5 tokenizer
613 mozilla::UniquePtr
<nsHtml5Tokenizer
> mTokenizer
;
616 * Makes sure the main thread can't mess the tokenizer state while it's
617 * tokenizing. This mutex also protects the current speculation.
619 mozilla::Mutex mTokenizerMutex
;
622 * The scoped atom table
624 nsHtml5AtomTable mAtomTable
;
629 RefPtr
<nsHtml5Parser
> mOwner
;
632 * Whether the last character tokenized was a carriage return (for CRLF)
637 * For tracking stream life cycle
639 eHtml5StreamState mStreamState
;
642 * Whether we are speculating.
647 * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
652 * The speculations. The mutex protects the nsTArray itself.
653 * To access the queue of current speculation, mTokenizerMutex must be
655 * The current speculation is the last element
657 nsTArray
<mozilla::UniquePtr
<nsHtml5Speculation
>> mSpeculations
;
658 mozilla::Mutex mSpeculationMutex
;
661 * Number of times speculation has failed for this parser.
663 mozilla::Atomic
<uint32_t> mSpeculationFailureCount
;
666 * Number of bytes already buffered into mBufferedBytes.
668 uint32_t mNumBytesBuffered
;
670 nsTArray
<mozilla::Buffer
<uint8_t>> mBufferedBytes
;
673 * True to terminate early.
675 mozilla::Atomic
<bool> mTerminated
;
678 * True to release mTokenizerMutex early.
680 mozilla::Atomic
<bool> mInterrupted
;
683 * The thread this stream parser runs on.
685 nsCOMPtr
<nsISerialEventTarget
> mEventTarget
;
687 nsCOMPtr
<nsIRunnable
> mExecutorFlusher
;
689 nsCOMPtr
<nsIRunnable
> mLoadFlusher
;
692 * This runnable is distinct from the regular flushers to
693 * signal the intent of encoding commitment without having to
694 * protect mPendingEncodingCommitment in the executor with a
697 nsCOMPtr
<nsIRunnable
> mEncodingCommitter
;
700 * The generic detector.
702 mozilla::UniquePtr
<mozilla::EncodingDetector
> mDetector
;
705 * The TLD we're loading from or empty if unknown.
710 * Whether the initial charset source was kCharsetFromParentFrame
712 bool mInitialEncodingWasFromParentFrame
;
716 bool mDetectorHasSeenNonAscii
;
719 * If true, we are decoding a local file that lacks an encoding
720 * declaration and we are not tokenizing yet.
722 bool mDecodingLocalFileWithoutTokenizing
;
725 * Whether we are keeping the incoming bytes.
727 bool mBufferingBytes
;
730 * Timer for flushing tree ops once in a while when not speculating.
732 nsCOMPtr
<nsITimer
> mFlushTimer
;
735 * Mutex for protecting access to mFlushTimer (but not for the two
736 * mFlushTimerFoo booleans below).
738 mozilla::Mutex mFlushTimerMutex
;
741 * Keeps track whether mFlushTimer has been armed. Unfortunately,
742 * nsITimer doesn't enable querying this from the timer itself.
744 bool mFlushTimerArmed
;
747 * False initially and true after the timer has fired at least once.
749 bool mFlushTimerEverFired
;
752 * Whether the parser is doing a normal parse, view source or plain text.
757 * If the associated docshell is being watched by the devtools, this is
758 * set to the URI associated with the parse. All parse data is sent to the
759 * devtools, along with this URI. This URI is cleared out after the parse has
760 * been marked as completed.
762 nsCOMPtr
<nsIURI
> mURIToSendToDevtools
;
765 * If content is being sent to the devtools, an encoded UUID for the parser.
767 nsString mUUIDForDevtools
;
770 * prevent multiple calls to OnStopRequest
771 * This field can be accessed from multiple threads and is protected by
772 * nsHtml5StreamListener::mDelegateMonitor passed in the OnStopRequest
774 bool mOnStopCalled
{false};
777 * Used for telemetry about OnStopRequest vs OnDataFinished
779 // guarded by nsHtml5StreamListener::mDelegateMonitor
780 mozilla::TimeStamp mOnStopRequestTime
;
781 mozilla::TimeStamp mOnDataFinishedTime
;
784 #endif // nsHtml5StreamParser_h