Backed out changeset bcbab342eed8 (bug 1889658) for causing wpt reftest failures...
[gecko.git] / parser / html / nsHtml5StreamParser.h
blobacc700c0cbd3a3197de052181370e8e87d435576
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef nsHtml5StreamParser_h
7 #define nsHtml5StreamParser_h
9 #include <tuple>
11 #include "MainThreadUtils.h"
12 #include "mozilla/AlreadyAddRefed.h"
13 #include "mozilla/Assertions.h"
14 #include "mozilla/Atomics.h"
15 #include "mozilla/Encoding.h"
16 #include "mozilla/Mutex.h"
17 #include "mozilla/NotNull.h"
18 #include "mozilla/ReentrantMonitor.h"
19 #include "mozilla/RefPtr.h"
20 #include "mozilla/Span.h"
21 #include "mozilla/TimeStamp.h"
22 #include "mozilla/UniquePtr.h"
23 #include "nsCharsetSource.h"
24 #include "nsCOMPtr.h"
25 #include "nsCycleCollectionParticipant.h"
26 #include "nsDebug.h"
27 #include "nsHtml5AtomTable.h"
28 #include "nsIRequestObserver.h"
29 #include "nsISerialEventTarget.h"
30 #include "nsISupports.h"
31 #include "nsStringFwd.h"
32 #include "nsTArray.h"
33 #include "nscore.h"
35 class nsCycleCollectionTraversalCallback;
36 class nsHtml5OwningUTF16Buffer;
37 class nsHtml5Parser;
38 class nsHtml5Speculation;
39 class nsHtml5String;
40 class nsHtml5Tokenizer;
41 class nsHtml5TreeBuilder;
42 class nsHtml5TreeOpExecutor;
43 class nsIChannel;
44 class nsIInputStream;
45 class nsIRequest;
46 class nsIRunnable;
47 class nsITimer;
48 class nsIURI;
50 namespace mozilla {
51 class EncodingDetector;
52 template <typename T>
53 class Buffer;
55 namespace dom {
56 class DocGroup;
58 } // namespace mozilla
60 enum eParserMode {
61 /**
62 * Parse a document normally as HTML.
64 NORMAL,
66 /**
67 * View document as HTML source.
69 VIEW_SOURCE_HTML,
71 /**
72 * View document as XML source
74 VIEW_SOURCE_XML,
76 /**
77 * View document as plain text source
79 VIEW_SOURCE_PLAIN,
81 /**
82 * View document as plain text
84 PLAIN_TEXT,
86 /**
87 * Load as data (XHR)
89 LOAD_AS_DATA
92 enum eBomState {
93 /**
94 * BOM sniffing hasn't started.
96 BOM_SNIFFING_NOT_STARTED,
98 /**
99 * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been
100 * seen.
102 SEEN_UTF_16_LE_FIRST_BYTE,
105 * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been
106 * seen.
108 SEEN_UTF_16_BE_FIRST_BYTE,
111 * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been
112 * seen.
114 SEEN_UTF_8_FIRST_BYTE,
117 * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM
118 * have been seen.
120 SEEN_UTF_8_SECOND_BYTE,
123 * Seen \x00 in UTF-16BE bogo-XML declaration.
125 SEEN_UTF_16_BE_XML_FIRST,
128 * Seen \x00< in UTF-16BE bogo-XML declaration.
130 SEEN_UTF_16_BE_XML_SECOND,
133 * Seen \x00<\x00 in UTF-16BE bogo-XML declaration.
135 SEEN_UTF_16_BE_XML_THIRD,
138 * Seen \x00<\x00? in UTF-16BE bogo-XML declaration.
140 SEEN_UTF_16_BE_XML_FOURTH,
143 * Seen \x00<\x00?\x00 in UTF-16BE bogo-XML declaration.
145 SEEN_UTF_16_BE_XML_FIFTH,
148 * Seen < in UTF-16LE bogo-XML declaration.
150 SEEN_UTF_16_LE_XML_FIRST,
153 * Seen <\x00 in UTF-16LE bogo-XML declaration.
155 SEEN_UTF_16_LE_XML_SECOND,
158 * Seen <\x00? in UTF-16LE bogo-XML declaration.
160 SEEN_UTF_16_LE_XML_THIRD,
163 * Seen <\x00?\x00 in UTF-16LE bogo-XML declaration.
165 SEEN_UTF_16_LE_XML_FOURTH,
168 * Seen <\x00?\x00x in UTF-16LE bogo-XML declaration.
170 SEEN_UTF_16_LE_XML_FIFTH,
173 * BOM sniffing was started but is now over for whatever reason.
175 BOM_SNIFFING_OVER,
178 enum eHtml5StreamState {
179 STREAM_NOT_STARTED = 0,
180 STREAM_BEING_READ = 1,
181 STREAM_ENDED = 2
184 class nsHtml5StreamParser final : public nsISupports {
185 template <typename T>
186 using NotNull = mozilla::NotNull<T>;
187 using Encoding = mozilla::Encoding;
189 const uint32_t UNCONDITIONAL_META_SCAN_BOUNDARY = 1024;
190 const uint32_t READ_BUFFER_SIZE = 1024;
191 const uint32_t LOCAL_FILE_UTF_8_BUFFER_SIZE = 1024 * 1024 * 4; // 4 MB
193 friend class nsHtml5RequestStopper;
194 friend class nsHtml5DataAvailable;
195 friend class nsHtml5StreamParserContinuation;
196 friend class nsHtml5TimerKungFu;
197 friend class nsHtml5StreamParserPtr;
198 friend class nsHtml5StreamListener;
200 public:
201 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
202 NS_DECL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)
204 nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, nsHtml5Parser* aOwner,
205 eParserMode aMode);
207 nsresult OnStartRequest(nsIRequest* aRequest);
209 nsresult OnDataAvailable(nsIRequest* aRequest, nsIInputStream* aInStream,
210 uint64_t aSourceOffset, uint32_t aLength);
212 * ReentrantMonitorAutoEnter is used for protecting access to
213 * nsHtml5StreamParser::mOnStopCalled and should be obtained from
214 * nsHtml5StreamListener::mDelegateMonitor
216 nsresult OnStopRequest(
217 nsIRequest* aRequest, nsresult status,
218 const mozilla::ReentrantMonitorAutoEnter& aProofOfLock);
220 // EncodingDeclarationHandler
221 // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
223 * Tree builder uses this to report a late <meta charset>
225 bool internalEncodingDeclaration(nsHtml5String aEncoding);
227 bool TemplatePushedOrHeadPopped();
229 void RememberGt(int32_t aPos);
231 // Not from an external interface
234 * Post a runnable to the main thread to perform the speculative load
235 * operations without performing the tree operations.
237 * This should be called at the end of each data available or stop
238 * request runnable running on the parser thread.
240 void PostLoadFlusher();
243 * Pass a buffer to chardetng.
245 void FeedDetector(mozilla::Span<const uint8_t> aBuffer);
248 * Report EOF to chardetng.
250 void DetectorEof();
253 * Call this method once you've created a parser, and want to instruct it
254 * about what charset to load
256 * @param aEncoding the charset of a document
257 * @param aSource the source of the charset
259 inline void SetDocumentCharset(NotNull<const Encoding*> aEncoding,
260 nsCharsetSource aSource,
261 bool aForceAutoDetection) {
262 MOZ_ASSERT(mStreamState == STREAM_NOT_STARTED,
263 "SetDocumentCharset called too late.");
264 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");
265 MOZ_ASSERT(!(aForceAutoDetection && aSource >= kCharsetFromOtherComponent),
266 "Can't force with high-ranking source.");
267 mEncoding = aEncoding;
268 mCharsetSource = aSource;
269 mForceAutoDetection = aForceAutoDetection;
270 mChannelHadCharset = (aSource == kCharsetFromChannel);
273 nsresult GetChannel(nsIChannel** aChannel);
276 * The owner parser must call this after script execution
277 * when no scripts are executing and the document.written
278 * buffer has been exhausted.
280 * If the first two arguments are nullptr, instead of
281 * continuing after scripts, this method commits to an
282 * internally-discovered encoding.
284 void ContinueAfterScriptsOrEncodingCommitment(
285 nsHtml5Tokenizer* aTokenizer, nsHtml5TreeBuilder* aTreeBuilder,
286 bool aLastWasCR);
289 * Continues the stream parser if the charset switch failed.
291 void ContinueAfterFailedCharsetSwitch();
293 void Terminate() { mTerminated = true; }
295 void DropTimer();
298 * Sets the URL for View Source title in case this parser ends up being
299 * used for View Source. If aURL is a view-source: URL, takes the inner
300 * URL. data: URLs are shown with an ellipsis instead of the actual data.
302 void SetViewSourceTitle(nsIURI* aURL);
304 private:
305 virtual ~nsHtml5StreamParser();
307 #ifdef DEBUG
308 bool IsParserThread() { return mEventTarget->IsOnCurrentThread(); }
309 #endif
311 void MarkAsBroken(nsresult aRv);
314 * Marks the stream parser as interrupted. If you ever add calls to this
315 * method, be sure to review Uninterrupt usage very, very carefully to
316 * avoid having a previous in-flight runnable cancel your Interrupt()
317 * call on the other thread too soon.
319 void Interrupt() {
320 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");
321 mInterrupted = true;
324 void Uninterrupt() MOZ_NO_THREAD_SAFETY_ANALYSIS {
325 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
326 mTokenizerMutex.AssertCurrentThreadOwns();
327 mInterrupted = false;
331 * Flushes the tree ops from the tree builder and disarms the flush
332 * timer.
334 void FlushTreeOpsAndDisarmTimer();
336 void SwitchDecoderIfAsciiSoFar(NotNull<const Encoding*> aEncoding)
337 MOZ_REQUIRES(mTokenizerMutex);
340 size_t CountGts();
342 void DiscardMetaSpeculation();
344 bool ProcessLookingForMetaCharset(bool aEof) MOZ_REQUIRES(mTokenizerMutex);
346 void ParseAvailableData();
348 void DoStopRequest();
350 void DoDataAvailableBuffer(mozilla::Buffer<uint8_t>&& aBuffer)
351 MOZ_REQUIRES(mTokenizerMutex);
353 void DoDataAvailable(mozilla::Span<const uint8_t> aBuffer)
354 MOZ_REQUIRES(mTokenizerMutex);
356 static nsresult CopySegmentsToParser(nsIInputStream* aInStream,
357 void* aClosure, const char* aFromSegment,
358 uint32_t aToOffset, uint32_t aCount,
359 uint32_t* aWriteCount)
360 MOZ_REQUIRES(mTokenizerMutex);
362 bool IsTerminatedOrInterrupted() { return mTerminated || mInterrupted; }
364 bool IsTerminated() { return mTerminated; }
367 * True when there is a Unicode decoder already
369 inline bool HasDecoder() { return !!mUnicodeDecoder; }
372 * Returns 0 if 1) there aren't at least 2 buffers in mBufferedBytes
373 * or 2) there is no byte '>' in the second buffer.
374 * Otherwise, returns the length of the prefix of the second buffer
375 * that is long enough to contain the first byte '>' in the second
376 * buffer (including the '>' byte).
378 size_t LengthOfLtContainingPrefixInSecondBuffer();
381 * Push bytes from network when there is no Unicode decoder yet
383 nsresult SniffStreamBytes(mozilla::Span<const uint8_t> aFromSegment,
384 bool aEof) MOZ_REQUIRES(mTokenizerMutex);
387 * Push bytes from network when there is a Unicode decoder already
389 nsresult WriteStreamBytes(mozilla::Span<const uint8_t> aFromSegment)
390 MOZ_REQUIRES(mTokenizerMutex);
393 * Set up the Unicode decoder and write the sniffing buffer into it
394 * followed by the current network buffer.
396 * @param aPrefix the part of the stream that has already been seen
397 * prior to aFromSegment. In practice, these are the
398 * bytes that are baked into the state of the BOM
399 * and UTF-16 XML declaration-like sniffing state
400 * machine state.
401 * @param aFromSegment The current network buffer
403 nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
404 mozilla::Span<const uint8_t> aPrefix,
405 mozilla::Span<const uint8_t> aFromSegment) MOZ_REQUIRES(mTokenizerMutex);
408 * Initialize the Unicode decoder, mark the BOM as the source and
409 * drop the sniffer.
411 * @param aEncoding The encoding for the decoder
412 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
413 * been swallowed)
415 void SetupDecodingFromBom(NotNull<const Encoding*> aEncoding);
417 void SetupDecodingFromUtf16BogoXml(NotNull<const Encoding*> aEncoding);
420 * When speculatively decoding from file: URL as UTF-8, commit
421 * to UTF-8 as the non-speculative encoding and start processing
422 * the decoded data.
424 [[nodiscard]] nsresult CommitLocalFileToEncoding();
427 * When speculatively decoding from file: URL as UTF-8, redecode
428 * using fallback and then continue normally with the fallback.
430 [[nodiscard]] nsresult ReDecodeLocalFile() MOZ_REQUIRES(mTokenizerMutex);
433 * Potentially guess the encoding using mozilla::EncodingDetector.
434 * Returns the guessed encoding and a telemetry-appropriate source.
436 std::tuple<NotNull<const Encoding*>, nsCharsetSource> GuessEncoding(
437 bool aInitial);
440 * Become confident or resolve an encoding name to its preferred form.
441 * @param aEncoding the value of an internal encoding decl. Acts as an
442 * out param, too, when the method returns true.
443 * @return true if the parser needs to start using the new value of
444 * aEncoding and false if the parser became confident or if
445 * the encoding name did not specify a usable encoding
447 const Encoding* PreferredForInternalEncodingDecl(const nsAString& aEncoding);
450 * Callback for mFlushTimer.
452 static void TimerCallback(nsITimer* aTimer, void* aClosure);
455 * Parser thread entry point for (maybe) flushing the ops and posting
456 * a flush runnable back on the main thread.
458 void TimerFlush();
461 * Called when speculation fails.
463 void MaybeDisableFutureSpeculation() { mSpeculationFailureCount++; }
466 * Used to check whether we're getting too many speculation failures and
467 * should just stop trying. The 100 is picked pretty randomly to be not too
468 * small (so most pages are not affected) but small enough that we don't end
469 * up with failed speculations over and over in pathological cases.
471 bool IsSpeculationEnabled() { return mSpeculationFailureCount < 100; }
474 * Dispatch an event to a Quantum DOM main thread-ish thread.
475 * (Not the parser thread.)
477 nsresult DispatchToMain(already_AddRefed<nsIRunnable>&& aRunnable);
480 * Notify any devtools listeners about content newly received for parsing.
482 inline void OnNewContent(mozilla::Span<const char16_t> aData);
485 * Notify any devtools listeners after all parse content has been received.
487 inline void OnContentComplete();
489 nsCOMPtr<nsIRequest> mRequest;
492 * The document title to use if this turns out to be a View Source parser.
494 nsCString mViewSourceTitle;
497 * The Unicode decoder
499 mozilla::UniquePtr<mozilla::Decoder> mUnicodeDecoder;
502 * BOM sniffing state
504 eBomState mBomState;
506 // encoding-related stuff
508 * The source (confidence) of the character encoding in use
510 nsCharsetSource mCharsetSource;
512 nsCharsetSource mEncodingSwitchSource;
515 * The character encoding in use
517 NotNull<const Encoding*> mEncoding;
519 const Encoding* mNeedsEncodingSwitchTo;
521 bool mSeenEligibleMetaCharset;
523 bool mChardetEof;
525 #ifdef DEBUG
527 bool mStartedFeedingDetector;
529 bool mStartedFeedingDevTools;
531 #endif
534 * Whether reparse is forbidden
536 bool mReparseForbidden;
539 * Whether the Repair Text Encoding menu item was invoked
541 bool mForceAutoDetection;
544 * Whether there was a valid charset parameter on the HTTP layer.
546 bool mChannelHadCharset;
549 * We are in the process of looking for <meta charset>
551 bool mLookingForMetaCharset;
554 * Whether the byte stream started with ASCII <?
556 bool mStartsWithLtQuestion;
559 * If we are viewing XML source and are waiting for a '>' from the network.
561 bool mLookingForXmlDeclarationForXmlViewSource;
564 * Whether template has been pushed or head popped within the first 1024
565 * bytes.
567 bool mTemplatePushedOrHeadPopped;
569 // Portable parser objects
571 * The first buffer in the pending UTF-16 buffer queue
573 RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
576 * Non-owning pointer to the most recent buffer that contains the most recent
577 * remembered greater-than sign. Used only while mLookingForMetaCharset is
578 * true. While mLookingForMetaCharset is true, mFirstBuffer is not changed and
579 * keeps the whole linked list of buffers alive. This pointer is non-owning to
580 * avoid frequent refcounting.
582 nsHtml5OwningUTF16Buffer* mGtBuffer;
584 int32_t mGtPos;
587 * The last buffer in the pending UTF-16 buffer queue
589 nsHtml5OwningUTF16Buffer*
590 mLastBuffer; // weak ref; always points to
591 // a buffer of the size
592 // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
595 * The first buffer of the document if looking for <meta charset> or
596 * nullptr afterwards.
598 RefPtr<nsHtml5OwningUTF16Buffer> mFirstBufferOfMetaScan;
601 * The tree operation executor
603 nsHtml5TreeOpExecutor* mExecutor;
606 * The HTML5 tree builder
608 mozilla::UniquePtr<nsHtml5TreeBuilder> mTreeBuilder;
611 * The HTML5 tokenizer
613 mozilla::UniquePtr<nsHtml5Tokenizer> mTokenizer;
616 * Makes sure the main thread can't mess the tokenizer state while it's
617 * tokenizing. This mutex also protects the current speculation.
619 mozilla::Mutex mTokenizerMutex;
622 * The scoped atom table
624 nsHtml5AtomTable mAtomTable;
627 * The owner parser.
629 RefPtr<nsHtml5Parser> mOwner;
632 * Whether the last character tokenized was a carriage return (for CRLF)
634 bool mLastWasCR;
637 * For tracking stream life cycle
639 eHtml5StreamState mStreamState;
642 * Whether we are speculating.
644 bool mSpeculating;
647 * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
649 bool mAtEOF;
652 * The speculations. The mutex protects the nsTArray itself.
653 * To access the queue of current speculation, mTokenizerMutex must be
654 * obtained.
655 * The current speculation is the last element
657 nsTArray<mozilla::UniquePtr<nsHtml5Speculation>> mSpeculations;
658 mozilla::Mutex mSpeculationMutex;
661 * Number of times speculation has failed for this parser.
663 mozilla::Atomic<uint32_t> mSpeculationFailureCount;
666 * Number of bytes already buffered into mBufferedBytes.
668 uint32_t mNumBytesBuffered;
670 nsTArray<mozilla::Buffer<uint8_t>> mBufferedBytes;
673 * True to terminate early.
675 mozilla::Atomic<bool> mTerminated;
678 * True to release mTokenizerMutex early.
680 mozilla::Atomic<bool> mInterrupted;
683 * The thread this stream parser runs on.
685 nsCOMPtr<nsISerialEventTarget> mEventTarget;
687 nsCOMPtr<nsIRunnable> mExecutorFlusher;
689 nsCOMPtr<nsIRunnable> mLoadFlusher;
692 * This runnable is distinct from the regular flushers to
693 * signal the intent of encoding commitment without having to
694 * protect mPendingEncodingCommitment in the executor with a
695 * mutex.
697 nsCOMPtr<nsIRunnable> mEncodingCommitter;
700 * The generic detector.
702 mozilla::UniquePtr<mozilla::EncodingDetector> mDetector;
705 * The TLD we're loading from or empty if unknown.
707 nsCString mTLD;
710 * Whether the initial charset source was kCharsetFromParentFrame
712 bool mInitialEncodingWasFromParentFrame;
714 bool mHasHadErrors;
716 bool mDetectorHasSeenNonAscii;
719 * If true, we are decoding a local file that lacks an encoding
720 * declaration and we are not tokenizing yet.
722 bool mDecodingLocalFileWithoutTokenizing;
725 * Whether we are keeping the incoming bytes.
727 bool mBufferingBytes;
730 * Timer for flushing tree ops once in a while when not speculating.
732 nsCOMPtr<nsITimer> mFlushTimer;
735 * Mutex for protecting access to mFlushTimer (but not for the two
736 * mFlushTimerFoo booleans below).
738 mozilla::Mutex mFlushTimerMutex;
741 * Keeps track whether mFlushTimer has been armed. Unfortunately,
742 * nsITimer doesn't enable querying this from the timer itself.
744 bool mFlushTimerArmed;
747 * False initially and true after the timer has fired at least once.
749 bool mFlushTimerEverFired;
752 * Whether the parser is doing a normal parse, view source or plain text.
754 eParserMode mMode;
757 * If the associated docshell is being watched by the devtools, this is
758 * set to the URI associated with the parse. All parse data is sent to the
759 * devtools, along with this URI. This URI is cleared out after the parse has
760 * been marked as completed.
762 nsCOMPtr<nsIURI> mURIToSendToDevtools;
765 * If content is being sent to the devtools, an encoded UUID for the parser.
767 nsString mUUIDForDevtools;
770 * Prevents multiple calls to OnStopRequest.
771 * This field can be accessed from multiple threads and is protected by
772 * nsHtml5StreamListener::mDelegateMonitor passed in to OnStopRequest.
774 bool mOnStopCalled{false};
777 * Used for telemetry about OnStopRequest vs OnDataFinished
779 // guarded by nsHtml5StreamListener::mDelegateMonitor
780 mozilla::TimeStamp mOnStopRequestTime;
781 mozilla::TimeStamp mOnDataFinishedTime;
784 #endif // nsHtml5StreamParser_h