Bug 1845715 - Check for failure when getting RegExp match result template r=iain
[gecko.git] / parser / html / nsHtml5StreamParser.h
blob0dacf257bf3411a219f75ee89a6bb93401f6269f
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef nsHtml5StreamParser_h
7 #define nsHtml5StreamParser_h
9 #include <tuple>
11 #include "MainThreadUtils.h"
12 #include "mozilla/AlreadyAddRefed.h"
13 #include "mozilla/Assertions.h"
14 #include "mozilla/Encoding.h"
15 #include "mozilla/Mutex.h"
16 #include "mozilla/NotNull.h"
17 #include "mozilla/RefPtr.h"
18 #include "mozilla/Span.h"
19 #include "mozilla/UniquePtr.h"
20 #include "nsCharsetSource.h"
21 #include "nsCOMPtr.h"
22 #include "nsCycleCollectionParticipant.h"
23 #include "nsDebug.h"
24 #include "nsHtml5AtomTable.h"
25 #include "nsIRequestObserver.h"
26 #include "nsISerialEventTarget.h"
27 #include "nsISupports.h"
28 #include "nsStringFwd.h"
29 #include "nsTArray.h"
30 #include "nscore.h"
32 class nsCycleCollectionTraversalCallback;
33 class nsHtml5OwningUTF16Buffer;
34 class nsHtml5Parser;
35 class nsHtml5Speculation;
36 class nsHtml5String;
37 class nsHtml5Tokenizer;
38 class nsHtml5TreeBuilder;
39 class nsHtml5TreeOpExecutor;
40 class nsIChannel;
41 class nsIInputStream;
42 class nsIRequest;
43 class nsIRunnable;
44 class nsITimer;
45 class nsIURI;
47 namespace mozilla {
48 class EncodingDetector;
49 template <typename T>
50 class Buffer;
52 namespace dom {
53 class DocGroup;
55 } // namespace mozilla
57 enum eParserMode {
58 /**
59 * Parse a document normally as HTML.
61 NORMAL,
63 /**
64 * View document as HTML source.
66 VIEW_SOURCE_HTML,
68 /**
69 * View document as XML source
71 VIEW_SOURCE_XML,
73 /**
74 * View document as plain text source
76 VIEW_SOURCE_PLAIN,
78 /**
79 * View document as plain text
81 PLAIN_TEXT,
83 /**
84 * Load as data (XHR)
86 LOAD_AS_DATA
89 enum eBomState {
90 /**
91 * BOM sniffing hasn't started.
93 BOM_SNIFFING_NOT_STARTED,
95 /**
96 * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been
97 * seen.
99 SEEN_UTF_16_LE_FIRST_BYTE,
102 * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been
103 * seen.
105 SEEN_UTF_16_BE_FIRST_BYTE,
108 * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been
109 * seen.
111 SEEN_UTF_8_FIRST_BYTE,
114 * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM
115 * have been seen.
117 SEEN_UTF_8_SECOND_BYTE,
120 * Seen \x00 in UTF-16BE bogo-XML declaration.
122 SEEN_UTF_16_BE_XML_FIRST,
125 * Seen \x00< in UTF-16BE bogo-XML declaration.
127 SEEN_UTF_16_BE_XML_SECOND,
130 * Seen \x00<\x00 in UTF-16BE bogo-XML declaration.
132 SEEN_UTF_16_BE_XML_THIRD,
135 * Seen \x00<\x00? in UTF-16BE bogo-XML declaration.
137 SEEN_UTF_16_BE_XML_FOURTH,
140 * Seen \x00<\x00?\x00 in UTF-16BE bogo-XML declaration.
142 SEEN_UTF_16_BE_XML_FIFTH,
145 * Seen < in UTF-16LE bogo-XML declaration.
147 SEEN_UTF_16_LE_XML_FIRST,
150 * Seen <\x00 in UTF-16LE bogo-XML declaration.
152 SEEN_UTF_16_LE_XML_SECOND,
155 * Seen <\x00? in UTF-16LE bogo-XML declaration.
157 SEEN_UTF_16_LE_XML_THIRD,
160 * Seen <\x00?\x00 in UTF-16LE bogo-XML declaration.
162 SEEN_UTF_16_LE_XML_FOURTH,
165 * Seen <\x00?\x00x in UTF-16LE bogo-XML declaration.
167 SEEN_UTF_16_LE_XML_FIFTH,
170 * BOM sniffing was started but is now over for whatever reason.
172 BOM_SNIFFING_OVER,
175 enum eHtml5StreamState {
176 STREAM_NOT_STARTED = 0,
177 STREAM_BEING_READ = 1,
178 STREAM_ENDED = 2
181 class nsHtml5StreamParser final : public nsISupports {
182 template <typename T>
183 using NotNull = mozilla::NotNull<T>;
184 using Encoding = mozilla::Encoding;
186 const uint32_t UNCONDITIONAL_META_SCAN_BOUNDARY = 1024;
187 const uint32_t READ_BUFFER_SIZE = 1024;
188 const uint32_t LOCAL_FILE_UTF_8_BUFFER_SIZE = 1024 * 1024 * 4; // 4 MB
190 friend class nsHtml5RequestStopper;
191 friend class nsHtml5DataAvailable;
192 friend class nsHtml5StreamParserContinuation;
193 friend class nsHtml5TimerKungFu;
194 friend class nsHtml5StreamParserPtr;
195 friend class nsHtml5StreamListener;
197 public:
198 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
199 NS_DECL_CYCLE_COLLECTION_CLASS(nsHtml5StreamParser)
201 nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor, nsHtml5Parser* aOwner,
202 eParserMode aMode);
204 nsresult OnStartRequest(nsIRequest* aRequest);
206 nsresult OnDataAvailable(nsIRequest* aRequest, nsIInputStream* aInStream,
207 uint64_t aSourceOffset, uint32_t aLength);
209 nsresult OnStopRequest(nsIRequest* aRequest, nsresult status);
211 // EncodingDeclarationHandler
212 // https://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
214 * Tree builder uses this to report a late <meta charset>
216 bool internalEncodingDeclaration(nsHtml5String aEncoding);
218 bool TemplatePushedOrHeadPopped();
220 void RememberGt(int32_t aPos);
222 // Not from an external interface
225 * Post a runnable to the main thread to perform the speculative load
226 * operations without performing the tree operations.
228 * This should be called at the end of each data available or stop
229 * request runnable running on the parser thread.
231 void PostLoadFlusher();
234 * Pass a buffer to chardetng.
236 void FeedDetector(mozilla::Span<const uint8_t> aBuffer);
239 * Report EOF to chardetng.
241 void DetectorEof();
244 * Call this method once you've created a parser, and want to instruct it
245 * about what charset to load
247 * @param aEncoding the charset of a document
248 * @param aSource the source of the charset
250 inline void SetDocumentCharset(NotNull<const Encoding*> aEncoding,
251 nsCharsetSource aSource,
252 bool aForceAutoDetection) {
253 MOZ_ASSERT(mStreamState == STREAM_NOT_STARTED,
254 "SetDocumentCharset called too late.");
255 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");
256 MOZ_ASSERT(!(aForceAutoDetection && aSource >= kCharsetFromOtherComponent),
257 "Can't force with high-ranking source.");
258 mEncoding = aEncoding;
259 mCharsetSource = aSource;
260 mForceAutoDetection = aForceAutoDetection;
261 mChannelHadCharset = (aSource == kCharsetFromChannel);
264 nsresult GetChannel(nsIChannel** aChannel);
267 * The owner parser must call this after script execution
268 * when no scripts are executing and the document.written
269 * buffer has been exhausted.
271 * If the first two arguments are nullptr, instead of
272 * continuing after scripts, this method commits to an
273 * internally-discovered encoding.
275 void ContinueAfterScriptsOrEncodingCommitment(
276 nsHtml5Tokenizer* aTokenizer, nsHtml5TreeBuilder* aTreeBuilder,
277 bool aLastWasCR);
280 * Continues the stream parser if the charset switch failed.
282 void ContinueAfterFailedCharsetSwitch();
284 void Terminate() { mTerminated = true; }
286 void DropTimer();
289 * Sets the URL for View Source title in case this parser ends up being
290 * used for View Source. If aURL is a view-source: URL, takes the inner
291 * URL. data: URLs are shown with an ellipsis instead of the actual data.
293 void SetViewSourceTitle(nsIURI* aURL);
295 private:
296 virtual ~nsHtml5StreamParser();
298 #ifdef DEBUG
299 bool IsParserThread() { return mEventTarget->IsOnCurrentThread(); }
300 #endif
302 void MarkAsBroken(nsresult aRv);
305 * Marks the stream parser as interrupted. If you ever add calls to this
306 * method, be sure to review Uninterrupt usage very, very carefully to
307 * avoid having a previous in-flight runnable cancel your Interrupt()
308 * call on the other thread too soon.
310 void Interrupt() {
311 MOZ_ASSERT(NS_IsMainThread(), "Wrong thread!");
312 mInterrupted = true;
315 void Uninterrupt() MOZ_NO_THREAD_SAFETY_ANALYSIS {
316 MOZ_ASSERT(IsParserThread(), "Wrong thread!");
317 mTokenizerMutex.AssertCurrentThreadOwns();
318 mInterrupted = false;
322 * Flushes the tree ops from the tree builder and disarms the flush
323 * timer.
325 void FlushTreeOpsAndDisarmTimer();
327 void SwitchDecoderIfAsciiSoFar(NotNull<const Encoding*> aEncoding)
328 MOZ_REQUIRES(mTokenizerMutex);
331 size_t CountGts();
333 void DiscardMetaSpeculation();
335 bool ProcessLookingForMetaCharset(bool aEof) MOZ_REQUIRES(mTokenizerMutex);
337 void ParseAvailableData();
339 void DoStopRequest();
341 void DoDataAvailableBuffer(mozilla::Buffer<uint8_t>&& aBuffer)
342 MOZ_REQUIRES(mTokenizerMutex);
344 void DoDataAvailable(mozilla::Span<const uint8_t> aBuffer)
345 MOZ_REQUIRES(mTokenizerMutex);
347 static nsresult CopySegmentsToParser(nsIInputStream* aInStream,
348 void* aClosure, const char* aFromSegment,
349 uint32_t aToOffset, uint32_t aCount,
350 uint32_t* aWriteCount)
351 MOZ_REQUIRES(mTokenizerMutex);
353 bool IsTerminatedOrInterrupted() { return mTerminated || mInterrupted; }
355 bool IsTerminated() { return mTerminated; }
358 * True when there is a Unicode decoder already
360 inline bool HasDecoder() { return !!mUnicodeDecoder; }
363 * Returns 0 if 1) there aren't at least 2 buffers in mBufferedBytes
364 * or 2) there is no byte '>' in the second buffer.
365 * Otherwise, returns the length of the prefix of the second buffer
366 * that is long enough to contain the first byte '>' in the second
367 * buffer (including the '>' byte).
369 size_t LengthOfLtContainingPrefixInSecondBuffer();
372 * Push bytes from network when there is no Unicode decoder yet
374 nsresult SniffStreamBytes(mozilla::Span<const uint8_t> aFromSegment,
375 bool aEof) MOZ_REQUIRES(mTokenizerMutex);
378 * Push bytes from network when there is a Unicode decoder already
380 nsresult WriteStreamBytes(mozilla::Span<const uint8_t> aFromSegment)
381 MOZ_REQUIRES(mTokenizerMutex);
384 * Set up the Unicode decoder and write the sniffing buffer into it
385 * followed by the current network buffer.
387 * @param aPrefix the part of the stream that has already been seen
388 * prior to aFromSegment. In practice, these are the
389 * bytes that are baked into the state of the BOM
390 * and UTF-16 XML declaration-like sniffing state
391 * machine state.
392 * @param aFromSegment The current network buffer
394 nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
395 mozilla::Span<const uint8_t> aPrefix,
396 mozilla::Span<const uint8_t> aFromSegment) MOZ_REQUIRES(mTokenizerMutex);
399 * Initialize the Unicode decoder, mark the BOM as the source and
400 * drop the sniffer.
402 * @param aEncoding The encoding for the decoder
403 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
404 * been swallowed)
406 void SetupDecodingFromBom(NotNull<const Encoding*> aEncoding);
408 void SetupDecodingFromUtf16BogoXml(NotNull<const Encoding*> aEncoding);
411 * When speculatively decoding from file: URL as UTF-8, commit
412 * to UTF-8 as the non-speculative encoding and start processing
413 * the decoded data.
415 [[nodiscard]] nsresult CommitLocalFileToEncoding();
418 * When speculatively decoding from file: URL as UTF-8, redecode
419 * using fallback and then continue normally with the fallback.
421 [[nodiscard]] nsresult ReDecodeLocalFile() MOZ_REQUIRES(mTokenizerMutex);
424 * Potentially guess the encoding using mozilla::EncodingDetector.
425 * Returns the guessed encoding and a telemetry-appropriate source.
427 std::tuple<NotNull<const Encoding*>, nsCharsetSource> GuessEncoding(
428 bool aInitial);
431 * Become confident or resolve an encoding name to its preferred form.
432 * @param aEncoding the value of an internal encoding decl. Acts as an
433 * out param, too, when the method returns true.
434 * @return true if the parser needs to start using the new value of
435 * aEncoding and false if the parser became confident or if
436 * the encoding name did not specify a usable encoding
438 const Encoding* PreferredForInternalEncodingDecl(const nsAString& aEncoding);
441 * Callback for mFlushTimer.
443 static void TimerCallback(nsITimer* aTimer, void* aClosure);
446 * Parser thread entry point for (maybe) flushing the ops and posting
447 * a flush runnable back on the main thread.
449 void TimerFlush();
452 * Called when speculation fails.
454 void MaybeDisableFutureSpeculation() { mSpeculationFailureCount++; }
457 * Used to check whether we're getting too many speculation failures and
458 * should just stop trying. The 100 is picked pretty randomly to be not too
459 * small (so most pages are not affected) but small enough that we don't end
460 * up with failed speculations over and over in pathological cases.
462 bool IsSpeculationEnabled() { return mSpeculationFailureCount < 100; }
465 * Dispatch an event to a Quantum DOM main thread-ish thread.
466 * (Not the parser thread.)
468 nsresult DispatchToMain(already_AddRefed<nsIRunnable>&& aRunnable);
471 * Notify any devtools listeners about content newly received for parsing.
473 inline void OnNewContent(mozilla::Span<const char16_t> aData);
476 * Notify any devtools listeners after all parse content has been received.
478 inline void OnContentComplete();
480 nsCOMPtr<nsIRequest> mRequest;
483 * The document title to use if this turns out to be a View Source parser.
485 nsCString mViewSourceTitle;
488 * The Unicode decoder
490 mozilla::UniquePtr<mozilla::Decoder> mUnicodeDecoder;
493 * BOM sniffing state
495 eBomState mBomState;
497 // encoding-related stuff
499 * The source (confidence) of the character encoding in use
501 nsCharsetSource mCharsetSource;
503 nsCharsetSource mEncodingSwitchSource;
506 * The character encoding in use
508 NotNull<const Encoding*> mEncoding;
510 const Encoding* mNeedsEncodingSwitchTo;
512 bool mSeenEligibleMetaCharset;
514 bool mChardetEof;
516 #ifdef DEBUG
518 bool mStartedFeedingDetector;
520 bool mStartedFeedingDevTools;
522 #endif
525 * Whether reparse is forbidden
527 bool mReparseForbidden;
530 * Whether the Repair Text Encoding menu item was invoked
532 bool mForceAutoDetection;
535 * Whether there was a valid charset parameter on the HTTP layer.
537 bool mChannelHadCharset;
540 * We are in the process of looking for <meta charset>
542 bool mLookingForMetaCharset;
545 * Whether the byte stream started with ASCII <?
547 bool mStartsWithLtQuestion;
550 * If we are viewing XML source and are waiting for a '>' from the network.
552 bool mLookingForXmlDeclarationForXmlViewSource;
555 * Whether template has been pushed or head popped within the first 1024
556 * bytes.
558 bool mTemplatePushedOrHeadPopped;
560 // Portable parser objects
562 * The first buffer in the pending UTF-16 buffer queue
564 RefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
567 * Non-owning pointer to the most recent buffer that contains the most recent
568 * remembered greater-than sign. Used only while mLookingForMetaCharset is
569 * true. While mLookingForMetaCharset is true, mFirstBuffer is not changed and
570 * keeps the whole linked list of buffers alive. This pointer is non-owning to
571 * avoid frequent refcounting.
573 nsHtml5OwningUTF16Buffer* mGtBuffer;
575 int32_t mGtPos;
578 * The last buffer in the pending UTF-16 buffer queue
580 nsHtml5OwningUTF16Buffer*
581 mLastBuffer; // weak ref; always points to
582 // a buffer of the size
583 // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
586 * The first buffer of the document if looking for <meta charset> or
587 * nullptr afterwards.
589 RefPtr<nsHtml5OwningUTF16Buffer> mFirstBufferOfMetaScan;
592 * The tree operation executor
594 nsHtml5TreeOpExecutor* mExecutor;
597 * Network event target for mExecutor->mDocument
599 nsCOMPtr<nsISerialEventTarget> mNetworkEventTarget;
602 * The HTML5 tree builder
604 mozilla::UniquePtr<nsHtml5TreeBuilder> mTreeBuilder;
607 * The HTML5 tokenizer
609 mozilla::UniquePtr<nsHtml5Tokenizer> mTokenizer;
612 * Makes sure the main thread can't mess the tokenizer state while it's
613 * tokenizing. This mutex also protects the current speculation.
615 mozilla::Mutex mTokenizerMutex;
618 * The scoped atom table
620 nsHtml5AtomTable mAtomTable;
623 * The owner parser.
625 RefPtr<nsHtml5Parser> mOwner;
628 * Whether the last character tokenized was a carriage return (for CRLF)
630 bool mLastWasCR;
633 * For tracking stream life cycle
635 eHtml5StreamState mStreamState;
638 * Whether we are speculating.
640 bool mSpeculating;
643 * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
645 bool mAtEOF;
648 * The speculations. The mutex protects the nsTArray itself.
649 * To access the queue of current speculation, mTokenizerMutex must be
650 * obtained.
651 * The current speculation is the last element
653 nsTArray<mozilla::UniquePtr<nsHtml5Speculation>> mSpeculations;
654 mozilla::Mutex mSpeculationMutex;
657 * Number of times speculation has failed for this parser.
659 mozilla::Atomic<uint32_t> mSpeculationFailureCount;
662 * Number of bytes already buffered into mBufferedBytes.
664 uint32_t mNumBytesBuffered;
666 nsTArray<mozilla::Buffer<uint8_t>> mBufferedBytes;
669 * True to terminate early.
671 mozilla::Atomic<bool> mTerminated;
674 * True to release mTokenizerMutex early.
676 mozilla::Atomic<bool> mInterrupted;
679 * The thread this stream parser runs on.
681 nsCOMPtr<nsISerialEventTarget> mEventTarget;
683 nsCOMPtr<nsIRunnable> mExecutorFlusher;
685 nsCOMPtr<nsIRunnable> mLoadFlusher;
688 * This runnable is distinct from the regular flushers to
689 * signal the intent of encoding commitment without having to
690 * protect mPendingEncodingCommitment in the executor with a
691 * mutex.
693 nsCOMPtr<nsIRunnable> mEncodingCommitter;
696 * The generic detector.
698 mozilla::UniquePtr<mozilla::EncodingDetector> mDetector;
701 * The TLD we're loading from or empty if unknown.
703 nsCString mTLD;
706 * Whether the initial charset source was kCharsetFromParentFrame
708 bool mInitialEncodingWasFromParentFrame;
710 bool mHasHadErrors;
712 bool mDetectorHasSeenNonAscii;
715 * If true, we are decoding a local file that lacks an encoding
716 * declaration and we are not tokenizing yet.
718 bool mDecodingLocalFileWithoutTokenizing;
721 * Whether we are keeping the incoming bytes.
723 bool mBufferingBytes;
726 * Timer for flushing tree ops once in a while when not speculating.
728 nsCOMPtr<nsITimer> mFlushTimer;
731 * Mutex for protecting access to mFlushTimer (but not for the two
732 * mFlushTimerFoo booleans below).
734 mozilla::Mutex mFlushTimerMutex;
737 * Keeps track whether mFlushTimer has been armed. Unfortunately,
738 * nsITimer doesn't enable querying this from the timer itself.
740 bool mFlushTimerArmed;
743 * False initially and true after the timer has fired at least once.
745 bool mFlushTimerEverFired;
748 * Whether the parser is doing a normal parse, view source or plain text.
750 eParserMode mMode;
753 * If the associated docshell is being watched by the devtools, this is
754 * set to the URI associated with the parse. All parse data is sent to the
755 * devtools, along with this URI. This URI is cleared out after the parse has
756 * been marked as completed.
758 nsCOMPtr<nsIURI> mURIToSendToDevtools;
761 * If content is being sent to the devtools, an encoded UUID for the parser.
763 nsString mUUIDForDevtools;
766 #endif // nsHtml5StreamParser_h