1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef nsHtml5StreamParser_h
7 #define nsHtml5StreamParser_h
11 #include "nsICharsetDetectionObserver.h"
12 #include "nsHtml5MetaScanner.h"
13 #include "nsIUnicodeDecoder.h"
14 #include "nsHtml5TreeOpExecutor.h"
15 #include "nsHtml5OwningUTF16Buffer.h"
16 #include "nsIInputStream.h"
17 #include "mozilla/Mutex.h"
18 #include "nsHtml5AtomTable.h"
19 #include "nsHtml5Speculation.h"
21 #include "nsICharsetDetector.h"
25 #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024
26 #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024
30 * Parse a document normally as HTML.
35 * View document as HTML source.
40 * View document as XML source
45 * View document as plain text source
50 * View document as plain text
62 * BOM sniffing hasn't started.
64 BOM_SNIFFING_NOT_STARTED
= 0,
67 * BOM sniffing is ongoing, and the first byte of a UTF-16LE BOM has been
70 SEEN_UTF_16_LE_FIRST_BYTE
= 1,
73 * BOM sniffing is ongoing, and the first byte of a UTF-16BE BOM has been
76 SEEN_UTF_16_BE_FIRST_BYTE
= 2,
79 * BOM sniffing is ongoing, and the first byte of a UTF-8 BOM has been
82 SEEN_UTF_8_FIRST_BYTE
= 3,
85 * BOM sniffing is ongoing, and the first and second bytes of a UTF-8 BOM
88 SEEN_UTF_8_SECOND_BYTE
= 4,
91 * BOM sniffing was started but is now over for whatever reason.
96 enum eHtml5StreamState
{
97 STREAM_NOT_STARTED
= 0,
98 STREAM_BEING_READ
= 1,
102 class nsHtml5StreamParser
: public nsICharsetDetectionObserver
{
104 friend class nsHtml5RequestStopper
;
105 friend class nsHtml5DataAvailable
;
106 friend class nsHtml5StreamParserContinuation
;
107 friend class nsHtml5TimerKungFu
;
110 NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
111 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
112 NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser
,
113 nsICharsetDetectionObserver
)
115 static void InitializeStatics();
117 nsHtml5StreamParser(nsHtml5TreeOpExecutor
* aExecutor
,
118 nsHtml5Parser
* aOwner
,
121 // Methods that nsHtml5StreamListener calls
122 nsresult
CheckListenerChain();
124 nsresult
OnStartRequest(nsIRequest
* aRequest
, nsISupports
* aContext
);
126 nsresult
OnDataAvailable(nsIRequest
* aRequest
,
127 nsISupports
* aContext
,
128 nsIInputStream
* aInStream
,
129 uint64_t aSourceOffset
,
132 nsresult
OnStopRequest(nsIRequest
* aRequest
,
133 nsISupports
* aContext
,
136 // nsICharsetDetectionObserver
138 * Chardet calls this to report the detection result
140 NS_IMETHOD
Notify(const char* aCharset
, nsDetectionConfident aConf
);
142 // EncodingDeclarationHandler
143 // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
145 * Tree builder uses this to report a late <meta charset>
147 bool internalEncodingDeclaration(nsString
* aEncoding
);
149 // Not from an external interface
152 * Call this method once you've created a parser, and want to instruct it
153 * about what charset to load
155 * @param aCharset the charset of a document
156 * @param aSource the source of the charset
158 inline void SetDocumentCharset(const nsACString
& aCharset
, int32_t aSource
) {
159 NS_PRECONDITION(mStreamState
== STREAM_NOT_STARTED
,
160 "SetDocumentCharset called too late.");
161 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
163 mCharsetSource
= aSource
;
166 inline void SetObserver(nsIRequestObserver
* aObserver
) {
167 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
168 mObserver
= aObserver
;
171 nsresult
GetChannel(nsIChannel
** aChannel
);
174 * The owner parser must call this after script execution
175 * when no scripts are executing and the document.written
176 * buffer has been exhausted.
178 void ContinueAfterScripts(nsHtml5Tokenizer
* aTokenizer
,
179 nsHtml5TreeBuilder
* aTreeBuilder
,
183 * Continues the stream parser if the charset switch failed.
185 void ContinueAfterFailedCharsetSwitch();
189 mozilla::MutexAutoLock
autoLock(mTerminatedMutex
);
196 * Sets mCharset and mCharsetSource appropriately for the XML View Source
197 * case if aEncoding names a supported rough ASCII superset and sets
198 * the mCharset and mCharsetSource to the UTF-8 default otherwise.
200 void SetEncodingFromExpat(const char16_t
* aEncoding
);
203 * Sets the URL for View Source title in case this parser ends up being
204 * used for View Source. If aURL is a view-source: URL, takes the inner
205 * URL. data: URLs are shown with an ellipsis instead of the actual data.
207 void SetViewSourceTitle(nsIURI
* aURL
);
210 virtual ~nsHtml5StreamParser();
213 bool IsParserThread() {
215 mThread
->IsOnCurrentThread(&ret
);
220 void MarkAsBroken(nsresult aRv
);
223 * Marks the stream parser as interrupted. If you ever add calls to this
224 * method, be sure to review Uninterrupt usage very, very carefully to
225 * avoid having a previous in-flight runnable cancel your Interrupt()
226 * call on the other thread too soon.
230 mozilla::MutexAutoLock
autoLock(mTerminatedMutex
);
236 NS_ASSERTION(IsParserThread(), "Wrong thread!");
237 mTokenizerMutex
.AssertCurrentThreadOwns();
238 // Not acquiring mTerminatedMutex because mTokenizerMutex is already
239 // held at this point and is already stronger.
240 mInterrupted
= false;
244 * Flushes the tree ops from the tree builder and disarms the flush
247 void FlushTreeOpsAndDisarmTimer();
249 void ParseAvailableData();
251 void DoStopRequest();
253 void DoDataAvailable(const uint8_t* aBuffer
, uint32_t aLength
);
255 static NS_METHOD
CopySegmentsToParser(nsIInputStream
*aInStream
,
257 const char *aFromSegment
,
260 uint32_t *aWriteCount
);
262 bool IsTerminatedOrInterrupted()
264 mozilla::MutexAutoLock
autoLock(mTerminatedMutex
);
265 return mTerminated
|| mInterrupted
;
270 mozilla::MutexAutoLock
autoLock(mTerminatedMutex
);
275 * True when there is a Unicode decoder already
277 inline bool HasDecoder()
279 return !!mUnicodeDecoder
;
283 * Push bytes from network when there is no Unicode decoder yet
285 nsresult
SniffStreamBytes(const uint8_t* aFromSegment
,
287 uint32_t* aWriteCount
);
290 * Push bytes from network when there is a Unicode decoder already
292 nsresult
WriteStreamBytes(const uint8_t* aFromSegment
,
294 uint32_t* aWriteCount
);
297 * Check whether every other byte in the sniffing buffer is zero.
299 void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment
,
300 uint32_t aCountToSniffingLimit
);
303 * <meta charset> scan failed. Try chardet if applicable. After this, the
304 * parser will have some encoding, even if it is only a last-resort fallback.
306 * @param aFromSegment The current network buffer or null if the sniffing
307 * buffer is being flushed due to network stream ending.
308 * @param aCount The number of bytes in aFromSegment (ignored if
309 * aFromSegment is null)
310 * @param aWriteCount Return value for how many bytes got read from the
312 * @param aCountToSniffingLimit The number of unfilled slots in
315 nsresult
FinalizeSniffing(const uint8_t* aFromSegment
,
317 uint32_t* aWriteCount
,
318 uint32_t aCountToSniffingLimit
);
321 * Set up the Unicode decoder and write the sniffing buffer into it
322 * followed by the current network buffer.
324 * @param aFromSegment The current network buffer or null if the sniffing
325 * buffer is being flushed due to network stream ending.
326 * @param aCount The number of bytes in aFromSegment (ignored if
327 * aFromSegment is null)
328 * @param aWriteCount Return value for how many bytes got read from the
331 nsresult
SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment
,
333 uint32_t* aWriteCount
);
336 * Initialize the Unicode decoder, mark the BOM as the source and
339 * @param aDecoderCharsetName The name for the decoder's charset
340 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
343 nsresult
SetupDecodingFromBom(const char* aDecoderCharsetName
);
346 * Become confident or resolve an encoding name to its preferred form.
347 * @param aEncoding the value of an internal encoding decl. Acts as an
348 * out param, too, when the method returns true.
349 * @return true if the parser needs to start using the new value of
350 * aEncoding and false if the parser became confident or if
351 * the encoding name did not specify a usable encoding
353 bool PreferredForInternalEncodingDecl(nsACString
& aEncoding
);
356 * Callback for mFlushTimer.
358 static void TimerCallback(nsITimer
* aTimer
, void* aClosure
);
361 * Parser thread entry point for (maybe) flushing the ops and posting
362 * a flush runnable back on the main thread.
366 nsCOMPtr
<nsIRequest
> mRequest
;
367 nsCOMPtr
<nsIRequestObserver
> mObserver
;
370 * The document title to use if this turns out to be a View Source parser.
372 nsCString mViewSourceTitle
;
375 * The Unicode decoder
377 nsCOMPtr
<nsIUnicodeDecoder
> mUnicodeDecoder
;
380 * The buffer for sniffing the character encoding
382 nsAutoArrayPtr
<uint8_t> mSniffingBuffer
;
385 * The number of meaningful bytes in mSniffingBuffer
387 uint32_t mSniffingLength
;
395 * <meta> prescan implementation
397 nsAutoPtr
<nsHtml5MetaScanner
> mMetaScanner
;
399 // encoding-related stuff
401 * The source (confidence) of the character encoding in use
403 int32_t mCharsetSource
;
406 * The character encoding in use
411 * Whether reparse is forbidden
413 bool mReparseForbidden
;
415 // Portable parser objects
417 * The first buffer in the pending UTF-16 buffer queue
419 nsRefPtr
<nsHtml5OwningUTF16Buffer
> mFirstBuffer
;
422 * The last buffer in the pending UTF-16 buffer queue
424 nsHtml5OwningUTF16Buffer
* mLastBuffer
; // weak ref; always points to
425 // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
428 * The tree operation executor
430 nsHtml5TreeOpExecutor
* mExecutor
;
433 * The HTML5 tree builder
435 nsAutoPtr
<nsHtml5TreeBuilder
> mTreeBuilder
;
438 * The HTML5 tokenizer
440 nsAutoPtr
<nsHtml5Tokenizer
> mTokenizer
;
443 * Makes sure the main thread can't mess with the tokenizer state while it's
444 * tokenizing. This mutex also protects the current speculation.
446 mozilla::Mutex mTokenizerMutex
;
449 * The scoped atom table
451 nsHtml5AtomTable mAtomTable
;
456 nsRefPtr
<nsHtml5Parser
> mOwner
;
459 * Whether the last character tokenized was a carriage return (for CRLF)
464 * For tracking stream life cycle
466 eHtml5StreamState mStreamState
;
469 * Whether we are speculating.
474 * Whether the tokenizer has reached EOF. (Reset when the stream is rewound.)
479 * The speculations. The mutex protects the nsTArray itself.
480 * To access the queue of current speculation, mTokenizerMutex must be
482 * The current speculation is the last element
484 nsTArray
<nsAutoPtr
<nsHtml5Speculation
> > mSpeculations
;
485 mozilla::Mutex mSpeculationMutex
;
488 * True to terminate early; protected by mTerminatedMutex
492 mozilla::Mutex mTerminatedMutex
;
495 * The thread this stream parser runs on.
497 nsCOMPtr
<nsIThread
> mThread
;
499 nsCOMPtr
<nsIRunnable
> mExecutorFlusher
;
501 nsCOMPtr
<nsIRunnable
> mLoadFlusher
;
504 * The chardet instance if chardet is enabled.
506 nsCOMPtr
<nsICharsetDetector
> mChardet
;
509 * If false, don't push data to chardet.
514 * Whether the initial charset source was kCharsetFromParentFrame
516 bool mInitialEncodingWasFromParentFrame
;
519 * Timer for flushing tree ops once in a while when not speculating.
521 nsCOMPtr
<nsITimer
> mFlushTimer
;
524 * Keeps track of whether mFlushTimer has been armed. Unfortunately,
525 * nsITimer doesn't enable querying this from the timer itself.
527 bool mFlushTimerArmed
;
530 * False initially and true after the timer has fired at least once.
532 bool mFlushTimerEverFired
;
535 * Whether the parser is doing a normal parse, view source or plain text.
540 * The pref html5.flushtimer.initialdelay: Time in milliseconds between
541 * the time a network buffer is seen and the timer firing when the
542 * timer hasn't fired previously in this parse.
544 static int32_t sTimerInitialDelay
;
547 * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
548 * the time a network buffer is seen and the timer firing when the
549 * timer has already fired previously in this parse.
551 static int32_t sTimerSubsequentDelay
;
554 #endif // nsHtml5StreamParser_h