Bumping gaia.json for 1 gaia revision(s) a=gaia-bump
[gecko.git] / parser / html / nsHtml5StreamParser.h
blobe4ff429194c4f1e332c6b91fca2b6f4d8d16c862
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef nsHtml5StreamParser_h
7 #define nsHtml5StreamParser_h
9 #include "nsAutoPtr.h"
10 #include "nsCOMPtr.h"
11 #include "nsICharsetDetectionObserver.h"
12 #include "nsHtml5MetaScanner.h"
13 #include "nsIUnicodeDecoder.h"
14 #include "nsHtml5TreeOpExecutor.h"
15 #include "nsHtml5OwningUTF16Buffer.h"
16 #include "nsIInputStream.h"
17 #include "mozilla/Mutex.h"
18 #include "nsHtml5AtomTable.h"
19 #include "nsHtml5Speculation.h"
20 #include "nsITimer.h"
21 #include "nsICharsetDetector.h"
23 class nsHtml5Parser;
25 #define NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE 1024
26 #define NS_HTML5_STREAM_PARSER_SNIFFING_BUFFER_SIZE 1024
28 enum eParserMode {
29 /**
30 * Parse a document normally as HTML.
32 NORMAL,
34 /**
35 * View document as HTML source.
37 VIEW_SOURCE_HTML,
39 /**
40 * View document as XML source
42 VIEW_SOURCE_XML,
44 /**
45 * View document as plain text source
47 VIEW_SOURCE_PLAIN,
49 /**
50 * View document as plain text
52 PLAIN_TEXT,
54 /**
55 * Load as data (XHR)
57 LOAD_AS_DATA
60 enum eBomState {
61 /**
62 * BOM sniffing hasn't started.
64 BOM_SNIFFING_NOT_STARTED = 0,
66 /**
67 * BOM sniffing is ongoing, and the first byte of an UTF-16LE BOM has been
68 * seen.
70 SEEN_UTF_16_LE_FIRST_BYTE = 1,
72 /**
73 * BOM sniffing is ongoing, and the first byte of an UTF-16BE BOM has been
74 * seen.
76 SEEN_UTF_16_BE_FIRST_BYTE = 2,
78 /**
79 * BOM sniffing is ongoing, and the first byte of an UTF-8 BOM has been
80 * seen.
82 SEEN_UTF_8_FIRST_BYTE = 3,
84 /**
85 * BOM sniffing is ongoing, and the first and second bytes of an UTF-8 BOM
86 * have been seen.
88 SEEN_UTF_8_SECOND_BYTE = 4,
90 /**
91 * BOM sniffing was started but is now over for whatever reason.
93 BOM_SNIFFING_OVER = 5
96 enum eHtml5StreamState {
97 STREAM_NOT_STARTED = 0,
98 STREAM_BEING_READ = 1,
99 STREAM_ENDED = 2
102 class nsHtml5StreamParser : public nsICharsetDetectionObserver {
104 friend class nsHtml5RequestStopper;
105 friend class nsHtml5DataAvailable;
106 friend class nsHtml5StreamParserContinuation;
107 friend class nsHtml5TimerKungFu;
109 public:
110 NS_DECL_AND_IMPL_ZEROING_OPERATOR_NEW
111 NS_DECL_CYCLE_COLLECTING_ISUPPORTS
112 NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsHtml5StreamParser,
113 nsICharsetDetectionObserver)
115 static void InitializeStatics();
117 nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
118 nsHtml5Parser* aOwner,
119 eParserMode aMode);
121 // Methods that nsHtml5StreamListener calls
122 nsresult CheckListenerChain();
124 nsresult OnStartRequest(nsIRequest* aRequest, nsISupports* aContext);
126 nsresult OnDataAvailable(nsIRequest* aRequest,
127 nsISupports* aContext,
128 nsIInputStream* aInStream,
129 uint64_t aSourceOffset,
130 uint32_t aLength);
132 nsresult OnStopRequest(nsIRequest* aRequest,
133 nsISupports* aContext,
134 nsresult status);
136 // nsICharsetDetectionObserver
138 * Chardet calls this to report the detection result
140 NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf);
142 // EncodingDeclarationHandler
143 // http://hg.mozilla.org/projects/htmlparser/file/tip/src/nu/validator/htmlparser/common/EncodingDeclarationHandler.java
145 * Tree builder uses this to report a late <meta charset>
147 bool internalEncodingDeclaration(nsString* aEncoding);
149 // Not from an external interface
152 * Call this method once you've created a parser, and want to instruct it
153 * about what charset to load
155 * @param aCharset the charset of a document
156 * @param aSource the source of the charset
158 inline void SetDocumentCharset(const nsACString& aCharset, int32_t aSource) {
159 NS_PRECONDITION(mStreamState == STREAM_NOT_STARTED,
160 "SetDocumentCharset called too late.");
161 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
162 mCharset = aCharset;
163 mCharsetSource = aSource;
166 inline void SetObserver(nsIRequestObserver* aObserver) {
167 NS_ASSERTION(NS_IsMainThread(), "Wrong thread!");
168 mObserver = aObserver;
171 nsresult GetChannel(nsIChannel** aChannel);
174 * The owner parser must call this after script execution
175 * when no scripts are executing and the document.written
176 * buffer has been exhausted.
178 void ContinueAfterScripts(nsHtml5Tokenizer* aTokenizer,
179 nsHtml5TreeBuilder* aTreeBuilder,
180 bool aLastWasCR);
183 * Continues the stream parser if the charset switch failed.
185 void ContinueAfterFailedCharsetSwitch();
187 void Terminate()
189 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
190 mTerminated = true;
193 void DropTimer();
196 * Sets mCharset and mCharsetSource appropriately for the XML View Source
197 * case if aEncoding names a supported rough ASCII superset and sets
198 * the mCharset and mCharsetSource to the UTF-8 default otherwise.
200 void SetEncodingFromExpat(const char16_t* aEncoding);
203 * Sets the URL for View Source title in case this parser ends up being
204 * used for View Source. If aURL is a view-source: URL, takes the inner
205 * URL. data: URLs are shown with an ellipsis instead of the actual data.
207 void SetViewSourceTitle(nsIURI* aURL);
209 private:
210 virtual ~nsHtml5StreamParser();
212 #ifdef DEBUG
213 bool IsParserThread() {
214 bool ret;
215 mThread->IsOnCurrentThread(&ret);
216 return ret;
218 #endif
220 void MarkAsBroken(nsresult aRv);
223 * Marks the stream parser as interrupted. If you ever add calls to this
224 * method, be sure to review Uninterrupt usage very, very carefully to
225 * avoid having a previous in-flight runnable cancel your Interrupt()
226 * call on the other thread too soon.
228 void Interrupt()
230 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
231 mInterrupted = true;
234 void Uninterrupt()
236 NS_ASSERTION(IsParserThread(), "Wrong thread!");
237 mTokenizerMutex.AssertCurrentThreadOwns();
238 // Not acquiring mTerminatedMutex because mTokenizerMutex is already
239 // held at this point and is already stronger.
240 mInterrupted = false;
244 * Flushes the tree ops from the tree builder and disarms the flush
245 * timer.
247 void FlushTreeOpsAndDisarmTimer();
249 void ParseAvailableData();
251 void DoStopRequest();
253 void DoDataAvailable(const uint8_t* aBuffer, uint32_t aLength);
255 static NS_METHOD CopySegmentsToParser(nsIInputStream *aInStream,
256 void *aClosure,
257 const char *aFromSegment,
258 uint32_t aToOffset,
259 uint32_t aCount,
260 uint32_t *aWriteCount);
262 bool IsTerminatedOrInterrupted()
264 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
265 return mTerminated || mInterrupted;
268 bool IsTerminated()
270 mozilla::MutexAutoLock autoLock(mTerminatedMutex);
271 return mTerminated;
275 * True when there is a Unicode decoder already
277 inline bool HasDecoder()
279 return !!mUnicodeDecoder;
283 * Push bytes from network when there is no Unicode decoder yet
285 nsresult SniffStreamBytes(const uint8_t* aFromSegment,
286 uint32_t aCount,
287 uint32_t* aWriteCount);
290 * Push bytes from network when there is a Unicode decoder already
292 nsresult WriteStreamBytes(const uint8_t* aFromSegment,
293 uint32_t aCount,
294 uint32_t* aWriteCount);
297 * Check whether every other byte in the sniffing buffer is zero.
299 void SniffBOMlessUTF16BasicLatin(const uint8_t* aFromSegment,
300 uint32_t aCountToSniffingLimit);
303 * <meta charset> scan failed. Try chardet if applicable. After this, the
304 * parser will have some encoding even if a last resort fallback.
306 * @param aFromSegment The current network buffer or null if the sniffing
307 * buffer is being flushed due to network stream ending.
308 * @param aCount The number of bytes in aFromSegment (ignored if
309 * aFromSegment is null)
310 * @param aWriteCount Return value for how many bytes got read from the
311 * buffer.
312 * @param aCountToSniffingLimit The number of unfilled slots in
313 * mSniffingBuffer
315 nsresult FinalizeSniffing(const uint8_t* aFromSegment,
316 uint32_t aCount,
317 uint32_t* aWriteCount,
318 uint32_t aCountToSniffingLimit);
321 * Set up the Unicode decoder and write the sniffing buffer into it
322 * followed by the current network buffer.
324 * @param aFromSegment The current network buffer or null if the sniffing
325 * buffer is being flushed due to network stream ending.
326 * @param aCount The number of bytes in aFromSegment (ignored if
327 * aFromSegment is null)
328 * @param aWriteCount Return value for how many bytes got read from the
329 * buffer.
331 nsresult SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
332 uint32_t aCount,
333 uint32_t* aWriteCount);
336 * Initialize the Unicode decoder, mark the BOM as the source and
337 * drop the sniffer.
339 * @param aDecoderCharsetName The name for the decoder's charset
340 * (UTF-16BE, UTF-16LE or UTF-8; the BOM has
341 * been swallowed)
343 nsresult SetupDecodingFromBom(const char* aDecoderCharsetName);
346 * Become confident or resolve an encoding name to its preferred form.
347 * @param aEncoding the value of an internal encoding decl. Acts as an
348 * out param, too, when the method returns true.
349 * @return true if the parser needs to start using the new value of
350 * aEncoding and false if the parser became confident or if
351 * the encoding name did not specify a usable encoding
353 bool PreferredForInternalEncodingDecl(nsACString& aEncoding);
356 * Callback for mFlushTimer.
358 static void TimerCallback(nsITimer* aTimer, void* aClosure);
361 * Parser thread entry point for (maybe) flushing the ops and posting
362 * a flush runnable back on the main thread.
364 void TimerFlush();
366 nsCOMPtr<nsIRequest> mRequest;
367 nsCOMPtr<nsIRequestObserver> mObserver;
370 * The document title to use if this turns out to be a View Source parser.
372 nsCString mViewSourceTitle;
375 * The Unicode decoder
377 nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
380 * The buffer for sniffing the character encoding
382 nsAutoArrayPtr<uint8_t> mSniffingBuffer;
385 * The number of meaningful bytes in mSniffingBuffer
387 uint32_t mSniffingLength;
390 * BOM sniffing state
392 eBomState mBomState;
395 * <meta> prescan implementation
397 nsAutoPtr<nsHtml5MetaScanner> mMetaScanner;
399 // encoding-related stuff
401 * The source (confidence) of the character encoding in use
403 int32_t mCharsetSource;
406 * The character encoding in use
408 nsCString mCharset;
411 * Whether reparse is forbidden
413 bool mReparseForbidden;
415 // Portable parser objects
417 * The first buffer in the pending UTF-16 buffer queue
419 nsRefPtr<nsHtml5OwningUTF16Buffer> mFirstBuffer;
422 * The last buffer in the pending UTF-16 buffer queue
424 nsHtml5OwningUTF16Buffer* mLastBuffer; // weak ref; always points to
425 // a buffer of the size NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE
428 * The tree operation executor
430 nsHtml5TreeOpExecutor* mExecutor;
433 * The HTML5 tree builder
435 nsAutoPtr<nsHtml5TreeBuilder> mTreeBuilder;
438 * The HTML5 tokenizer
440 nsAutoPtr<nsHtml5Tokenizer> mTokenizer;
443 * Makes sure the main thread can't mess the tokenizer state while it's
444 * tokenizing. This mutex also protects the current speculation.
446 mozilla::Mutex mTokenizerMutex;
449 * The scoped atom table
451 nsHtml5AtomTable mAtomTable;
454 * The owner parser.
456 nsRefPtr<nsHtml5Parser> mOwner;
459 * Whether the last character tokenized was a carriage return (for CRLF)
461 bool mLastWasCR;
464 * For tracking stream life cycle
466 eHtml5StreamState mStreamState;
469 * Whether we are speculating.
471 bool mSpeculating;
474 * Whether the tokenizer has reached EOF. (Reset when stream rewound.)
476 bool mAtEOF;
479 * The speculations. The mutex protects the nsTArray itself.
480 * To access the queue of current speculation, mTokenizerMutex must be
481 * obtained.
482 * The current speculation is the last element
484 nsTArray<nsAutoPtr<nsHtml5Speculation> > mSpeculations;
485 mozilla::Mutex mSpeculationMutex;
488 * True to terminate early; protected by mTerminatedMutex
490 bool mTerminated;
491 bool mInterrupted;
492 mozilla::Mutex mTerminatedMutex;
495 * The thread this stream parser runs on.
497 nsCOMPtr<nsIThread> mThread;
499 nsCOMPtr<nsIRunnable> mExecutorFlusher;
501 nsCOMPtr<nsIRunnable> mLoadFlusher;
504 * The chardet instance if chardet is enabled.
506 nsCOMPtr<nsICharsetDetector> mChardet;
509 * If false, don't push data to chardet.
511 bool mFeedChardet;
514 * Whether the initial charset source was kCharsetFromParentFrame
516 bool mInitialEncodingWasFromParentFrame;
519 * Timer for flushing tree ops once in a while when not speculating.
521 nsCOMPtr<nsITimer> mFlushTimer;
524 * Keeps track of whether mFlushTimer has been armed. Unfortunately,
525 * nsITimer doesn't enable querying this from the timer itself.
527 bool mFlushTimerArmed;
530 * False initially and true after the timer has fired at least once.
532 bool mFlushTimerEverFired;
535 * Whether the parser is doing a normal parse, view source, plain text or
535 * load as data (see eParserMode).
537 eParserMode mMode;
540 * The pref html5.flushtimer.initialdelay: Time in milliseconds between
541 * the time a network buffer is seen and the timer firing when the
542 * timer hasn't fired previously in this parse.
544 static int32_t sTimerInitialDelay;
547 * The pref html5.flushtimer.subsequentdelay: Time in milliseconds between
548 * the time a network buffer is seen and the timer firing when the
549 * timer has already fired previously in this parse.
551 static int32_t sTimerSubsequentDelay;
554 #endif // nsHtml5StreamParser_h