/*
 * Copyright (c) 2005-2007 Henri Sivonen
 * Copyright (c) 2007-2015 Mozilla Foundation
 * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
 * Foundation, and Opera Software ASA.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Please edit Tokenizer.java instead and regenerate.
 */
31 #ifndef nsHtml5Tokenizer_h
32 #define nsHtml5Tokenizer_h
35 #include "nsHtml5AtomTable.h"
36 #include "nsHtml5String.h"
37 #include "nsIContent.h"
38 #include "nsTraceRefcnt.h"
40 #include "nsHtml5DocumentMode.h"
41 #include "nsHtml5ArrayCopy.h"
42 #include "nsHtml5NamedCharacters.h"
43 #include "nsHtml5NamedCharactersAccel.h"
44 #include "nsGkAtoms.h"
45 #include "nsAHtml5TreeBuilderState.h"
46 #include "nsHtml5Macros.h"
47 #include "nsHtml5Highlighter.h"
48 #include "nsHtml5TokenizerLoopPolicies.h"
// Forward declarations of related HTML5 parser classes, so that this header
// can name them without including their full definitions.
class nsHtml5StreamParser;
class nsHtml5AttributeName;
class nsHtml5ElementName;
class nsHtml5TreeBuilder;
class nsHtml5MetaScanner;
class nsHtml5UTF16Buffer;
class nsHtml5StateSnapshot;
class nsHtml5Portability;
60 class nsHtml5Tokenizer
// Mask that clears the lowest bit of a state value. DATA (0) and RCDATA (1)
// are the only values that become zero once this mask is applied.
static const int32_t DATA_AND_RCDATA_MASK = ~1;

// Integer state codes, numbered consecutively from 0 to 74.
// NOTE(review): presumably these are the values carried by the `state`,
// `returnState` and `specialTokenizerState` parameters of this class --
// confirm against Tokenizer.java.
static const int32_t DATA = 0;
static const int32_t RCDATA = 1;
static const int32_t SCRIPT_DATA = 2;
static const int32_t RAWTEXT = 3;
static const int32_t SCRIPT_DATA_ESCAPED = 4;
static const int32_t ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
static const int32_t ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
static const int32_t ATTRIBUTE_VALUE_UNQUOTED = 7;
static const int32_t PLAINTEXT = 8;
static const int32_t TAG_OPEN = 9;
static const int32_t CLOSE_TAG_OPEN = 10;
static const int32_t TAG_NAME = 11;
static const int32_t BEFORE_ATTRIBUTE_NAME = 12;
static const int32_t ATTRIBUTE_NAME = 13;
static const int32_t AFTER_ATTRIBUTE_NAME = 14;
static const int32_t BEFORE_ATTRIBUTE_VALUE = 15;
static const int32_t AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
static const int32_t BOGUS_COMMENT = 17;
static const int32_t MARKUP_DECLARATION_OPEN = 18;
static const int32_t DOCTYPE = 19;
static const int32_t BEFORE_DOCTYPE_NAME = 20;
static const int32_t DOCTYPE_NAME = 21;
static const int32_t AFTER_DOCTYPE_NAME = 22;
static const int32_t BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
static const int32_t DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
static const int32_t DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
static const int32_t AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
static const int32_t BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
static const int32_t DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
static const int32_t DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
static const int32_t AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
static const int32_t BOGUS_DOCTYPE = 31;
static const int32_t COMMENT_START = 32;
static const int32_t COMMENT_START_DASH = 33;
static const int32_t COMMENT = 34;
static const int32_t COMMENT_END_DASH = 35;
static const int32_t COMMENT_END = 36;
static const int32_t COMMENT_END_BANG = 37;
static const int32_t NON_DATA_END_TAG_NAME = 38;
static const int32_t MARKUP_DECLARATION_HYPHEN = 39;
static const int32_t MARKUP_DECLARATION_OCTYPE = 40;
static const int32_t DOCTYPE_UBLIC = 41;
static const int32_t DOCTYPE_YSTEM = 42;
static const int32_t AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
static const int32_t BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
static const int32_t AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
static const int32_t CONSUME_CHARACTER_REFERENCE = 46;
static const int32_t CONSUME_NCR = 47;
static const int32_t CHARACTER_REFERENCE_TAIL = 48;
static const int32_t HEX_NCR_LOOP = 49;
// Spelled "NRC" here although the neighbouring names use "NCR"; the
// identifier is kept exactly as generated so existing users still compile.
static const int32_t DECIMAL_NRC_LOOP = 50;
static const int32_t HANDLE_NCR_VALUE = 51;
static const int32_t HANDLE_NCR_VALUE_RECONSUME = 52;
static const int32_t CHARACTER_REFERENCE_HILO_LOOKUP = 53;
static const int32_t SELF_CLOSING_START_TAG = 54;
static const int32_t CDATA_START = 55;
static const int32_t CDATA_SECTION = 56;
static const int32_t CDATA_RSQB = 57;
static const int32_t CDATA_RSQB_RSQB = 58;
static const int32_t SCRIPT_DATA_LESS_THAN_SIGN = 59;
static const int32_t SCRIPT_DATA_ESCAPE_START = 60;
static const int32_t SCRIPT_DATA_ESCAPE_START_DASH = 61;
static const int32_t SCRIPT_DATA_ESCAPED_DASH = 62;
static const int32_t SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
static const int32_t BOGUS_COMMENT_HYPHEN = 64;
static const int32_t RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
static const int32_t SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
static const int32_t SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED = 68;
static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
static const int32_t SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
static const int32_t SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
static const int32_t PROCESSING_INSTRUCTION = 73;
static const int32_t PROCESSING_INSTRUCTION_QUESTION_MARK = 74;

// Evaluates to 0xD7C0.
// NOTE(review): this looks like the usual bias for deriving a UTF-16 lead
// surrogate as LEAD_OFFSET + (codePoint >> 10) -- confirm at the use site.
static const int32_t LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
219 static char16_t LT_GT
[];
220 static char16_t LT_SOLIDUS
[];
221 static char16_t RSQB_RSQB
[];
222 static char16_t REPLACEMENT_CHARACTER
[];
223 static char16_t LF
[];
224 static char16_t CDATA_LSQB
[];
225 static char16_t OCTYPE
[];
226 static char16_t UBLIC
[];
227 static char16_t YSTEM
[];
228 static staticJArray
<char16_t
, int32_t> TITLE_ARR
;
229 static staticJArray
<char16_t
, int32_t> SCRIPT_ARR
;
230 static staticJArray
<char16_t
, int32_t> STYLE_ARR
;
231 static staticJArray
<char16_t
, int32_t> PLAINTEXT_ARR
;
232 static staticJArray
<char16_t
, int32_t> XMP_ARR
;
233 static staticJArray
<char16_t
, int32_t> TEXTAREA_ARR
;
234 static staticJArray
<char16_t
, int32_t> IFRAME_ARR
;
235 static staticJArray
<char16_t
, int32_t> NOEMBED_ARR
;
236 static staticJArray
<char16_t
, int32_t> NOSCRIPT_ARR
;
237 static staticJArray
<char16_t
, int32_t> NOFRAMES_ARR
;
240 nsHtml5TreeBuilder
* tokenHandler
;
241 nsHtml5StreamParser
* encodingDeclarationHandler
;
246 int32_t returnStateSave
;
255 int32_t firstCharKey
;
259 int32_t charRefBufMark
;
271 nsHtml5String publicId
;
272 nsHtml5String systemId
;
273 autoJArray
<char16_t
, int32_t> strBuf
;
275 autoJArray
<char16_t
, int32_t> charRefBuf
;
276 int32_t charRefBufLen
;
277 autoJArray
<char16_t
, int32_t> bmpChar
;
278 autoJArray
<char16_t
, int32_t> astralChar
;
281 nsHtml5ElementName
* endTagExpectation
;
284 jArray
<char16_t
, int32_t> endTagExpectationAsArray
;
291 nsHtml5ElementName
* tagName
;
292 nsHtml5ElementName
* nonInternedTagName
;
295 nsHtml5AttributeName
* attributeName
;
298 nsHtml5AttributeName
* nonInternedAttributeName
;
300 nsHtml5String publicIdentifier
;
301 nsHtml5String systemIdentifier
;
302 nsHtml5HtmlAttributes
* attributes
;
303 bool newAttributesEachTime
;
311 int32_t attributeLine
;
312 nsHtml5AtomTable
* interner
;
313 bool viewingXmlSource
;
316 nsHtml5Tokenizer(nsHtml5TreeBuilder
* tokenHandler
, bool viewingXmlSource
);
317 void setInterner(nsHtml5AtomTable
* interner
);
318 void initLocation(nsHtml5String newPublicId
, nsHtml5String newSystemId
);
319 bool isViewingXmlSource();
320 void setStateAndEndTagExpectation(int32_t specialTokenizerState
,
321 nsAtom
* endTagExpectation
);
322 void setStateAndEndTagExpectation(int32_t specialTokenizerState
,
323 nsHtml5ElementName
* endTagExpectation
);
326 void endTagExpectationToArray();
329 void setLineNumber(int32_t line
);
330 inline int32_t getLineNumber() { return line
; }
332 nsHtml5HtmlAttributes
* emptyAttributes();
335 inline void appendCharRefBuf(char16_t c
)
337 MOZ_RELEASE_ASSERT(charRefBufLen
< charRefBuf
.length
,
338 "Attempted to overrun charRefBuf!");
339 charRefBuf
[charRefBufLen
++] = c
;
342 void emitOrAppendCharRefBuf(int32_t returnState
);
343 inline void clearStrBufAfterUse() { strBufLen
= 0; }
// Asserts (MOZ_ASSERT) that strBufLen is already zero before the buffer is
// used again.
// NOTE(review): this copy shows no braces and its embedded line numbers skip
// from 347 to 351, so part of the body may be missing here -- compare with the
// regenerated header before relying on it.
345 inline void clearStrBufBeforeUse()
347 MOZ_ASSERT(!strBufLen
, "strBufLen not reset after previous use!");
// Asserts (MOZ_ASSERT) that strBuf holds exactly one code unit and that it is
// a hyphen ('-').
// NOTE(review): this copy shows no braces and its embedded line numbers skip
// from 354 to 358, so the statement that actually clears the buffer is
// probably missing here -- compare with the regenerated header.
351 inline void clearStrBufAfterOneHyphen()
353 MOZ_ASSERT(strBufLen
== 1, "strBufLen length not one!");
354 MOZ_ASSERT(strBuf
[0] == '-', "strBuf does not start with a hyphen!");
358 inline void appendStrBuf(char16_t c
)
360 MOZ_ASSERT(strBufLen
< strBuf
.length
,
361 "Previous buffer length insufficient.");
362 if (MOZ_UNLIKELY(strBufLen
== strBuf
.length
)) {
363 if (MOZ_UNLIKELY(!EnsureBufferSpace(1))) {
364 MOZ_CRASH("Unable to recover from buffer reallocation failure");
367 strBuf
[strBufLen
++] = c
;
371 nsHtml5String
strBufToString();
374 void strBufToDoctypeName();
376 inline void appendSecondHyphenToBogusComment() { appendStrBuf('-'); }
// Calls errConsecutiveHyphens(). No statement visible here uses the
// parameter c.
// NOTE(review): this copy shows no braces and its embedded line numbers skip
// from 380 to 384, so the statement that appends c is probably missing here --
// compare with the regenerated header.
378 inline void adjustDoubleHyphenAndAppendToStrBufAndErr(char16_t c
)
380 errConsecutiveHyphens();
384 void appendStrBuf(char16_t
* buffer
, int32_t offset
, int32_t length
);
// Passes charRefBuf, offset 0 and charRefBufLen to the range overload of
// appendStrBuf().
// NOTE(review): this copy shows no braces and its embedded line numbers skip
// from 387 to 391, so a trailing statement may be missing here -- compare with
// the regenerated header.
385 inline void appendCharRefBufToStrBuf()
387 appendStrBuf(charRefBuf
, 0, charRefBufLen
);
391 void emitComment(int32_t provisionalHyphens
, int32_t pos
);
394 void flushChars(char16_t
* buf
, int32_t pos
);
397 void strBufToElementNameString();
398 int32_t emitCurrentTagToken(bool selfClosing
, int32_t pos
);
399 void attributeNameComplete();
400 void addAttributeWithoutValue();
401 void addAttributeWithValue();
405 bool tokenizeBuffer(nsHtml5UTF16Buffer
* buffer
);
// Start of the stateLoop() declaration: it returns an int32_t and its first
// parameter is an int32_t named state.
// NOTE(review): the remaining parameters and the closing ");" are missing
// from this copy (the embedded line numbers skip from 409 to 416) -- restore
// them from the regenerated header.
409 int32_t stateLoop(int32_t state
,
// Declared here without a body; takes no arguments and returns nothing.
void initDoctypeFields();
417 inline void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
419 silentCarriageReturn();
420 adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
// Line-feed variant: forwards '\n' to
// adjustDoubleHyphenAndAppendToStrBufAndErr().
// NOTE(review): this copy shows no braces and its embedded line numbers skip
// from 423 to 426, so a statement before the visible call may be missing --
// compare with the regenerated header.
423 inline void adjustDoubleHyphenAndAppendToStrBufLineFeed()
426 adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
// NOTE(review): only the signature of this inline member is present in this
// copy; its body (embedded line numbers 430-434) is missing and must be
// restored from the regenerated header.
429 inline void appendStrBufLineFeed()
// Calls silentCarriageReturn().
// NOTE(review): this copy shows no braces and its embedded line numbers skip
// from 437 to 442, so the statement that appends to strBuf is probably
// missing here -- compare with the regenerated header.
435 inline void appendStrBufCarriageReturn()
437 silentCarriageReturn();
// NOTE(review): only the signature of this inline member is present in this
// copy; its body (embedded line numbers 443-447) is missing and must be
// restored from the regenerated header.
442 inline void silentCarriageReturn()
448 inline void silentLineFeed() { ++line
; }
451 void emitCarriageReturn(char16_t
* buf
, int32_t pos
);
452 void emitReplacementCharacter(char16_t
* buf
, int32_t pos
);
453 void emitPlaintextReplacementCharacter(char16_t
* buf
, int32_t pos
);
454 void setAdditionalAndRememberAmpersandLocation(char16_t add
);
456 void bogusDoctypeWithoutQuirks();
457 void handleNcrValue(int32_t returnState
);
463 void emitDoctypeToken(int32_t pos
);
// Returns the code unit stored at buf[pos]. No bounds check is made here, so
// the caller has to pass a valid index.
inline char16_t checkChar(char16_t* buf, int32_t pos) { return buf[pos]; }
469 bool internalEncodingDeclaration(nsHtml5String internalCharset
);
472 void emitOrAppendTwo(const char16_t
* val
, int32_t returnState
);
473 void emitOrAppendOne(const char16_t
* val
, int32_t returnState
);
477 void requestSuspension();
478 bool isInDataState();
479 void resetToDataState();
480 void loadState(nsHtml5Tokenizer
* other
);
481 void initializeWithoutStarting();
482 void setEncodingDeclarationHandler(
483 nsHtml5StreamParser
* encodingDeclarationHandler
);
485 static void initializeStatics();
486 static void releaseStatics();
488 #include "nsHtml5TokenizerHSupplement.h"