2 Copyright (C) 1997 Martin Jones (mjones@kde.org)
3 (C) 1997 Torben Weis (weis@kde.org)
4 (C) 1998 Waldo Bastian (bastian@kde.org)
5 (C) 2001 Dirk Mueller (mueller@kde.org)
6 Copyright (C) 2003, 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.
8 This library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Library General Public
10 License as published by the Free Software Foundation; either
11 version 2 of the License, or (at your option) any later version.
13 This library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Library General Public License for more details.
18 You should have received a copy of the GNU Library General Public License
19 along with this library; see the file COPYING.LIB. If not, write to
20 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 Boston, MA 02110-1301, USA.
24 #ifndef HTMLTokenizer_h
25 #define HTMLTokenizer_h
27 #include "CachedResourceClient.h"
28 #include "CachedResourceHandle.h"
29 #include "NamedMappedAttrMap.h"
30 #include "SegmentedString.h"
32 #include "Tokenizer.h"
33 #include <wtf/Deque.h>
34 #include <wtf/OwnPtr.h>
35 #include <wtf/Vector.h>
40 class DocumentFragment
;
43 class HTMLViewSourceDocument
;
51 * represents one HTML tag. Consists of a numerical id, and the list
52 * of attributes. Can also represent text. In this case the id = 0 and
53 * text contains the text.
57 Token() : beginTag(true), flat(false), brokenXMLStyle(false), m_sourceInfo(0) { }
60 void addAttribute(Document
*, AtomicString
& attrName
, const AtomicString
& v
, bool viewSourceMode
);
62 bool isOpenTag(const QualifiedName
& fullName
) const { return beginTag
&& fullName
.localName() == tagName
; }
63 bool isCloseTag(const QualifiedName
& fullName
) const { return !beginTag
&& fullName
.localName() == tagName
; }
72 brokenXMLStyle
= false;
74 m_sourceInfo
->clear();
77 void addViewSourceChar(UChar c
) { if (!m_sourceInfo
.get()) m_sourceInfo
.set(new Vector
<UChar
>); m_sourceInfo
->append(c
); }
79 RefPtr
<NamedMappedAttrMap
> attrs
;
80 RefPtr
<StringImpl
> text
;
85 OwnPtr
<Vector
<UChar
> > m_sourceInfo
;
93 DoctypeBeforePublicID
,
96 DoctypeBeforeSystemID
,
111 m_state
= DoctypeBegin
;
115 DoctypeState
state() { return m_state
; }
116 void setState(DoctypeState s
) { m_state
= s
; }
118 Vector
<UChar
> m_name
;
119 Vector
<UChar
> m_publicID
;
120 Vector
<UChar
> m_systemID
;
121 DoctypeState m_state
;
123 Vector
<UChar
> m_source
;
126 //-----------------------------------------------------------------------------
128 class HTMLTokenizer
: public Tokenizer
, public CachedResourceClient
{
130 HTMLTokenizer(HTMLDocument
*, bool reportErrors
);
131 HTMLTokenizer(HTMLViewSourceDocument
*);
132 HTMLTokenizer(DocumentFragment
*);
133 virtual ~HTMLTokenizer();
135 virtual bool write(const SegmentedString
&, bool appendData
);
136 virtual void finish();
137 virtual void setForceSynchronous(bool force
);
138 virtual bool isWaitingForScripts() const;
139 virtual void stopParsing();
140 virtual bool processingData() const;
141 virtual int executingScript() const { return m_executingScript
; }
143 virtual int lineNumber() const { return m_lineNumber
; }
144 virtual int columnNumber() const { return 1; }
146 bool processingContentWrittenByScript() const { return src
.excludeLineNumbers(); }
148 virtual void executeScriptsWaitingForStylesheets();
150 virtual bool isHTMLTokenizer() const { return true; }
151 HTMLParser
* htmlParser() const { return parser
; }
156 // Where we are in parsing a tag
162 PassRefPtr
<Node
> processToken();
163 void processDoctypeToken();
165 State
processListing(SegmentedString
, State
);
166 State
parseComment(SegmentedString
&, State
);
167 State
parseDoctype(SegmentedString
&, State
);
168 State
parseServer(SegmentedString
&, State
);
169 State
parseText(SegmentedString
&, State
);
170 State
parseSpecial(SegmentedString
&, State
);
171 State
parseTag(SegmentedString
&, State
);
172 State
parseEntity(SegmentedString
&, UChar
*& dest
, State
, unsigned& _cBufferPos
, bool start
, bool parsingTag
);
173 State
parseProcessingInstruction(SegmentedString
&, State
);
174 State
scriptHandler(State
);
175 State
scriptExecution(const String
& script
, State
, const String
& scriptURL
, int baseLine
= 1);
176 void setSrc(const SegmentedString
&);
178 // check if we have enough space in the buffer.
180 inline void checkBuffer(int len
= 10)
182 if ((dest
- buffer
) > size
- len
)
186 inline void checkScriptBuffer(int len
= 10)
188 if (scriptCodeSize
+ len
>= scriptCodeMaxSize
)
189 enlargeScriptBuffer(len
);
192 void enlargeBuffer(int len
);
193 void enlargeScriptBuffer(int len
);
195 bool continueProcessing(int& processedCount
, double startTime
, State
&);
196 void timerFired(Timer
<HTMLTokenizer
>*);
197 void allDataProcessed();
199 // from CachedResourceClient
200 void notifyFinished(CachedResource
*finishedObj
);
209 // the size of buffer
214 // are we in quotes within a html tag
215 enum { NoQuote
, SingleQuote
, DoubleQuote
} tquote
;
217 // Are we in a &... character entity description?
227 unsigned EntityUnicodeValue
;
243 State() : m_bits(0) { }
245 TagState
tagState() const { return static_cast<TagState
>(m_bits
& TagMask
); }
246 void setTagState(TagState t
) { m_bits
= (m_bits
& ~TagMask
) | t
; }
247 EntityState
entityState() const { return static_cast<EntityState
>((m_bits
& EntityMask
) >> EntityShift
); }
248 void setEntityState(EntityState e
) { m_bits
= (m_bits
& ~EntityMask
) | (e
<< EntityShift
); }
250 bool inScript() const { return testBit(InScript
); }
251 void setInScript(bool v
) { setBit(InScript
, v
); }
252 bool inStyle() const { return testBit(InStyle
); }
253 void setInStyle(bool v
) { setBit(InStyle
, v
); }
254 bool inXmp() const { return testBit(InXmp
); }
255 void setInXmp(bool v
) { setBit(InXmp
, v
); }
256 bool inTitle() const { return testBit(InTitle
); }
257 void setInTitle(bool v
) { setBit(InTitle
, v
); }
258 bool inIFrame() const { return testBit(InIFrame
); }
259 void setInIFrame(bool v
) { setBit(InIFrame
, v
); }
260 bool inPlainText() const { return testBit(InPlainText
); }
261 void setInPlainText(bool v
) { setBit(InPlainText
, v
); }
262 bool inProcessingInstruction() const { return testBit(InProcessingInstruction
); }
263 void setInProcessingInstruction(bool v
) { return setBit(InProcessingInstruction
, v
); }
264 bool inComment() const { return testBit(InComment
); }
265 void setInComment(bool v
) { setBit(InComment
, v
); }
266 bool inDoctype() const { return testBit(InDoctype
); }
267 void setInDoctype(bool v
) { setBit(InDoctype
, v
); }
268 bool inTextArea() const { return testBit(InTextArea
); }
269 void setInTextArea(bool v
) { setBit(InTextArea
, v
); }
270 bool escaped() const { return testBit(Escaped
); }
271 void setEscaped(bool v
) { setBit(Escaped
, v
); }
272 bool inServer() const { return testBit(InServer
); }
273 void setInServer(bool v
) { setBit(InServer
, v
); }
274 bool skipLF() const { return testBit(SkipLF
); }
275 void setSkipLF(bool v
) { setBit(SkipLF
, v
); }
276 bool startTag() const { return testBit(StartTag
); }
277 void setStartTag(bool v
) { setBit(StartTag
, v
); }
278 bool discardLF() const { return testBit(DiscardLF
); }
279 void setDiscardLF(bool v
) { setBit(DiscardLF
, v
); }
280 bool allowYield() const { return testBit(AllowYield
); }
281 void setAllowYield(bool v
) { setBit(AllowYield
, v
); }
282 bool loadingExtScript() const { return testBit(LoadingExtScript
); }
283 void setLoadingExtScript(bool v
) { setBit(LoadingExtScript
, v
); }
284 bool forceSynchronous() const { return testBit(ForceSynchronous
); }
285 void setForceSynchronous(bool v
) { setBit(ForceSynchronous
, v
); }
287 bool inAnySpecial() const { return m_bits
& (InScript
| InStyle
| InXmp
| InTextArea
| InTitle
| InIFrame
); }
288 bool hasTagState() const { return m_bits
& TagMask
; }
289 bool hasEntityState() const { return m_bits
& EntityMask
; }
291 bool needsSpecialWriteHandling() const { return m_bits
& (InScript
| InStyle
| InXmp
| InTextArea
| InTitle
| InIFrame
| TagMask
| EntityMask
| InPlainText
| InComment
| InDoctype
| InServer
| InProcessingInstruction
| StartTag
); }
294 static const int EntityShift
= 4;
296 TagMask
= (1 << 4) - 1,
297 EntityMask
= (1 << 7) - (1 << 4),
303 InPlainText
= 1 << 12,
304 InProcessingInstruction
= 1 << 13,
306 InTextArea
= 1 << 15,
311 DiscardLF
= 1 << 20, // FIXME: should clarify difference between skip and discard
312 AllowYield
= 1 << 21,
313 LoadingExtScript
= 1 << 22,
314 ForceSynchronous
= 1 << 23,
319 void setBit(StateBits bit
, bool value
)
326 bool testBit(StateBits bit
) const { return m_bits
& bit
; }
333 DoctypeToken m_doctypeToken
;
334 int m_doctypeSearchCount
;
335 int m_doctypeSecondarySearchCount
;
339 // Name of an attribute that we just scanned.
340 AtomicString attrName
;
342 // Used to store the code of a scripting sequence
344 // Size of the script sequence stored in @ref #scriptCode
346 // Maximal size that can be stored in @ref #scriptCode
347 int scriptCodeMaxSize
;
348 // resync point of script code size
349 int scriptCodeResync
;
351 // Stores characters if we are scanning for a string like "</script>"
352 UChar searchBuffer
[10];
354 // Counts where we are in the string we are scanning for
356 // the stopper string
357 const char* searchStopper
;
359 int searchStopperLen
;
361 // if no more data is coming, just parse what we have (including ext scripts that
362 // may be still downloading) and finish
364 // URL to get source code of script from
366 String scriptSrcCharset
;
367 // the HTML code we will parse after the external script we are waiting for has loaded
368 SegmentedString pendingSrc
;
370 // the HTML code we will parse after this particular script has
371 // loaded, but before all pending HTML
372 SegmentedString
*currentPrependingSrc
;
374 // true if we are executing a script while parsing a document. This causes the parsing of
375 // the output of the script to be postponed until after the script has finished executing
376 int m_executingScript
;
377 Deque
<CachedResourceHandle
<CachedScript
> > pendingScripts
;
378 RefPtr
<Node
> scriptNode
;
380 bool m_requestingScript
;
381 bool m_hasScriptsWaitingForStylesheets
;
383 // if we found one broken comment, there are most likely others as well
384 // store a flag to get rid of the O(n^2) behaviour in such a case.
386 // current line number
388 // line number at which the current <script> started
389 int scriptStartLineno
;
392 double m_tokenizerTimeDelay
;
393 int m_tokenizerChunkSize
;
395 // The timer for continued processing.
396 Timer
<HTMLTokenizer
> m_timer
;
398 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
399 // So any fixed number might be too small, but rather than rewriting all usage of this buffer
400 // we'll just make it large enough to handle all imaginable cases.
402 UChar cBuffer
[CBUFLEN
+ 2];
403 unsigned int m_cBufferPos
;
411 OwnPtr
<PreloadScanner
> m_preloadScanner
;
414 void parseHTMLDocumentFragment(const String
&, DocumentFragment
*);
416 UChar
decodeNamedEntity(const char*);
418 } // namespace WebCore
420 #endif // HTMLTokenizer_h