// 2008-11-04 Anders Carlsson <andersca@apple.com>
// [webkit/qt.git] / WebCore / html / HTMLTokenizer.h
// blob 0d175dbf125e25cc7f3498dcf25efac0663c07bc
/*
    Copyright (C) 1997 Martin Jones (mjones@kde.org)
              (C) 1997 Torben Weis (weis@kde.org)
              (C) 1998 Waldo Bastian (bastian@kde.org)
              (C) 2001 Dirk Mueller (mueller@kde.org)
    Copyright (C) 2003, 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/
24 #ifndef HTMLTokenizer_h
25 #define HTMLTokenizer_h
27 #include "CachedResourceClient.h"
28 #include "CachedResourceHandle.h"
29 #include "NamedMappedAttrMap.h"
30 #include "SegmentedString.h"
31 #include "Timer.h"
32 #include "Tokenizer.h"
33 #include <wtf/Deque.h>
34 #include <wtf/OwnPtr.h>
35 #include <wtf/Vector.h>
37 namespace WebCore {
39 class CachedScript;
40 class DocumentFragment;
41 class Document;
42 class HTMLDocument;
43 class HTMLViewSourceDocument;
44 class FrameView;
45 class HTMLParser;
46 class Node;
47 class PreloadScanner;
49 /**
50 * @internal
51 * represents one HTML tag. Consists of a numerical id, and the list
52 * of attributes. Can also represent text. In this case the id = 0 and
53 * text contains the text.
55 class Token {
56 public:
57 Token() : beginTag(true), flat(false), brokenXMLStyle(false), m_sourceInfo(0) { }
58 ~Token() { }
60 void addAttribute(Document*, AtomicString& attrName, const AtomicString& v, bool viewSourceMode);
62 bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; }
63 bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; }
65 void reset()
67 attrs = 0;
68 text = 0;
69 tagName = nullAtom;
70 beginTag = true;
71 flat = false;
72 brokenXMLStyle = false;
73 if (m_sourceInfo)
74 m_sourceInfo->clear();
77 void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); }
79 RefPtr<NamedMappedAttrMap> attrs;
80 RefPtr<StringImpl> text;
81 AtomicString tagName;
82 bool beginTag;
83 bool flat;
84 bool brokenXMLStyle;
85 OwnPtr<Vector<UChar> > m_sourceInfo;
// Positions within a <!DOCTYPE ...> declaration, stored in
// DoctypeToken::m_state. Enumerators take the default consecutive values
// starting at 0 (DoctypeBegin).
enum DoctypeState {
    DoctypeBegin,
    DoctypeBeforeName,
    DoctypeName,
    DoctypeAfterName,
    DoctypeBeforePublicID,
    DoctypePublicID,
    DoctypeAfterPublicID,
    DoctypeBeforeSystemID,
    DoctypeSystemID,
    DoctypeAfterSystemID,
    DoctypeBogus
};

102 class DoctypeToken {
103 public:
104 DoctypeToken() {}
106 void reset()
108 m_name.clear();
109 m_publicID.clear();
110 m_systemID.clear();
111 m_state = DoctypeBegin;
112 m_source.clear();
115 DoctypeState state() { return m_state; }
116 void setState(DoctypeState s) { m_state = s; }
118 Vector<UChar> m_name;
119 Vector<UChar> m_publicID;
120 Vector<UChar> m_systemID;
121 DoctypeState m_state;
123 Vector<UChar> m_source;
126 //-----------------------------------------------------------------------------
128 class HTMLTokenizer : public Tokenizer, public CachedResourceClient {
129 public:
130 HTMLTokenizer(HTMLDocument*, bool reportErrors);
131 HTMLTokenizer(HTMLViewSourceDocument*);
132 HTMLTokenizer(DocumentFragment*);
133 virtual ~HTMLTokenizer();
135 virtual bool write(const SegmentedString&, bool appendData);
136 virtual void finish();
137 virtual void setForceSynchronous(bool force);
138 virtual bool isWaitingForScripts() const;
139 virtual void stopParsing();
140 virtual bool processingData() const;
141 virtual int executingScript() const { return m_executingScript; }
143 virtual int lineNumber() const { return m_lineNumber; }
144 virtual int columnNumber() const { return 1; }
146 bool processingContentWrittenByScript() const { return src.excludeLineNumbers(); }
148 virtual void executeScriptsWaitingForStylesheets();
150 virtual bool isHTMLTokenizer() const { return true; }
151 HTMLParser* htmlParser() const { return parser; }
153 private:
154 class State;
156 // Where we are in parsing a tag
157 void begin();
158 void end();
160 void reset();
162 PassRefPtr<Node> processToken();
163 void processDoctypeToken();
165 State processListing(SegmentedString, State);
166 State parseComment(SegmentedString&, State);
167 State parseDoctype(SegmentedString&, State);
168 State parseServer(SegmentedString&, State);
169 State parseText(SegmentedString&, State);
170 State parseSpecial(SegmentedString&, State);
171 State parseTag(SegmentedString&, State);
172 State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& _cBufferPos, bool start, bool parsingTag);
173 State parseProcessingInstruction(SegmentedString&, State);
174 State scriptHandler(State);
175 State scriptExecution(const String& script, State, const String& scriptURL, int baseLine = 1);
176 void setSrc(const SegmentedString&);
178 // check if we have enough space in the buffer.
179 // if not enlarge it
180 inline void checkBuffer(int len = 10)
182 if ((dest - buffer) > size - len)
183 enlargeBuffer(len);
186 inline void checkScriptBuffer(int len = 10)
188 if (scriptCodeSize + len >= scriptCodeMaxSize)
189 enlargeScriptBuffer(len);
192 void enlargeBuffer(int len);
193 void enlargeScriptBuffer(int len);
195 bool continueProcessing(int& processedCount, double startTime, State&);
196 void timerFired(Timer<HTMLTokenizer>*);
197 void allDataProcessed();
199 // from CachedResourceClient
200 void notifyFinished(CachedResource *finishedObj);
202 // Internal buffers
203 ///////////////////
204 UChar* buffer;
205 UChar* dest;
207 Token currToken;
209 // the size of buffer
210 int size;
212 // Tokenizer flags
213 //////////////////
214 // are we in quotes within a html tag
215 enum { NoQuote, SingleQuote, DoubleQuote } tquote;
217 // Are we in a &... character entity description?
218 enum EntityState {
219 NoEntity = 0,
220 SearchEntity = 1,
221 NumericSearch = 2,
222 Hexadecimal = 3,
223 Decimal = 4,
224 EntityName = 5,
225 SearchSemicolon = 6
227 unsigned EntityUnicodeValue;
229 enum TagState {
230 NoTag = 0,
231 TagName = 1,
232 SearchAttribute = 2,
233 AttributeName = 3,
234 SearchEqual = 4,
235 SearchValue = 5,
236 QuotedValue = 6,
237 Value = 7,
238 SearchEnd = 8
241 class State {
242 public:
243 State() : m_bits(0) { }
245 TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
246 void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
247 EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
248 void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }
250 bool inScript() const { return testBit(InScript); }
251 void setInScript(bool v) { setBit(InScript, v); }
252 bool inStyle() const { return testBit(InStyle); }
253 void setInStyle(bool v) { setBit(InStyle, v); }
254 bool inXmp() const { return testBit(InXmp); }
255 void setInXmp(bool v) { setBit(InXmp, v); }
256 bool inTitle() const { return testBit(InTitle); }
257 void setInTitle(bool v) { setBit(InTitle, v); }
258 bool inIFrame() const { return testBit(InIFrame); }
259 void setInIFrame(bool v) { setBit(InIFrame, v); }
260 bool inPlainText() const { return testBit(InPlainText); }
261 void setInPlainText(bool v) { setBit(InPlainText, v); }
262 bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
263 void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); }
264 bool inComment() const { return testBit(InComment); }
265 void setInComment(bool v) { setBit(InComment, v); }
266 bool inDoctype() const { return testBit(InDoctype); }
267 void setInDoctype(bool v) { setBit(InDoctype, v); }
268 bool inTextArea() const { return testBit(InTextArea); }
269 void setInTextArea(bool v) { setBit(InTextArea, v); }
270 bool escaped() const { return testBit(Escaped); }
271 void setEscaped(bool v) { setBit(Escaped, v); }
272 bool inServer() const { return testBit(InServer); }
273 void setInServer(bool v) { setBit(InServer, v); }
274 bool skipLF() const { return testBit(SkipLF); }
275 void setSkipLF(bool v) { setBit(SkipLF, v); }
276 bool startTag() const { return testBit(StartTag); }
277 void setStartTag(bool v) { setBit(StartTag, v); }
278 bool discardLF() const { return testBit(DiscardLF); }
279 void setDiscardLF(bool v) { setBit(DiscardLF, v); }
280 bool allowYield() const { return testBit(AllowYield); }
281 void setAllowYield(bool v) { setBit(AllowYield, v); }
282 bool loadingExtScript() const { return testBit(LoadingExtScript); }
283 void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
284 bool forceSynchronous() const { return testBit(ForceSynchronous); }
285 void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }
287 bool inAnySpecial() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); }
288 bool hasTagState() const { return m_bits & TagMask; }
289 bool hasEntityState() const { return m_bits & EntityMask; }
291 bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); }
293 private:
294 static const int EntityShift = 4;
295 enum StateBits {
296 TagMask = (1 << 4) - 1,
297 EntityMask = (1 << 7) - (1 << 4),
298 InScript = 1 << 7,
299 InStyle = 1 << 8,
300 // Bit 9 unused
301 InXmp = 1 << 10,
302 InTitle = 1 << 11,
303 InPlainText = 1 << 12,
304 InProcessingInstruction = 1 << 13,
305 InComment = 1 << 14,
306 InTextArea = 1 << 15,
307 Escaped = 1 << 16,
308 InServer = 1 << 17,
309 SkipLF = 1 << 18,
310 StartTag = 1 << 19,
311 DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
312 AllowYield = 1 << 21,
313 LoadingExtScript = 1 << 22,
314 ForceSynchronous = 1 << 23,
315 InIFrame = 1 << 24,
316 InDoctype = 1 << 25
319 void setBit(StateBits bit, bool value)
321 if (value)
322 m_bits |= bit;
323 else
324 m_bits &= ~bit;
326 bool testBit(StateBits bit) const { return m_bits & bit; }
328 unsigned m_bits;
331 State m_state;
333 DoctypeToken m_doctypeToken;
334 int m_doctypeSearchCount;
335 int m_doctypeSecondarySearchCount;
337 bool brokenServer;
339 // Name of an attribute that we just scanned.
340 AtomicString attrName;
342 // Used to store the code of a scripting sequence
343 UChar* scriptCode;
344 // Size of the script sequenze stored in @ref #scriptCode
345 int scriptCodeSize;
346 // Maximal size that can be stored in @ref #scriptCode
347 int scriptCodeMaxSize;
348 // resync point of script code size
349 int scriptCodeResync;
351 // Stores characters if we are scanning for a string like "</script>"
352 UChar searchBuffer[10];
354 // Counts where we are in the string we are scanning for
355 int searchCount;
356 // the stopper string
357 const char* searchStopper;
358 // the stopper len
359 int searchStopperLen;
361 // if no more data is coming, just parse what we have (including ext scripts that
362 // may be still downloading) and finish
363 bool noMoreData;
364 // URL to get source code of script from
365 String scriptSrc;
366 String scriptSrcCharset;
367 // the HTML code we will parse after the external script we are waiting for has loaded
368 SegmentedString pendingSrc;
370 // the HTML code we will parse after this particular script has
371 // loaded, but before all pending HTML
372 SegmentedString *currentPrependingSrc;
374 // true if we are executing a script while parsing a document. This causes the parsing of
375 // the output of the script to be postponed until after the script has finished executing
376 int m_executingScript;
377 Deque<CachedResourceHandle<CachedScript> > pendingScripts;
378 RefPtr<Node> scriptNode;
380 bool m_requestingScript;
381 bool m_hasScriptsWaitingForStylesheets;
383 // if we found one broken comment, there are most likely others as well
384 // store a flag to get rid of the O(n^2) behaviour in such a case.
385 bool brokenComments;
386 // current line number
387 int m_lineNumber;
388 // line number at which the current <script> started
389 int scriptStartLineno;
390 int tagStartLineno;
392 double m_tokenizerTimeDelay;
393 int m_tokenizerChunkSize;
395 // The timer for continued processing.
396 Timer<HTMLTokenizer> m_timer;
398 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
399 // So any fixed number might be too small, but rather than rewriting all usage of this buffer
400 // we'll just make it large enough to handle all imaginable cases.
401 #define CBUFLEN 1024
402 UChar cBuffer[CBUFLEN + 2];
403 unsigned int m_cBufferPos;
405 SegmentedString src;
406 Document* m_doc;
407 HTMLParser* parser;
408 bool inWrite;
409 bool m_fragment;
411 OwnPtr<PreloadScanner> m_preloadScanner;
414 void parseHTMLDocumentFragment(const String&, DocumentFragment*);
416 UChar decodeNamedEntity(const char*);
418 } // namespace WebCore
420 #endif // HTMLTokenizer_h