Full parsing of doctype by the HTML Tokenizer
[kdelibs.git] / khtml / html / htmltokenizer.cpp
blob97a706dd194f1e57d1fe423cda3b7352ebd3f833
1 /*
2 This file is part of the KDE libraries
4 Copyright (C) 1997 Martin Jones (mjones@kde.org)
5 (C) 1997 Torben Weis (weis@kde.org)
6 (C) 1998 Waldo Bastian (bastian@kde.org)
7 (C) 1999 Lars Knoll (knoll@kde.org)
8 (C) 1999 Antti Koivisto (koivisto@kde.org)
9 (C) 2001-2003 Dirk Mueller (mueller@kde.org)
10 (C) 2004 Apple Computer, Inc.
11 (C) 2006 Germain Garand (germain@ebooksfrance.org)
13 This library is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Library General Public
15 License as published by the Free Software Foundation; either
16 version 2 of the License, or (at your option) any later version.
18 This library is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Library General Public License for more details.
23 You should have received a copy of the GNU Library General Public License
24 along with this library; see the file COPYING.LIB. If not, write to
25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 Boston, MA 02110-1301, USA.
28 //----------------------------------------------------------------------------
30 // KDE HTML Widget - Tokenizers
32 // #define TOKEN_DEBUG 1
33 //#define TOKEN_DEBUG 2
35 #include "htmltokenizer.h"
36 #include "html_documentimpl.h"
37 #include "htmlparser.h"
38 #include "dtd.h"
40 #include <misc/loader.h>
41 #include <misc/htmlhashes.h>
43 #include <khtmlview.h>
44 #include <khtml_part.h>
45 #include <xml/dom_docimpl.h>
46 #include <css/csshelper.h>
47 #include <ecma/kjs_proxy.h>
48 #include <kcharsets.h>
49 #include <kglobal.h>
50 #include <ctype.h>
51 #include <assert.h>
52 #include <QtCore/QVariant>
53 #include <kdebug.h>
54 #include <stdlib.h>
56 #include <config.h>
58 #include "kentities.c"
59 #include "htmlprospectivetokenizer.h"
61 #define PROSPECTIVE_TOKENIZER_ENABLED 1
63 using namespace khtml;
65 static const QChar commentStart [] = { '<','!','-','-', QChar::Null };
// Lower-case keywords recognised while scanning a doctype declaration.
static const char doctypeStart[] = "<!doctype";
static const char publicStart[]  = "public";
static const char systemStart[]  = "system";

// Lower-case prefixes of the closing tags of the raw-text elements.
static const char scriptEnd[]   = "</script";
static const char xmpEnd[]      = "</xmp";
static const char styleEnd[]    = "</style";
static const char textareaEnd[] = "</textarea";
static const char titleEnd[]    = "</title";
// Raw QChar vector helpers built on the C allocator. Storage obtained from
// the ALLOC/REALLOC macros must be released with KHTML_DELETE_QCHAR_VEC;
// no QChar constructors or destructors are run.
#define KHTML_ALLOC_QCHAR_VEC(N) \
    static_cast<QChar*>(malloc(sizeof(QChar) * (N)))
#define KHTML_REALLOC_QCHAR_VEC(P, N) \
    static_cast<QChar*>(realloc(P, sizeof(QChar) * (N)))
#define KHTML_DELETE_QCHAR_VEC(P) \
    free((char*)(P))
80 // Full support for MS Windows extensions to Latin-1.
81 // Technically these extensions should only be activated for pages
82 // marked "windows-1252" or "cp1252", but
83 // in the standard Microsoft way, these extensions infect hundreds of thousands
84 // of web pages. Note that people with non-latin-1 Microsoft extensions
85 // are SOL.
87 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
88 // http://www.bbsinc.com/iso8859.html
89 // http://www.obviously.com/
91 // There may be better equivalents
#if 0
#define fixUpChar(x)
#else
// Rewrites, in place, a character whose code point lies in the 0x80-0x9F
// range to the Unicode character that Windows-1252 assigns to that byte.
// Code points without a mapping here (0x81, 0x8D, 0x8F, 0x90, 0x9D and
// everything outside the range) are left untouched.
// The argument must offer unicode() and accept assignment from an integer;
// it is evaluated more than once, so it must have no side effects.
#define fixUpChar(x)                      \
    switch ((x).unicode()) {              \
    case 0x80: (x) = 0x20AC; break;       \
    case 0x82: (x) = 0x201A; break;       \
    case 0x83: (x) = 0x0192; break;       \
    case 0x84: (x) = 0x201E; break;       \
    case 0x85: (x) = 0x2026; break;       \
    case 0x86: (x) = 0x2020; break;       \
    case 0x87: (x) = 0x2021; break;       \
    case 0x88: (x) = 0x02C6; break;       \
    case 0x89: (x) = 0x2030; break;       \
    case 0x8A: (x) = 0x0160; break;       \
    case 0x8B: (x) = 0x2039; break;       \
    case 0x8C: (x) = 0x0152; break;       \
    case 0x8E: (x) = 0x017D; break;       \
    case 0x91: (x) = 0x2018; break;       \
    case 0x92: (x) = 0x2019; break;       \
    case 0x93: (x) = 0x201C; break;       \
    case 0x94: (x) = 0x201D; break;       \
    case 0x95: (x) = 0x2022; break;       \
    case 0x96: (x) = 0x2013; break;       \
    case 0x97: (x) = 0x2014; break;       \
    case 0x98: (x) = 0x02DC; break;       \
    case 0x99: (x) = 0x2122; break;       \
    case 0x9A: (x) = 0x0161; break;       \
    case 0x9B: (x) = 0x203A; break;       \
    case 0x9C: (x) = 0x0153; break;       \
    case 0x9E: (x) = 0x017E; break;       \
    case 0x9F: (x) = 0x0178; break;       \
    default: break;                       \
    }
#endif
128 // ----------------------------------------------------------------------------
130 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, KHTMLView *_view)
132 view = _view;
133 buffer = 0;
134 scriptCode = 0;
135 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
136 charsets = KGlobal::charsets();
137 parser = new KHTMLParser(_view, _doc);
138 m_executingScript = 0;
139 m_autoCloseTimer = 0;
140 m_prospectiveTokenizer = 0;
141 onHold = false;
143 reset();
146 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, DOM::DocumentFragmentImpl *i)
148 view = 0;
149 buffer = 0;
150 scriptCode = 0;
151 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
152 charsets = KGlobal::charsets();
153 parser = new KHTMLParser( i, _doc );
154 m_executingScript = 0;
155 m_autoCloseTimer = 0;
156 m_prospectiveTokenizer = 0;
157 onHold = false;
159 reset();
162 void HTMLTokenizer::reset()
164 assert(m_executingScript == 0);
165 Q_ASSERT(onHold == false);
166 m_abort = false;
168 while (!cachedScript.isEmpty())
169 cachedScript.dequeue()->deref(this);
171 if ( buffer )
172 KHTML_DELETE_QCHAR_VEC(buffer);
173 buffer = dest = 0;
174 size = 0;
176 if ( scriptCode )
177 KHTML_DELETE_QCHAR_VEC(scriptCode);
178 scriptCode = 0;
179 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
181 if (m_autoCloseTimer) {
182 killTimer(m_autoCloseTimer);
183 m_autoCloseTimer = 0;
186 currToken.reset();
187 doctypeToken.reset();
190 void HTMLTokenizer::begin()
192 m_executingScript = 0;
193 onHold = false;
194 reset();
195 size = 254;
196 buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
197 dest = buffer;
198 tag = NoTag;
199 pending = NonePending;
200 discard = NoneDiscard;
201 pre = false;
202 prePos = 0;
203 plaintext = false;
204 xmp = false;
205 processingInstruction = false;
206 script = false;
207 escaped = false;
208 style = false;
209 skipLF = false;
210 select = false;
211 comment = false;
212 doctype = false;
213 doctypeComment = NoDoctypeComment;
214 doctypeAllowComment = false;
215 server = false;
216 textarea = false;
217 title = false;
218 startTag = false;
219 tquote = NoQuote;
220 searchCount = 0;
221 doctypeSearchCount = 0;
222 doctypeSecondarySearchCount = 0;
223 Entity = NoEntity;
224 noMoreData = false;
225 brokenComments = false;
226 brokenServer = false;
227 brokenScript = false;
228 lineno = 0;
229 scriptStartLineno = 0;
230 tagStartLineno = 0;
233 void HTMLTokenizer::processListing(TokenizerString list)
235 bool old_pre = pre;
237 // This function adds the listing 'list' as
238 // preformatted text-tokens to the token-collection
239 // thereby converting TABs.
240 if(!style) pre = true;
241 prePos = 0;
243 while ( !list.isEmpty() )
245 checkBuffer(3*TAB_SIZE);
247 if (skipLF && ( list->unicode() != '\n' ))
249 skipLF = false;
252 if (skipLF)
254 skipLF = false;
255 ++list;
257 else if (( list->unicode() == '\n' ) || ( list->unicode() == '\r' ))
259 if (discard == LFDiscard)
261 // Ignore this LF
262 discard = NoneDiscard; // We have discarded 1 LF
264 else
266 // Process this LF
267 if (pending)
268 addPending();
270 // we used to do it not at all and we want to have
271 // it fixed for textarea. So here we are
272 if ( textarea ) {
273 prePos++;
274 *dest++ = *list;
275 } else
276 pending = LFPending;
278 /* Check for MS-DOS CRLF sequence */
279 if (list->unicode() == '\r')
281 skipLF = true;
283 ++list;
285 else if (( list->unicode() == ' ' ) || ( list->unicode() == '\t'))
287 if (pending)
288 addPending();
289 if (*list == ' ')
290 pending = SpacePending;
291 else
292 pending = TabPending;
294 ++list;
296 else
298 discard = NoneDiscard;
299 if (pending)
300 addPending();
302 prePos++;
303 *dest++ = *list;
304 ++list;
309 if ((pending == SpacePending) || (pending == TabPending))
310 addPending();
311 else
312 pending = NonePending;
314 prePos = 0;
315 pre = old_pre;
318 void HTMLTokenizer::parseSpecial(TokenizerString &src)
320 assert( textarea || title || !Entity );
321 assert( !tag );
322 assert( xmp+textarea+title+style+script == 1 );
323 if (script)
324 scriptStartLineno = lineno+src.lineCount();
326 if ( comment ) parseComment( src );
328 while ( !src.isEmpty() ) {
329 checkScriptBuffer();
330 unsigned char ch = src->toLatin1();
331 if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && QString::fromRawData( scriptCode+scriptCodeSize-3, 3 ) == "<!-" ) {
332 comment = true;
333 scriptCode[ scriptCodeSize++ ] = ch;
334 ++src;
335 parseComment( src );
336 continue;
338 if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
339 ++src;
340 scriptCodeSize = scriptCodeResync-1;
341 scriptCodeResync = 0;
342 scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
343 if ( script )
344 scriptHandler();
345 else {
346 processListing(TokenizerString(scriptCode, scriptCodeSize));
347 processToken();
348 if ( style ) { currToken.tid = ID_STYLE + ID_CLOSE_TAG; }
349 else if ( textarea ) { currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG; }
350 else if ( title ) { currToken.tid = ID_TITLE + ID_CLOSE_TAG; }
351 else if ( xmp ) { currToken.tid = ID_XMP + ID_CLOSE_TAG; }
352 processToken();
353 script = style = textarea = title = xmp = false;
354 tquote = NoQuote;
355 scriptCodeSize = scriptCodeResync = 0;
357 return;
359 // possible end of tagname, lets check.
360 if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
361 scriptCodeSize >= searchStopperLen &&
362 !QString::fromRawData( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).indexOf( searchStopper, 0, Qt::CaseInsensitive )) {
363 scriptCodeResync = scriptCodeSize-searchStopperLen+1;
364 tquote = NoQuote;
365 continue;
367 if ( scriptCodeResync && !escaped ) {
368 if(ch == '\"')
369 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
370 else if(ch == '\'')
371 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
372 else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
373 tquote = NoQuote;
375 escaped = ( !escaped && ch == '\\' );
376 if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
377 QChar *scriptCodeDest = scriptCode+scriptCodeSize;
378 ++src;
379 parseEntity(src,scriptCodeDest,true);
380 scriptCodeSize = scriptCodeDest-scriptCode;
382 else {
383 scriptCode[ scriptCodeSize++ ] = *src;
384 ++src;
389 void HTMLTokenizer::scriptHandler()
391 QString currentScriptSrc = scriptSrc;
392 scriptSrc.clear();
394 processListing(TokenizerString(scriptCode, scriptCodeSize));
395 QString exScript( buffer, dest-buffer );
397 processToken();
398 currToken.tid = ID_SCRIPT + ID_CLOSE_TAG;
399 processToken();
401 // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts.
402 bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET);
403 bool effectiveScript = !parser->skipMode() && !followingFrameset;
404 bool deferredScript = false;
406 if ( effectiveScript ) {
407 CachedScript* cs = 0;
409 // forget what we just got, load from src url instead
410 if ( !currentScriptSrc.isEmpty() && javascript &&
411 (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) )) {
412 cachedScript.enqueue(cs);
415 if (cs) {
416 pendingQueue.push(src);
417 int scriptCount = cachedScript.count();
418 setSrc(TokenizerString());
419 scriptCodeSize = scriptCodeResync = 0;
420 cs->ref(this);
421 if (cachedScript.count() == scriptCount)
422 deferredScript = true;
424 else if (currentScriptSrc.isEmpty() && view && javascript ) {
425 pendingQueue.push(src);
426 setSrc(TokenizerString());
427 scriptCodeSize = scriptCodeResync = 0;
428 scriptExecution( exScript, QString(), tagStartLineno /*scriptStartLineno*/ );
429 } else {
430 // script was filtered or disallowed
431 effectiveScript = false;
435 script = false;
436 scriptCodeSize = scriptCodeResync = 0;
438 if ( !effectiveScript )
439 return;
441 if ( !m_executingScript && cachedScript.isEmpty() ) {
442 src.append(pendingQueue.pop());
443 } else if ( cachedScript.isEmpty() ) {
444 write( pendingQueue.pop(), false );
445 } else if ( !deferredScript && pendingQueue.count() > 1) {
446 TokenizerString t = pendingQueue.pop();
447 pendingQueue.top().prepend( t );
449 #if PROSPECTIVE_TOKENIZER_ENABLED
450 if (!cachedScript.isEmpty() && !m_executingScript) {
451 if (!m_prospectiveTokenizer)
452 m_prospectiveTokenizer = new ProspectiveTokenizer(parser->docPtr());
453 if (!m_prospectiveTokenizer->inProgress() && !pendingQueue.isEmpty()) {
454 m_prospectiveTokenizer->begin();
455 m_prospectiveTokenizer->write(pendingQueue.top());
458 #endif
462 void HTMLTokenizer::scriptExecution( const QString& str, const QString& scriptURL,
463 int baseLine)
465 bool oldscript = script;
466 m_executingScript++;
467 script = false;
468 QString url;
469 if (scriptURL.isNull() && view)
470 url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL().url();
471 else
472 url = scriptURL;
474 if (view)
475 view->part()->executeScript(url,baseLine+1,Node(),str);
476 m_executingScript--;
477 script = oldscript;
480 void HTMLTokenizer::parseComment(TokenizerString &src)
482 // SGML strict
483 bool strict = parser->doc()->inStrictMode() && parser->doc()->htmlMode() != DocumentImpl::XHtml && !script && !style;
484 int delimiterCount = 0;
485 bool canClose = false;
487 checkScriptBuffer(src.length());
488 while ( src.length() ) {
489 scriptCode[ scriptCodeSize++ ] = *src;
491 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
492 qDebug("comment is now: *%s*", src.toString().left(16).toLatin1().constData());
493 #endif
495 if (strict)
497 if (src->unicode() == '-') {
498 delimiterCount++;
499 if (delimiterCount == 2) {
500 delimiterCount = 0;
501 canClose = !canClose;
504 else
505 delimiterCount = 0;
508 if ((!strict || canClose) && src->unicode() == '>')
510 bool handleBrokenComments = brokenComments && !( script || style );
511 bool scriptEnd=false;
512 if (!strict)
514 if ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
515 scriptCode[scriptCodeSize-2] == '-' )
516 scriptEnd=true;
519 if (canClose || handleBrokenComments || scriptEnd ){
520 ++src;
521 if ( !( title || script || xmp || textarea || style) ) {
522 checkScriptBuffer();
523 scriptCode[ scriptCodeSize ] = 0;
524 scriptCode[ scriptCodeSize + 1 ] = 0;
525 currToken.tid = ID_COMMENT;
526 processListing(TokenizerString(scriptCode, scriptCodeSize - 2));
527 processToken();
528 currToken.tid = ID_COMMENT + ID_CLOSE_TAG;
529 processToken();
530 scriptCodeSize = 0;
532 comment = false;
533 return; // Finished parsing comment
536 ++src;
540 void HTMLTokenizer::parseDoctypeComment(TokenizerString &src)
542 while (!src.isEmpty()) {
543 QChar c = *src;
544 switch (doctypeComment) {
545 case DoctypeCommentHalfBegin: {
546 if (c != '-') {
547 // Ooops, it's not comment
548 doctypeComment = DoctypeCommentBogus;
549 return;
550 } else {
551 // Doctype comment begins
552 doctypeComment = DoctypeComment;
553 ++src;
555 break;
557 case DoctypeComment: {
558 if (c == '-') {
559 // Perhaps this is end of comment
560 doctypeComment = DoctypeCommentHalfEnd;
561 ++src;
562 } else {
563 // Keep scanning for '--'
564 ++src;
566 break;
568 case DoctypeCommentHalfEnd: {
569 if (c == '-') {
570 // Doctype comment ends
571 doctypeComment = DoctypeCommentEnd;
572 return;
573 } else {
574 // It's not '--'
575 ++src;
576 doctypeComment = DoctypeComment;
578 break;
580 default: {
581 assert(!"Undefined doctype comment state");
582 break;
588 void HTMLTokenizer::parseDoctype(TokenizerString &src)
590 while (!src.isEmpty() && doctype) {
591 QChar c;
592 bool isWhitespace = false;
593 int dontAdvance = 0;
594 if (doctypeComment == DoctypeCommentEnd) {
595 doctypeComment = NoDoctypeComment;
596 isWhitespace = true;
597 } else if (doctypeComment == DoctypeCommentBogus) {
598 doctypeComment = NoDoctypeComment;
599 c = '-';
600 dontAdvance++;
601 } else {
602 c = *src;
603 if (doctypeAllowComment) {
604 if (!doctypeComment && c == '-') {
605 doctypeComment = DoctypeCommentHalfBegin;
606 ++src;
608 if (doctypeComment) {
609 parseDoctypeComment(src);
610 continue;
612 isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
616 switch (doctypeToken.state) {
617 case DoctypeBegin: {
618 doctypeToken.state = DoctypeBeforeName;
619 if (isWhitespace) {
620 // nothing
622 break;
624 case DoctypeBeforeName: {
625 if (c == '>') {
626 // Malformed. Just exit.
627 doctype = false;
628 } else if (isWhitespace) {
629 // nothing
630 } else {
631 dontAdvance++;
632 doctypeToken.state = DoctypeName;
634 break;
636 case DoctypeName: {
637 if (c == '>') {
638 // Valid doctype. Emit it.
639 doctype = false;
640 processDoctypeToken();
641 } else if (isWhitespace) {
642 doctypeSearchCount = 0; // Used now to scan for PUBLIC
643 doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
644 doctypeToken.state = DoctypeAfterName;
645 } else {
646 doctypeToken.name.append(c);
648 break;
650 case DoctypeAfterName: {
651 if (c == '>') {
652 // Valid doctype. Emit it.
653 doctype = false;
654 processDoctypeToken();
655 } else if (c == '[') {
656 if(doctypeSearchCount > 0 || doctypeSecondarySearchCount > 0) { // is there any public/system indicator before?
657 doctypeSearchCount = doctypeSecondarySearchCount = 0;
658 doctypeToken.state = DoctypeBogus;
660 // Found internal subset
661 doctypeToken.state = DoctypeInternalSubset;
662 doctypeAllowComment = false;
663 } else if (!isWhitespace) {
664 if (c.toLower() == publicStart[doctypeSearchCount]) {
665 doctypeSearchCount++;
666 if(doctypeSearchCount == 6)
667 // Found 'PUBLIC' sequence
668 doctypeToken.state = DoctypeBeforePublicID;
669 } else if (doctypeSearchCount > 0) {
670 doctypeSearchCount = 0;
671 doctypeToken.state = DoctypeBogus;
672 } else if (c.toLower() == systemStart[doctypeSecondarySearchCount]) {
673 doctypeSecondarySearchCount++;
674 if(doctypeSecondarySearchCount == 6)
675 // Found 'SYSTEM' sequence
676 doctypeToken.state = DoctypeBeforeSystemID;
677 } else {
678 doctypeSecondarySearchCount = 0;
679 doctypeToken.state = DoctypeBogus;
681 } else {
682 // Whitespace keeps us in the after name state
684 break;
686 case DoctypeBeforePublicID: {
687 if (c == '\"' || c == '\'') {
688 tquote = c == '\"' ? DoubleQuote : SingleQuote;
689 doctypeToken.state = DoctypePublicID;
690 doctypeAllowComment = false;
691 } else if (c == '>') {
692 // Considered bogus. Don't process the doctype.
693 doctype = false;
694 } else if (isWhitespace) {
695 // nothing
696 } else
697 doctypeToken.state = DoctypeBogus;
698 break;
700 case DoctypePublicID: {
701 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
702 doctypeToken.state = DoctypeAfterPublicID;
703 doctypeAllowComment = true;
704 } else if (c == '>') {
705 // Considered bogus. Don't process the doctype.
706 doctype = false;
707 } else {
708 doctypeToken.publicID.append(c);
710 break;
712 case DoctypeAfterPublicID: {
713 if (c == '\"' || c == '\'') {
714 tquote = c == '\"' ? DoubleQuote : SingleQuote;
715 doctypeToken.state = DoctypeSystemID;
716 } else if (c == '>') {
717 // Valid doctype. Emit it now.
718 doctype = false;
719 processDoctypeToken();
720 } else if (isWhitespace) {
721 // nothing
722 } else if (c == '[') {
723 // Found internal subset
724 doctypeToken.state = DoctypeInternalSubset;
725 doctypeAllowComment = false;
726 } else
727 doctypeToken.state = DoctypeBogus;
728 break;
730 case DoctypeBeforeSystemID: {
731 if (c == '\"' || c == '\'') {
732 tquote = c == '\"' ? DoubleQuote : SingleQuote;
733 doctypeToken.state = DoctypeSystemID;
734 doctypeAllowComment = false;
735 } else if (c == '>') {
736 // Considered bogus. Don't process the doctype.
737 doctype = false;
738 } else if (isWhitespace) {
739 // nothing
740 } else
741 doctypeToken.state = DoctypeBogus;
742 break;
744 case DoctypeSystemID: {
745 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
746 doctypeToken.state = DoctypeAfterSystemID;
747 doctypeAllowComment = true;
748 } else if (c == '>') {
749 // Considered bogus. Don't process the doctype.
750 doctype = false;
751 } else {
752 doctypeToken.systemID.append(c);
754 break;
756 case DoctypeAfterSystemID: {
757 if (c == '>') {
758 // Valid doctype. Emit it now.
759 doctype = false;
760 processDoctypeToken();
761 } else if (isWhitespace) {
762 // nothing
763 } else if (c == '[') {
764 // Found internal subset
765 doctypeToken.state = DoctypeInternalSubset;
766 doctypeAllowComment = false;
767 } else {
768 doctypeToken.state = DoctypeBogus;
770 break;
772 case DoctypeInternalSubset: {
773 if(c == ']') {
774 // Done
775 doctypeToken.state = DoctypeAfterInternalSubset;
776 doctypeAllowComment = true;
777 } else {
778 doctypeToken.internalSubset.append(c);
780 break;
782 case DoctypeAfterInternalSubset: {
783 if (c == '>') {
784 // Valid doctype. Emit it now.
785 doctype = false;
786 processDoctypeToken();
787 } else if (isWhitespace) {
788 // nothing
789 } else
790 doctypeToken.state = DoctypeBogus;
791 break;
793 case DoctypeBogus: {
794 if (c == '>') {
795 // Done with the bogus doctype.
796 doctype = false;
797 } else {
798 // Just keep scanning for '>'
800 break;
802 default:
803 break;
805 if (!dontAdvance)
806 ++src;
807 else if (dontAdvance == 1)
808 continue;
809 else // double dontAdvance++, do workaround
810 doctypeComment = DoctypeCommentBogus;
814 void HTMLTokenizer::parseServer(TokenizerString &src)
816 checkScriptBuffer(src.length());
817 while ( !src.isEmpty() ) {
818 scriptCode[ scriptCodeSize++ ] = *src;
819 if (src->unicode() == '>' &&
820 scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
821 ++src;
822 server = false;
823 scriptCodeSize = 0;
824 return; // Finished parsing server include
826 ++src;
830 void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
832 char oldchar = 0;
833 while ( !src.isEmpty() )
835 unsigned char chbegin = src->toLatin1();
836 if(chbegin == '\'') {
837 tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
839 else if(chbegin == '\"') {
840 tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
842 // Look for '?>'
843 // some crappy sites omit the "?" before it, so
844 // we look for an unquoted '>' instead. (IE compatible)
845 else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
847 // We got a '?>' sequence
848 processingInstruction = false;
849 ++src;
850 discard=LFDiscard;
851 return; // Finished parsing comment!
853 ++src;
854 oldchar = chbegin;
858 void HTMLTokenizer::parseText(TokenizerString &src)
860 while ( !src.isEmpty() )
862 // do we need to enlarge the buffer?
863 checkBuffer();
865 // ascii is okay because we only do ascii comparisons
866 unsigned char chbegin = src->toLatin1();
868 if (skipLF && ( chbegin != '\n' ))
870 skipLF = false;
873 if (skipLF)
875 skipLF = false;
876 ++src;
878 else if (( chbegin == '\n' ) || ( chbegin == '\r' ))
880 if (chbegin == '\r')
881 skipLF = true;
883 *dest++ = '\n';
884 ++src;
886 else {
887 *dest++ = *src;
888 ++src;
894 void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
896 if( start )
898 cBufferPos = 0;
899 entityLen = 0;
900 Entity = SearchEntity;
903 while( !src.isEmpty() )
905 ushort cc = src->unicode();
906 switch(Entity) {
907 case NoEntity:
908 return;
910 break;
911 case SearchEntity:
912 if(cc == '#') {
913 cBuffer[cBufferPos++] = cc;
914 ++src;
915 Entity = NumericSearch;
917 else
918 Entity = EntityName;
920 break;
922 case NumericSearch:
923 if(cc == 'x' || cc == 'X') {
924 cBuffer[cBufferPos++] = cc;
925 ++src;
926 Entity = Hexadecimal;
928 else if(cc >= '0' && cc <= '9')
929 Entity = Decimal;
930 else
931 Entity = SearchSemicolon;
933 break;
935 case Hexadecimal:
937 int uc = EntityChar.unicode();
938 int ll = qMin<uint>(src.length(), 8);
939 while(ll--) {
940 QChar csrc(src->toLower());
941 cc = csrc.cell();
943 if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
944 break;
946 uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
947 cBuffer[cBufferPos++] = cc;
948 ++src;
950 EntityChar = QChar(uc);
951 Entity = SearchSemicolon;
952 break;
954 case Decimal:
956 int uc = EntityChar.unicode();
957 int ll = qMin(src.length(), 9-cBufferPos);
958 while(ll--) {
959 cc = src->cell();
961 if(src->row() || !(cc >= '0' && cc <= '9')) {
962 Entity = SearchSemicolon;
963 break;
966 uc = uc * 10 + (cc - '0');
967 cBuffer[cBufferPos++] = cc;
968 ++src;
970 EntityChar = QChar(uc);
971 if(cBufferPos == 9) Entity = SearchSemicolon;
972 break;
974 case EntityName:
976 int ll = qMin(src.length(), 9-cBufferPos);
977 while(ll--) {
978 QChar csrc = *src;
979 cc = csrc.cell();
981 if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
982 (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
983 Entity = SearchSemicolon;
984 break;
987 cBuffer[cBufferPos++] = cc;
988 ++src;
990 // be IE compatible and interpret even unterminated entities
991 // outside tags. like "foo &nbspstuff bla".
992 if ( tag == NoTag ) {
993 const entity* e = kde_findEntity(cBuffer, cBufferPos);
994 if ( e && e->code < 256 ) {
995 EntityChar = e->code;
996 entityLen = cBufferPos;
1000 if(cBufferPos == 9) Entity = SearchSemicolon;
1001 if(Entity == SearchSemicolon) {
1002 if(cBufferPos > 1) {
1003 const entity *e = kde_findEntity(cBuffer, cBufferPos);
1004 // IE only accepts unterminated entities < 256,
1005 // Gecko accepts them all, but only outside tags
1006 if(e && ( tag == NoTag || e->code < 256 || *src == ';' )) {
1007 EntityChar = e->code;
1008 entityLen = cBufferPos;
1012 break;
1014 case SearchSemicolon:
1015 #ifdef TOKEN_DEBUG
1016 kDebug( 6036 ) << "ENTITY " << EntityChar.unicode();
1017 #endif
1018 fixUpChar(EntityChar);
1020 if (*src == ';')
1021 ++src;
1023 if ( !EntityChar.isNull() ) {
1024 checkBuffer();
1025 if (entityLen > 0 && entityLen < cBufferPos) {
1026 int rem = cBufferPos - entityLen;
1027 src.prepend( TokenizerString(QString::fromAscii(cBuffer+entityLen, rem)) );
1029 src.push( EntityChar );
1030 } else {
1031 #ifdef TOKEN_DEBUG
1032 kDebug( 6036 ) << "unknown entity!";
1033 #endif
1034 checkBuffer(10);
1035 // ignore the sequence, add it to the buffer as plaintext
1036 *dest++ = '&';
1037 for(unsigned int i = 0; i < cBufferPos; i++)
1038 dest[i] = cBuffer[i];
1039 dest += cBufferPos;
1040 if (pre)
1041 prePos += cBufferPos+1;
1044 Entity = NoEntity;
1045 EntityChar = QChar::Null;
1046 return;
1051 void HTMLTokenizer::parseTag(TokenizerString &src)
1053 assert(!Entity );
1054 checkScriptBuffer( src.length() );
1056 while ( !src.isEmpty() )
1058 checkBuffer();
1059 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1060 uint l = 0;
1061 while(l < src.length() && (src.toString()[l]).toLatin1().constData() != '>')
1062 l++;
1063 qDebug("src is now: *%s*, tquote: %d",
1064 src.toString().left(l).toLatin1().constData(), tquote);
1065 #endif
1066 switch(tag) {
1067 case NoTag:
1068 return;
1069 case TagName:
1071 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1072 qDebug("TagName");
1073 #endif
1074 if (searchCount > 0)
1076 if (*src == commentStart[searchCount])
1078 searchCount++;
1079 if (searchCount == 2)
1080 doctypeSearchCount++; // A '!' is also part of doctype, so we are moving through that still as well
1081 else
1082 doctypeSearchCount = 0;
1084 if (searchCount == 4)
1086 #ifdef TOKEN_DEBUG
1087 kDebug( 6036 ) << "Found comment";
1088 #endif
1089 // Found '<!--' sequence
1090 ++src;
1091 dest = buffer; // ignore the previous part of this tag
1092 tag = NoTag;
1094 comment = true;
1095 parseComment(src);
1096 return; // Finished parsing tag!
1098 // cuts of high part, is okay
1099 cBuffer[cBufferPos++] = src->cell();
1100 ++src;
1101 break;
1103 else
1104 searchCount = 0; // Stop looking for '<!--' sequence
1107 if (doctypeSearchCount > 0) {
1108 if((*src).toLower() == doctypeStart[doctypeSearchCount]) {
1109 doctypeSearchCount++;
1110 cBuffer[cBufferPos++] = src->cell();
1111 ++src;
1112 if(doctypeSearchCount == 9) {
1113 // Found '<!DOCTYPE' sequence
1114 tag = NoTag;
1115 doctypeAllowComment = true;
1116 doctypeComment = NoDoctypeComment;
1117 doctypeToken.reset();
1118 doctype = true;
1120 parseDoctype(src);
1121 return;
1123 break;
1124 } else
1125 doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
1128 bool finish = false;
1129 unsigned int ll = qMin(src.length(), CBUFLEN-cBufferPos);
1130 while(ll--) {
1131 ushort curchar = src->unicode();
1132 if(curchar <= ' ' || curchar == '>' ) {
1133 finish = true;
1134 break;
1136 // this is a nasty performance trick. will work for the A-Z
1137 // characters, but not for others. if it contains one,
1138 // we fail anyway
1139 char cc = curchar;
1140 cBuffer[cBufferPos++] = cc | 0x20;
1141 ++src;
1144 // Disadvantage: we add the possible rest of the tag
1145 // as attribute names. ### judge if this causes problems
1146 if(finish || CBUFLEN == cBufferPos) {
1147 bool beginTag;
1148 char* ptr = cBuffer;
1149 unsigned int len = cBufferPos;
1150 cBuffer[cBufferPos] = '\0';
1151 if ((cBufferPos > 0) && (*ptr == '/'))
1153 // End Tag
1154 beginTag = false;
1155 ptr++;
1156 len--;
1158 else
1159 // Start Tag
1160 beginTag = true;
1161 // Accept empty xml tags like <br/>
1162 if(len > 1 && ptr[len-1] == '/' ) {
1163 ptr[--len] = '\0';
1164 // if its like <br/> and not like <input/ value=foo>, take it as flat
1165 if (*src == '>')
1166 currToken.flat = true;
1169 uint tagID = khtml::getTagID(ptr, len);
1170 if (!tagID) {
1171 DOMString tagName(ptr);
1172 DocumentImpl *doc = parser->docPtr();
1173 if (Element::khtmlValidQualifiedName(tagName))
1174 tagID = doc->getId(NodeImpl::ElementId, tagName.implementation(), false, false);
1175 #ifdef TOKEN_DEBUG
1176 QByteArray tmp(ptr, len+1);
1177 kDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"";
1178 #endif
1180 if (tagID) {
1181 #ifdef TOKEN_DEBUG
1182 QByteArray tmp(ptr, len+1);
1183 kDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data();
1184 #endif
1185 currToken.tid = beginTag ? tagID : tagID + ID_CLOSE_TAG;
1187 dest = buffer;
1188 tag = SearchAttribute;
1189 cBufferPos = 0;
1191 break;
1193 case SearchAttribute:
1195 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1196 qDebug("SearchAttribute");
1197 #endif
1198 bool atespace = false;
1199 ushort curchar;
1200 while(!src.isEmpty()) {
1201 curchar = src->unicode();
1202 if(curchar > ' ') {
1203 if(curchar == '<' || curchar == '>')
1204 tag = SearchEnd;
1205 else if(atespace && (curchar == '\'' || curchar == '"'))
1207 tag = SearchValue;
1208 *dest++ = 0;
1209 attrName.clear();
1211 else
1212 tag = AttributeName;
1214 cBufferPos = 0;
1215 break;
1217 atespace = true;
1218 ++src;
1220 break;
1222 case AttributeName:
1224 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1225 qDebug("AttributeName");
1226 #endif
1227 ushort curchar;
1228 int ll = qMin(src.length(), CBUFLEN-cBufferPos);
1230 while(ll--) {
1231 curchar = src->unicode();
1232 if(curchar <= '>') {
1233 if(curchar <= ' ' || curchar == '=' || curchar == '>') {
1234 unsigned int a;
1235 cBuffer[cBufferPos] = '\0';
1236 a = khtml::getAttrID(cBuffer, cBufferPos);
1238 if ( !a ) {
1239 // did we just get /> or e.g checked/>
1240 if (curchar == '>' && cBufferPos >=1 && cBuffer[cBufferPos-1] == '/') {
1241 currToken.flat = true;
1242 if (cBufferPos>1)
1243 a = khtml::getAttrID(cBuffer, cBufferPos-1);
1245 if (!a)
1246 attrName = QLatin1String(QByteArray(cBuffer, cBufferPos+1).data());
1249 dest = buffer;
1250 *dest++ = a;
1251 #ifdef TOKEN_DEBUG
1252 if (!a || (cBufferPos && *cBuffer == '!'))
1253 kDebug( 6036 ) << "Unknown attribute: *" << QByteArray(cBuffer, cBufferPos+1).data() << "*";
1254 else
1255 kDebug( 6036 ) << "Known attribute: " << QByteArray(cBuffer, cBufferPos+1).data();
1256 #endif
1258 tag = SearchEqual;
1259 break;
1262 cBuffer[cBufferPos++] =
1263 ( curchar >= 'A' && curchar <= 'Z' ) ? curchar | 0x20 : curchar;
1264 ++src;
1266 if ( cBufferPos == CBUFLEN ) {
1267 cBuffer[cBufferPos] = '\0';
1268 attrName = QLatin1String(QByteArray(cBuffer, cBufferPos+1).data());
1269 dest = buffer;
1270 *dest++ = 0;
1271 tag = SearchEqual;
1273 break;
1275 case SearchEqual:
1277 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1278 qDebug("SearchEqual");
1279 #endif
1280 ushort curchar;
1281 bool atespace = false;
1282 while(!src.isEmpty()) {
1283 curchar = src->unicode();
1284 if(curchar > ' ') {
1285 if(curchar == '=') {
1286 #ifdef TOKEN_DEBUG
1287 kDebug(6036) << "found equal";
1288 #endif
1289 tag = SearchValue;
1290 ++src;
1292 else if(atespace && (curchar == '\'' || curchar == '"'))
1294 tag = SearchValue;
1295 *dest++ = 0;
1296 attrName.clear();
1298 else {
1299 DOMString v("");
1300 currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1301 dest = buffer;
1302 tag = SearchAttribute;
1304 break;
1306 atespace = true;
1307 ++src;
1309 break;
1311 case SearchValue:
1313 ushort curchar;
1314 while(!src.isEmpty()) {
1315 curchar = src->unicode();
1316 if(curchar > ' ') {
1317 if(( curchar == '\'' || curchar == '\"' )) {
1318 tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1319 tag = QuotedValue;
1320 ++src;
1321 } else
1322 tag = Value;
1324 break;
1326 ++src;
1328 break;
1330 case QuotedValue:
1332 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1333 qDebug("QuotedValue");
1334 #endif
1335 ushort curchar;
1336 while(!src.isEmpty()) {
1337 checkBuffer();
1339 curchar = src->unicode();
1340 if(curchar <= '\'' && !src.escaped()) {
1341 // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
1342 if ( curchar == '&' )
1344 ++src;
1345 parseEntity(src, dest, true);
1346 break;
1348 else if ( (tquote == SingleQuote && curchar == '\'') ||
1349 (tquote == DoubleQuote && curchar == '\"') )
1351 // some <input type=hidden> rely on trailing spaces. argh
1352 while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
1353 dest--; // remove trailing newlines
1354 DOMString v(buffer+1, dest-buffer-1);
1355 currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1357 dest = buffer;
1358 tag = SearchAttribute;
1359 tquote = NoQuote;
1360 ++src;
1361 break;
1364 *dest++ = *src;
1365 ++src;
1367 break;
1369 case Value:
1371 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1372 qDebug("Value");
1373 #endif
1374 ushort curchar;
1375 while(!src.isEmpty()) {
1376 checkBuffer();
1377 curchar = src->unicode();
1378 if(curchar <= '>' && !src.escaped()) {
1379 // parse Entities
1380 if ( curchar == '&' )
1382 ++src;
1383 parseEntity(src, dest, true);
1384 break;
1386 // no quotes. Every space means end of value
1387 // '/' does not delimit in IE!
1388 if ( curchar <= ' ' || curchar == '>' )
1390 DOMString v(buffer+1, dest-buffer-1);
1391 currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1392 dest = buffer;
1393 tag = SearchAttribute;
1394 break;
1398 *dest++ = *src;
1399 ++src;
1401 break;
1403 case SearchEnd:
1405 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1406 qDebug("SearchEnd");
1407 #endif
1408 while(!src.isEmpty()) {
1409 if(*src == '<' || *src == '>')
1410 break;
1412 if (*src == '/')
1413 currToken.flat = true;
1415 ++src;
1417 if(src.isEmpty() && *src != '<' && *src != '>') break;
1419 searchCount = 0; // Stop looking for '<!--' sequence
1420 tag = NoTag;
1421 tquote = NoQuote;
1422 if ( *src == '>' )
1423 ++src;
1425 if ( !currToken.tid ) //stop if tag is unknown
1426 return;
1428 uint tagID = currToken.tid;
1429 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
1430 kDebug( 6036 ) << "appending Tag: " << tagID;
1431 #endif
1432 // If the tag requires an end tag it cannot be flat,
1433 // unless we are using the HTML parser to parse XHTML
1434 // The only exception is SCRIPT and priority 0 tokens.
1435 if (tagID < ID_CLOSE_TAG && tagID != ID_SCRIPT &&
1436 DOM::endTagRequirement(tagID) == DOM::REQUIRED &&
1437 parser->doc()->htmlMode() != DocumentImpl::XHtml)
1438 currToken.flat = false;
1440 bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
1442 if(tagID >= ID_CLOSE_TAG)
1443 tagID -= ID_CLOSE_TAG;
1444 else if ( !brokenScript && tagID == ID_SCRIPT ) {
1445 DOMStringImpl* a = 0;
1446 bool foundTypeAttribute = false;
1447 scriptSrc = scriptSrcCharset = QString();
1448 if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
1449 view && /* are we a regular tokenizer or just for innerHTML ? */
1450 parser->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
1452 if ( ( a = currToken.attrs->getValue( ATTR_SRC ) ) )
1453 scriptSrc = parser->doc()->completeURL(khtml::parseURL( DOMString(a) ).string() );
1454 if ( ( a = currToken.attrs->getValue( ATTR_CHARSET ) ) )
1455 scriptSrcCharset = DOMString(a).string().trimmed();
1456 if ( scriptSrcCharset.isEmpty() && view)
1457 scriptSrcCharset = parser->doc()->view()->part()->encoding();
1458 /* Check type before language, since language is deprecated */
1459 if ((a = currToken.attrs->getValue(ATTR_TYPE)) != 0 && !DOMString(a).string().isEmpty())
1460 foundTypeAttribute = true;
1461 else
1462 a = currToken.attrs->getValue(ATTR_LANGUAGE);
1464 javascript = true;
1466 if( foundTypeAttribute ) {
1468 Mozilla 1.5 doesn't accept the text/javascript1.x formats, but WinIE 6 does.
1469 Mozilla 1.5 doesn't accept text/jscript, text/ecmascript, and text/livescript, but WinIE 6 does.
1470 Mozilla 1.5 accepts application/x-javascript, WinIE 6 doesn't.
1471 Mozilla 1.5 allows leading and trailing whitespace, but WinIE 6 doesn't.
1472 Mozilla 1.5 and WinIE 6 both accept the empty string, but neither accept a whitespace-only string.
1473 We want to accept all the values that either of these browsers accept, but not other values.
1475 QString type = DOMString(a).string().trimmed().toLower();
1476 if( type.compare("text/javascript") != 0 &&
1477 type.compare("text/javascript1.0") != 0 &&
1478 type.compare("text/javascript1.1") != 0 &&
1479 type.compare("text/javascript1.2") != 0 &&
1480 type.compare("text/javascript1.3") != 0 &&
1481 type.compare("text/javascript1.4") != 0 &&
1482 type.compare("text/javascript1.5") != 0 &&
1483 type.compare("text/jscript") != 0 &&
1484 type.compare("text/ecmascript") != 0 &&
1485 type.compare("text/livescript") != 0 &&
1486 type.compare("application/x-javascript") != 0 &&
1487 type.compare("application/x-ecmascript") != 0 &&
1488 type.compare("application/javascript") != 0 &&
1489 type.compare("application/ecmascript") != 0 )
1490 javascript = false;
1491 } else if( a ) {
1493 Mozilla 1.5 doesn't accept jscript or ecmascript, but WinIE 6 does.
1494 Mozilla 1.5 accepts javascript1.0, javascript1.4, and javascript1.5, but WinIE 6 accepts only 1.1 - 1.3.
1495 Neither Mozilla 1.5 nor WinIE 6 accept leading or trailing whitespace.
1496 We want to accept all the values that either of these browsers accept, but not other values.
1498 QString lang = DOMString(a).string();
1499 lang = lang.toLower();
1500 if( lang.compare("") != 0 &&
1501 lang.compare("javascript") != 0 &&
1502 lang.compare("javascript1.0") != 0 &&
1503 lang.compare("javascript1.1") != 0 &&
1504 lang.compare("javascript1.2") != 0 &&
1505 lang.compare("javascript1.3") != 0 &&
1506 lang.compare("javascript1.4") != 0 &&
1507 lang.compare("javascript1.5") != 0 &&
1508 lang.compare("ecmascript") != 0 &&
1509 lang.compare("livescript") != 0 &&
1510 lang.compare("jscript") )
1511 javascript = false;
1515 processToken();
1517 if ( parser->selectMode() && beginTag)
1518 discard = AllDiscard;
1520 switch( tagID ) {
1521 case ID_PRE:
1522 pre = beginTag;
1523 if (beginTag)
1524 discard = LFDiscard;
1525 prePos = 0;
1526 break;
1527 case ID_BR:
1528 prePos = 0;
1529 break;
1530 case ID_SCRIPT:
1531 if (beginTag) {
1532 searchStopper = scriptEnd;
1533 searchStopperLen = 8;
1534 script = true;
1535 parseSpecial(src);
1537 else if (tagID < ID_CLOSE_TAG) // Handle <script src="foo"/>
1538 scriptHandler();
1539 break;
1540 case ID_STYLE:
1541 if (beginTag) {
1542 searchStopper = styleEnd;
1543 searchStopperLen = 7;
1544 style = true;
1545 parseSpecial(src);
1547 break;
1548 case ID_TEXTAREA:
1549 if(beginTag) {
1550 searchStopper = textareaEnd;
1551 searchStopperLen = 10;
1552 textarea = true;
1553 discard = NoneDiscard;
1554 parseSpecial(src);
1556 break;
1557 case ID_TITLE:
1558 if (beginTag) {
1559 searchStopper = titleEnd;
1560 searchStopperLen = 7;
1561 title = true;
1562 parseSpecial(src);
1564 break;
1565 case ID_XMP:
1566 if (beginTag) {
1567 searchStopper = xmpEnd;
1568 searchStopperLen = 5;
1569 xmp = true;
1570 parseSpecial(src);
1572 break;
1573 case ID_SELECT:
1574 select = beginTag;
1575 break;
1576 case ID_PLAINTEXT:
1577 plaintext = beginTag;
1578 break;
1580 return; // Finished parsing tag!
1582 } // end switch
1584 return;
1587 void HTMLTokenizer::addPending()
1589 if ( select && !(comment || script))
1591 *dest++ = ' ';
1593 else if ( textarea )
1595 switch(pending) {
1596 case LFPending: *dest++ = QLatin1Char('\n'); prePos = 0; break;
1597 case SpacePending: *dest++ = QLatin1Char(' '); ++prePos; break;
1598 case TabPending: *dest++ = QLatin1Char('\t'); prePos += TAB_SIZE - (prePos % TAB_SIZE); break;
1599 case NonePending:
1600 assert(0);
1603 else
1605 int p;
1607 switch (pending)
1609 case SpacePending:
1610 // Insert a breaking space
1611 *dest++ = QLatin1Char(' ');
1612 prePos++;
1613 break;
1615 case LFPending:
1616 *dest = QLatin1Char('\n');
1617 dest++;
1618 prePos = 0;
1619 break;
1621 case TabPending:
1622 p = TAB_SIZE - ( prePos % TAB_SIZE );
1623 for ( int x = 0; x < p; x++ )
1624 *dest++ = QLatin1Char(' ');
1625 prePos += p;
1626 break;
1628 case NonePending:
1629 assert(0);
1630 break;
1634 pending = NonePending;
// Feeds a piece of input to the tokenizer.
//
// str        - the input to tokenize
// appendData - forwarded to the queueing logic below: it selects whether
//              deferred input is appended to the bottom or to the top entry
//              of pendingQueue, and only appended data is deferred while a
//              script is executing.
//
// The input is either queued (script running / external scripts outstanding),
// merely accumulated (tokenizer on hold) or tokenized right away by the main
// loop, which picks a sub-parser according to the current state flags.
1637 void HTMLTokenizer::write( const TokenizerString &str, bool appendData )
1639 #ifdef TOKEN_DEBUG
1640 kDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")";
1641 #endif
// without an output buffer (e.g. after end() released it) nothing can be done
1643 if ( !buffer )
1644 return;
// defer the input while a script is executing (appended data only) or while
// external scripts are still outstanding
1646 if ( ( m_executingScript && appendData ) || cachedScript.count() ) {
1647 // don't parse; we will do this later
1648 if (pendingQueue.isEmpty())
1649 pendingQueue.push(str);
1650 else if (appendData)
1651 pendingQueue.bottom().append(str);
1652 else
1653 pendingQueue.top().append(str);
1654 #if PROSPECTIVE_TOKENIZER_ENABLED
1655 if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData)
1656 m_prospectiveTokenizer->write(str);
1657 #endif
1658 return;
// while on hold the input is only collected in src
1661 if ( onHold ) {
1662 src.append(str);
1663 return;
// continue with left-over input if there is any, otherwise start afresh
1666 if (!src.isEmpty())
1667 src.append(str);
1668 else
1669 setSrc(str);
1670 m_abort = false;
1672 // if (Entity)
1673 // parseEntity(src, dest);
// main loop: each iteration hands src to the sub-parser selected by the first
// state flag that is set, or consumes a single character itself
1675 while ( !src.isEmpty() )
1677 if ( m_abort )
1678 return;
1679 // do we need to enlarge the buffer?
1680 checkBuffer();
1682 ushort cc = src->unicode();
// skipLF is set after a '\r' (see the newline branch below); it only stays
// set if the very next character is the '\n' of a CRLF pair
1684 if (skipLF && (cc != '\n'))
1685 skipLF = false;
1687 if (skipLF) {
1688 skipLF = false;
1689 ++src;
1691 else if ( Entity )
1692 parseEntity( src, dest );
1693 else if ( plaintext )
1694 parseText( src );
1695 else if (script)
1696 parseSpecial(src);
1697 else if (style)
1698 parseSpecial(src);
1699 else if (xmp)
1700 parseSpecial(src);
1701 else if (textarea)
1702 parseSpecial(src);
1703 else if (title)
1704 parseSpecial(src);
1705 else if (comment)
1706 parseComment(src);
1707 else if (doctypeComment && doctypeComment != DoctypeCommentEnd && doctypeComment != DoctypeCommentBogus)
1708 parseDoctypeComment(src);
1709 else if (doctype)
1710 parseDoctype(src);
1711 else if (server)
1712 parseServer(src);
1713 else if (processingInstruction)
1714 parseProcessingInstruction(src);
1715 else if (tag)
1716 parseTag(src);
// the previous character was an unescaped '<' (startTag is set in the '<'
// branch below); cc is the character that follows it
1717 else if ( startTag )
1719 startTag = false;
1720 bool endTag = false;
1722 switch(cc) {
1723 case '/':
1724 endTag = true;
1725 break;
1726 case '!':
1728 // <!-- comment --> or <!DOCTYPE ...>
1729 searchCount = 1; // Look for '<!--' sequence to start comment...
1730 doctypeSearchCount = 1; // ... or for '<!DOCTYPE' sequence to start doctype
1731 break;
1733 case '?':
1735 // xml processing instruction
1736 processingInstruction = true;
1737 tquote = NoQuote;
1738 parseProcessingInstruction(src);
1739 continue;
1741 break;
1743 case '%':
1744 if (!brokenServer) {
1745 // <% server stuff, handle as comment %>
1746 server = true;
1747 tquote = NoQuote;
1748 parseServer(src);
1749 continue;
1751 // else fall through
1752 default:
1754 if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
1756 // Start of a Start-Tag
1758 else
1760 // Invalid tag
1761 // Add as is
1762 if (pending)
1763 addPending();
1764 *dest = '<';
1765 dest++;
1766 continue;
1769 }; // end case
1771 // According to SGML any LF immediately after a starttag, or
1772 // immediately before an endtag should be ignored.
1773 // ### Gecko and MSIE though only ignores LF immediately after
1774 // starttags and only for PRE elements -- asj (28/06-2005)
// note: the 'else' below belongs to 'if (!select)'
1775 if ( pending )
1776 if (!select)
1777 addPending();
1778 else
1779 pending = NonePending;
1781 // Cancel unused discards
1782 discard = NoneDiscard;
1783 // if (!endTag) discard = LFDiscard;
// flush the text collected so far before the tag itself is parsed
1785 processToken();
1787 cBufferPos = 0;
1788 tag = TagName;
1789 parseTag(src);
1791 else if ( cc == '&' && !src.escaped())
1793 ++src;
1794 if ( pending )
1795 addPending();
1796 discard = NoneDiscard;
1797 parseEntity(src, dest, true);
1799 else if ( cc == '<' && !src.escaped())
1801 tagStartLineno = lineno+src.lineCount();
1802 ++src;
1803 discard = NoneDiscard;
1804 startTag = true;
1806 else if (( cc == '\n' ) || ( cc == '\r' ))
1808 if (discard == SpaceDiscard)
1809 discard = NoneDiscard;
1811 if (discard == LFDiscard) {
1812 // Ignore one LF
1813 discard = NoneDiscard;
1815 else if (discard == AllDiscard)
1817 // Ignore
1819 else
1821 if (select && !script) {
1822 pending = LFPending;
1823 } else {
1824 if (pending)
1825 addPending();
1826 pending = LFPending;
1830 /* Check for MS-DOS CRLF sequence */
1831 if (cc == '\r')
1833 skipLF = true;
1835 ++src;
1837 else if (( cc == ' ' ) || ( cc == '\t' ))
1839 if(discard == LFDiscard)
1840 discard = NoneDiscard;
1842 if(discard == SpaceDiscard) {
1843 // Ignore one space
1844 discard = NoneDiscard;
1846 else if(discard == AllDiscard)
1848 // Ignore
1850 else {
1851 if (select && !script) {
1852 if (!pending)
1853 pending = SpacePending;
1854 } else {
1855 if (pending)
1856 addPending();
1857 if (cc == ' ')
1858 pending = SpacePending;
1859 else
1860 pending = TabPending;
1864 ++src;
// any other character: flush pending whitespace and copy it to the output
1866 else
1868 if (pending)
1869 addPending();
1871 discard = NoneDiscard;
1872 if ( pre )
1874 prePos++;
1876 *dest = *src;
1877 fixUpChar( *dest );
1878 ++dest;
1879 ++src;
// all input consumed: finish up if no more data is expected and no script is
// pending or running
1883 if (noMoreData && cachedScript.isEmpty() && !m_executingScript)
1884 end(); // this actually causes us to be deleted
1887 void HTMLTokenizer::timerEvent( QTimerEvent *e )
1889 if ( e->timerId() == m_autoCloseTimer && cachedScript.isEmpty() ) {
1890 finish();
1894 void HTMLTokenizer::setAutoClose( bool b ) {
1895 killTimer( m_autoCloseTimer );
1896 m_autoCloseTimer = 0;
1897 if ( b )
1898 m_autoCloseTimer = startTimer(100);
1901 void HTMLTokenizer::end()
1903 if ( buffer == 0 ) {
1904 emit finishedParsing();
1905 return;
1908 // parseTag is using the buffer for different matters
1909 if ( !tag )
1910 processToken();
1912 if(buffer)
1913 KHTML_DELETE_QCHAR_VEC(buffer);
1915 if(scriptCode)
1916 KHTML_DELETE_QCHAR_VEC(scriptCode);
1918 scriptCode = 0;
1919 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1920 buffer = 0;
1921 emit finishedParsing();
// Tells the tokenizer that no more data will arrive.
//
// Stops the auto-close timer, re-feeds the content of unterminated
// title/script/comment/server sections through write() and finally calls
// end(), unless external scripts are outstanding, a script is executing or
// the tokenizer is on hold.
1924 void HTMLTokenizer::finish()
1926 if ( m_autoCloseTimer ) {
1927 killTimer( m_autoCloseTimer );
1928 m_autoCloseTimer = 0;
1930 // do this as long as we don't find matching comment ends
1931 while((title || script || comment || server) && scriptCode && scriptCodeSize)
1933 // we've found an unmatched comment start
1934 if (comment)
1935 brokenComments = true;
1936 else if (server)
1937 brokenServer = true;
1938 else if (script)
1939 brokenScript = true;
// make sure the two terminating zeros written below fit into scriptCode
// (presumably what checkScriptBuffer() guarantees -- TODO confirm)
1941 checkScriptBuffer();
1942 scriptCode[ scriptCodeSize ] = 0;
1943 scriptCode[ scriptCodeSize + 1 ] = 0;
1944 int pos;
// 'food' receives a copy of the text that is fed to write() again
1945 QString food;
// NOTE(review): 'style' is tested here although it is not part of the loop
// condition above and is not reset below -- confirm this is intended
1946 if (title || style || script)
1947 food.setUnicode(scriptCode, scriptCodeSize);
1948 else if (server) {
1949 food = "<";
1950 food += QString(scriptCode, scriptCodeSize);
1952 else {
// comment: drop everything up to and including the first '>'; if there is
// none, indexOf() yields -1 and the whole buffer is taken
1953 pos = QString::fromRawData(scriptCode, scriptCodeSize).indexOf('>');
1954 food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
1956 KHTML_DELETE_QCHAR_VEC(scriptCode);
1957 scriptCode = 0;
1958 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
1959 if (script)
1960 scriptHandler();
// leave the special section before tokenizing its content again
1962 comment = title = server = script = false;
1963 if ( !food.isEmpty() )
1964 write(food, true);
1966 // this indicates we will not receive any more data... but if we are waiting on
1967 // an external script to load, we can't finish parsing until that is done
1968 noMoreData = true;
1969 if (cachedScript.isEmpty() && !m_executingScript && !onHold)
1970 end(); // this actually causes us to be deleted
// Hands the current token (currToken) over to the parser.
//
// Text collected in the output buffer (between 'buffer' and 'dest') is
// attached to the token, which then becomes a text token unless it is a
// comment token. If there is neither text nor a token id, the token is just
// reset. Afterwards the output buffer is empty again and currToken is reset.
1973 void HTMLTokenizer::processToken()
// without a view there is no script proxy to inform about line numbers
1975 KJSProxy *jsProxy = view ? view->part()->jScript() : 0L;
1976 if (jsProxy)
1977 jsProxy->setEventHandlerLineno(tagStartLineno+1);
1978 if ( dest > buffer )
1980 #if 0
1981 if(currToken.tid) {
1982 qDebug( "unexpected token id: %d, str: *%s*", currToken.tid,QString::fromRawData( buffer, dest-buffer ).toLatin1().constData() );
1983 assert(0);
1986 #endif
// the buffer content becomes the text of the token
1987 currToken.text = new DOMStringImpl( buffer, dest - buffer );
1988 currToken.text->ref();
1989 if (currToken.tid != ID_COMMENT)
1990 currToken.tid = ID_TEXT;
// neither text nor a token id: nothing to pass on
1992 else if(!currToken.tid) {
1993 currToken.reset();
1994 if (jsProxy)
1995 jsProxy->setEventHandlerLineno(lineno+src.lineCount()+1);
1996 return;
// the buffer content has been consumed, start again at its beginning
1999 dest = buffer;
2001 #ifdef TOKEN_DEBUG
2002 QString name = QString( getTagName(currToken.tid) );
2003 QString text;
2004 if(currToken.text)
2005 text = QString::fromRawData(currToken.text->s, currToken.text->l);
2007 kDebug( 6036 ) << "Token --> " << name << " id = " << currToken.tid;
2008 if (currToken.flat)
2009 kDebug( 6036 ) << "Token is FLAT!";
2010 if(!text.isNull())
2011 kDebug( 6036 ) << "text: \"" << text << "\"";
2012 unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
2013 if(l) {
2014 kDebug( 6036 ) << "Attributes: " << l;
2015 for (unsigned long i = 0; i < l; ++i) {
2016 NodeImpl::Id tid = currToken.attrs->idAt(i);
2017 DOMString value = currToken.attrs->valueAt(i);
2018 kDebug( 6036 ) << " " << tid << " " << parser->doc()->getDocument()->getName(NodeImpl::AttributeId, tid).string()
2019 << "=\"" << value.string() << "\"" << endl;
2022 kDebug( 6036 );
2023 #endif
2025 // In some cases, parseToken() can cause javascript code to be executed
2026 // (for example, when setting an attribute that causes an event handler
2027 // to be created). So we need to protect against re-entrancy into the parser
2028 m_executingScript++;
2030 // pass the token over to the parser, the parser DOES NOT delete the token
2031 parser->parseToken(&currToken);
2033 m_executingScript--;
2035 if ( currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces() )
2036 discard = NoneDiscard;
2038 currToken.reset();
2039 if (jsProxy)
2040 jsProxy->setEventHandlerLineno(1);
2043 void HTMLTokenizer::processDoctypeToken()
2045 // kDebug( 6036 ) << "Process DoctypeToken (name: " << doctypeToken.name << ", publicID: " << doctypeToken.publicID << ", systemID: " << doctypeToken.systemID;
2046 doctypeToken.publicID = doctypeToken.publicID.simplified();
2047 doctypeToken.systemID = doctypeToken.systemID.simplified();
2048 parser->parseDoctypeToken(&doctypeToken);
2052 HTMLTokenizer::~HTMLTokenizer()
2054 reset();
2055 delete m_prospectiveTokenizer;
2056 delete parser;
2060 void HTMLTokenizer::enlargeBuffer(int len)
2062 int newsize = qMax(size*2, size+len);
2063 int oldoffs = (dest - buffer);
2065 buffer = KHTML_REALLOC_QCHAR_VEC(buffer, newsize);
2066 dest = buffer + oldoffs;
2067 size = newsize;
2070 void HTMLTokenizer::enlargeScriptBuffer(int len)
2072 int newsize = qMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
2073 scriptCode = KHTML_REALLOC_QCHAR_VEC(scriptCode, newsize);
2074 scriptCodeMaxSize = newsize;
// Called when an external script has finished loading.
//
// Executes, in queue order, every script at the head of cachedScript that is
// already loaded. Once no script is outstanding any more, the input that
// write() queued in the meantime is tokenized.
//
// The parameter is unused; at least one script must be queued.
2077 void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/)
2079 assert(!cachedScript.isEmpty());
2080 bool done = false;
// 'done' is tested first, so the queue head is never looked at (and no
// member is touched) after the queue ran empty
2081 while (!done && cachedScript.head()->isLoaded()) {
2083 kDebug( 6036 ) << "Finished loading an external script";
2085 CachedScript* cs = cachedScript.dequeue();
2086 DOMString scriptSource = cs->script();
2087 #ifdef TOKEN_DEBUG
2088 kDebug( 6036 ) << "External script is:" << endl << scriptSource.string();
2089 #endif
2090 setSrc(TokenizerString());
2092 // make sure we forget about the script before we execute the new one
2093 // infinite recursion might happen otherwise
2094 QString cachedScriptUrl( cs->url().string() );
2095 cs->deref(this);
2097 scriptExecution( scriptSource.string(), cachedScriptUrl );
2099 done = cachedScript.isEmpty();
2101 // 'script' is true when we are called synchronously from
2102 // scriptHandler(). In that case scriptHandler() will take care
2103 // of 'scriptOutput'.
2104 if ( !script ) {
// merge all queued input into a single entry
2105 while (pendingQueue.count() > 1) {
2106 TokenizerString t = pendingQueue.pop();
2107 pendingQueue.top().prepend( t );
// resume tokenizing only when no further script is outstanding
2109 if (done) {
2110 write(pendingQueue.pop(), false);
2112 // we might be deleted at this point, do not
2113 // access any members.
2118 bool HTMLTokenizer::isWaitingForScripts() const
2120 return cachedScript.count();
2123 bool HTMLTokenizer::isExecutingScript() const
2125 return (m_executingScript > 0);
2128 void HTMLTokenizer::setSrc(const TokenizerString& source)
2130 lineno += src.lineCount();
2131 src = source;
2132 src.resetLineCount();
2135 void HTMLTokenizer::setOnHold(bool _onHold)
2137 if (onHold == _onHold) return;
2138 onHold = _onHold;