2 This file is part of the KDE libraries
4 Copyright (C) 1997 Martin Jones (mjones@kde.org)
5 (C) 1997 Torben Weis (weis@kde.org)
6 (C) 1998 Waldo Bastian (bastian@kde.org)
7 (C) 1999 Lars Knoll (knoll@kde.org)
8 (C) 1999 Antti Koivisto (koivisto@kde.org)
9 (C) 2001-2003 Dirk Mueller (mueller@kde.org)
10 (C) 2004 Apple Computer, Inc.
11 (C) 2006 Germain Garand (germain@ebooksfrance.org)
13 This library is free software; you can redistribute it and/or
14 modify it under the terms of the GNU Library General Public
15 License as published by the Free Software Foundation; either
16 version 2 of the License, or (at your option) any later version.
18 This library is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 Library General Public License for more details.
23 You should have received a copy of the GNU Library General Public License
24 along with this library; see the file COPYING.LIB. If not, write to
25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 Boston, MA 02110-1301, USA.
28 //----------------------------------------------------------------------------
30 // KDE HTML Widget - Tokenizers
32 // #define TOKEN_DEBUG 1
33 //#define TOKEN_DEBUG 2
35 #include "htmltokenizer.h"
36 #include "html_documentimpl.h"
37 #include "htmlparser.h"
40 #include <misc/loader.h>
41 #include <misc/htmlhashes.h>
43 #include <khtmlview.h>
44 #include <khtml_part.h>
45 #include <xml/dom_docimpl.h>
46 #include <css/csshelper.h>
47 #include <ecma/kjs_proxy.h>
48 #include <kcharsets.h>
52 #include <QtCore/QVariant>
58 #include "kentities.c"
59 #include "htmlprospectivetokenizer.h"
61 #define PROSPECTIVE_TOKENIZER_ENABLED 1
63 using namespace khtml
;
// Opening sequence of an HTML comment ("<!--"), terminated by QChar::Null.
// parseTag() compares each incoming character with commentStart[searchCount].
65 static const QChar commentStart
[] = { '<','!','-','-', QChar::Null
};
// Lower-case "<!doctype" marker. parseTag() matches lower-cased input against
// it and treats nine matched characters as the start of a doctype declaration.
66 static const char doctypeStart
[] = "<!doctype";
// Keywords parseDoctype() looks for after the doctype name. Each is six
// characters long and is compared against lower-cased input.
67 static const char publicStart
[] = "public";
68 static const char systemStart
[] = "system";
// Lower-case end-tag prefixes. parseTag() stores one of them in searchStopper
// (with its length in searchStopperLen), and parseSpecial() compares the tail
// of the collected text with searchStopper case-insensitively.
70 static const char scriptEnd
[] = "</script";
71 static const char xmpEnd
[] = "</xmp";
72 static const char styleEnd
[] = "</style";
73 static const char textareaEnd
[] = "</textarea";
74 static const char titleEnd
[] = "</title";
// Raw QChar buffer helpers built directly on malloc/realloc/free.
// N is a count of QChar elements (the macros multiply by sizeof(QChar)).
// The macros themselves do not check for a null result, and memory obtained
// this way must be released with KHTML_DELETE_QCHAR_VEC, not delete[].
// Allocate room for N QChars.
76 #define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
// Resize the buffer P to hold N QChars; evaluates to the (possibly moved) buffer.
77 #define KHTML_REALLOC_QCHAR_VEC(P, N ) (QChar*) realloc(P, sizeof(QChar)*( N ))
// Release a buffer obtained from the two macros above.
78 #define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
80 // Full support for MS Windows extensions to Latin-1.
81 // Technically these extensions should only be activated for pages
82 // marked "windows-1252" or "cp1252", but
83 // in the standard Microsoft way, these extensions infect hundreds of thousands
84 // of web pages. Note that people with non-latin-1 Microsoft extensions
87 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
88 // http://www.bbsinc.com/iso8859.html
89 // http://www.obviously.com/
91 // There may be better equivalents
// fixUpChar(x): if the code point of x is one of the values listed below
// (all in the 0x80-0x9F range), x is overwritten with the Unicode character
// given for it, e.g. 0x80 -> U+20AC and 0x99 -> U+2122.
// x must be an assignable expression that has a unicode() member; in this
// file it is applied to EntityChar in parseEntity(). x appears both in the
// switch condition and in the assignment, so it must be free of side effects.
// No case is listed for 0x81, 0x8D, 0x8F, 0x90 or 0x9D.
95 #define fixUpChar(x) \
96 switch ((x).unicode()) \
98 case 0x80: (x) = 0x20ac; break; \
99 case 0x82: (x) = 0x201a; break; \
100 case 0x83: (x) = 0x0192; break; \
101 case 0x84: (x) = 0x201e; break; \
102 case 0x85: (x) = 0x2026; break; \
103 case 0x86: (x) = 0x2020; break; \
104 case 0x87: (x) = 0x2021; break; \
105 case 0x88: (x) = 0x02C6; break; \
106 case 0x89: (x) = 0x2030; break; \
107 case 0x8A: (x) = 0x0160; break; \
108 case 0x8b: (x) = 0x2039; break; \
109 case 0x8C: (x) = 0x0152; break; \
110 case 0x8E: (x) = 0x017D; break; \
111 case 0x91: (x) = 0x2018; break; \
112 case 0x92: (x) = 0x2019; break; \
113 case 0x93: (x) = 0x201C; break; \
114 case 0x94: (x) = 0X201D; break; \
115 case 0x95: (x) = 0x2022; break; \
116 case 0x96: (x) = 0x2013; break; \
117 case 0x97: (x) = 0x2014; break; \
118 case 0x98: (x) = 0x02DC; break; \
119 case 0x99: (x) = 0x2122; break; \
120 case 0x9A: (x) = 0x0161; break; \
121 case 0x9b: (x) = 0x203A; break; \
122 case 0x9C: (x) = 0x0153; break; \
123 case 0x9E: (x) = 0x017E; break; \
124 case 0x9F: (x) = 0x0178; break; \
128 // ----------------------------------------------------------------------------
// Creates a tokenizer whose parser is built for a view and a document.
//   _doc  - document handed to the newly created KHTMLParser
//   _view - view handed to the newly created KHTMLParser
// Only part of the constructor is visible in this listing. The lines shown
// zero the script-buffer bookkeeping, fetch the global charset object and
// clear the script, timer and prospective-tokenizer members.
130 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl
*_doc
, KHTMLView
*_view
)
// No script text has been collected yet.
135 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
136 charsets
= KGlobal::charsets();
// The parser is allocated here. NOTE(review): its release is not visible in
// this part of the file - confirm the destructor deletes it.
137 parser
= new KHTMLParser(_view
, _doc
);
138 m_executingScript
= 0;
// 0 means "no timer"; reset() only calls killTimer() for a non-zero value.
139 m_autoCloseTimer
= 0;
// Created lazily in scriptHandler() when it is first needed.
140 m_prospectiveTokenizer
= 0;
// Creates a tokenizer whose parser is built for a document fragment.
//   _doc - document handed to the newly created KHTMLParser
//   i    - document fragment handed to the newly created KHTMLParser
// Apart from the parser construction, the visible initialisation matches the
// view-based constructor. Only part of the constructor is visible here.
146 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl
*_doc
, DOM::DocumentFragmentImpl
*i
)
// No script text has been collected yet.
151 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
152 charsets
= KGlobal::charsets();
// The parser is allocated here. NOTE(review): its release is not visible in
// this part of the file - confirm the destructor deletes it.
153 parser
= new KHTMLParser( i
, _doc
);
154 m_executingScript
= 0;
155 m_autoCloseTimer
= 0;
156 m_prospectiveTokenizer
= 0;
// Returns the tokenizer to an idle state: releases the references held on
// queued scripts, frees the text and script buffers, stops a running
// auto-close timer and resets the doctype token.
// Preconditions (asserted): no script is executing and the tokenizer is not
// on hold. Only part of the function is visible in this listing.
162 void HTMLTokenizer::reset()
164 assert(m_executingScript
== 0);
165 Q_ASSERT(onHold
== false);
// Drop this tokenizer's reference on every script still waiting in the queue.
168 while (!cachedScript
.isEmpty())
169 cachedScript
.dequeue()->deref(this);
// Both buffers come from the KHTML_*_QCHAR_VEC macros and are freed the same way.
172 KHTML_DELETE_QCHAR_VEC(buffer
);
177 KHTML_DELETE_QCHAR_VEC(scriptCode
);
179 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
// A non-zero id means a timer is running; stop it and forget the id.
181 if (m_autoCloseTimer
) {
182 killTimer(m_autoCloseTimer
);
183 m_autoCloseTimer
= 0;
187 doctypeToken
.reset();
// Prepares the tokenizer for a new run: allocates the initial output buffer
// and puts the state flags visible below back to their starting values.
// Only part of the function is visible in this listing.
190 void HTMLTokenizer::begin()
192 m_executingScript
= 0;
// Initial buffer of 255 QChars. NOTE(review): no null check on the allocation
// is visible here - confirm against the full source.
196 buffer
= KHTML_ALLOC_QCHAR_VEC( 255 );
// No whitespace is waiting to be emitted and nothing is being discarded.
199 pending
= NonePending
;
200 discard
= NoneDiscard
;
205 processingInstruction
= false;
// Doctype scanning state starts from scratch.
213 doctypeComment
= NoDoctypeComment
;
214 doctypeAllowComment
= false;
221 doctypeSearchCount
= 0;
222 doctypeSecondarySearchCount
= 0;
// Compatibility flags start out cleared.
225 brokenComments
= false;
226 brokenServer
= false;
227 brokenScript
= false;
229 scriptStartLineno
= 0;
// Feeds the characters of 'list' through the preformatted-text path.
//   list - text to process; taken by value and consumed inside this function.
// Called from parseSpecial(), scriptHandler() and parseComment() with the
// contents of scriptCode. Several lines of the body are missing from this
// listing, so only the visible decisions are annotated.
233 void HTMLTokenizer::processListing(TokenizerString list
)
237 // This function adds the listing 'list' as
238 // preformatted text-tokens to the token-collection
239 // thereby converting TABs.
// Everything except style content is treated as preformatted.
240 if(!style
) pre
= true;
243 while ( !list
.isEmpty() )
// Called once per character with 3*TAB_SIZE; presumably this guarantees that
// much free room in the output buffer - confirm in checkBuffer().
245 checkBuffer(3*TAB_SIZE
);
// A pending skipLF followed by a character other than '\n'.
247 if (skipLF
&& ( list
->unicode() != '\n' ))
// Line break: either '\n' or '\r'.
257 else if (( list
->unicode() == '\n' ) || ( list
->unicode() == '\r' ))
259 if (discard
== LFDiscard
)
262 discard
= NoneDiscard
; // We have discarded 1 LF
270 // we used to do it not at all and we want to have
271 // it fixed for textarea. So here we are
278 /* Check for MS-DOS CRLF sequence */
279 if (list
->unicode() == '\r')
// Blank or tab: recorded in 'pending' rather than written immediately.
285 else if (( list
->unicode() == ' ' ) || ( list
->unicode() == '\t'))
290 pending
= SpacePending
;
292 pending
= TabPending
;
// Any other character ends a discard request.
298 discard
= NoneDiscard
;
// After the loop: a still-pending space or tab is cleared here.
309 if ((pending
== SpacePending
) || (pending
== TabPending
))
312 pending
= NonePending
;
318 void HTMLTokenizer::parseSpecial(TokenizerString
&src
)
320 assert( textarea
|| title
|| !Entity
);
322 assert( xmp
+textarea
+title
+style
+script
== 1 );
324 scriptStartLineno
= lineno
+src
.lineCount();
326 if ( comment
) parseComment( src
);
328 while ( !src
.isEmpty() ) {
330 unsigned char ch
= src
->toLatin1();
331 if ( !scriptCodeResync
&& !brokenComments
&& !textarea
&& !xmp
&& ch
== '-' && scriptCodeSize
>= 3 && !src
.escaped() && QString::fromRawData( scriptCode
+scriptCodeSize
-3, 3 ) == "<!-" ) {
333 scriptCode
[ scriptCodeSize
++ ] = ch
;
338 if ( scriptCodeResync
&& !tquote
&& ( ch
== '>' ) ) {
340 scriptCodeSize
= scriptCodeResync
-1;
341 scriptCodeResync
= 0;
342 scriptCode
[ scriptCodeSize
] = scriptCode
[ scriptCodeSize
+ 1 ] = 0;
346 processListing(TokenizerString(scriptCode
, scriptCodeSize
));
348 if ( style
) { currToken
.tid
= ID_STYLE
+ ID_CLOSE_TAG
; }
349 else if ( textarea
) { currToken
.tid
= ID_TEXTAREA
+ ID_CLOSE_TAG
; }
350 else if ( title
) { currToken
.tid
= ID_TITLE
+ ID_CLOSE_TAG
; }
351 else if ( xmp
) { currToken
.tid
= ID_XMP
+ ID_CLOSE_TAG
; }
353 script
= style
= textarea
= title
= xmp
= false;
355 scriptCodeSize
= scriptCodeResync
= 0;
359 // Possible end of tag name; let's check.
360 if ( !scriptCodeResync
&& !escaped
&& !src
.escaped() && ( ch
== '>' || ch
== '/' || ch
<= ' ' ) && ch
&&
361 scriptCodeSize
>= searchStopperLen
&&
362 !QString::fromRawData( scriptCode
+scriptCodeSize
-searchStopperLen
, searchStopperLen
).indexOf( searchStopper
, 0, Qt::CaseInsensitive
)) {
363 scriptCodeResync
= scriptCodeSize
-searchStopperLen
+1;
367 if ( scriptCodeResync
&& !escaped
) {
369 tquote
= (tquote
== NoQuote
) ? DoubleQuote
: ((tquote
== SingleQuote
) ? SingleQuote
: NoQuote
);
371 tquote
= (tquote
== NoQuote
) ? SingleQuote
: (tquote
== DoubleQuote
) ? DoubleQuote
: NoQuote
;
372 else if (tquote
!= NoQuote
&& (ch
== '\r' || ch
== '\n'))
375 escaped
= ( !escaped
&& ch
== '\\' );
376 if (!scriptCodeResync
&& (textarea
||title
) && !src
.escaped() && ch
== '&') {
377 QChar
*scriptCodeDest
= scriptCode
+scriptCodeSize
;
379 parseEntity(src
,scriptCodeDest
,true);
380 scriptCodeSize
= scriptCodeDest
-scriptCode
;
383 scriptCode
[ scriptCodeSize
++ ] = *src
;
// Handles a finished <script> element: turns the collected script text into
// a string, emits the closing script token id, then either requests the
// external script named by scriptSrc or executes the inline text, and
// finally decides what happens to the input that was set aside meanwhile.
// Parts of the body are missing from this listing; comments describe only
// the statements that are visible.
389 void HTMLTokenizer::scriptHandler()
// Work on a copy of the member holding the script's source URL.
391 QString currentScriptSrc
= scriptSrc
;
// Run the collected script text through processListing(); the text that ends
// up between 'buffer' and 'dest' becomes the inline script source.
394 processListing(TokenizerString(scriptCode
, scriptCodeSize
));
395 QString
exScript( buffer
, dest
-buffer
);
398 currToken
.tid
= ID_SCRIPT
+ ID_CLOSE_TAG
;
401 // Scripts following a frameset element should not be executed, or even loaded in the case of external scripts.
402 bool followingFrameset
= (parser
->doc()->body() && parser
->doc()->body()->id() == ID_FRAMESET
);
// A script takes effect only when the parser is not skipping content and the
// document body is not a frameset.
403 bool effectiveScript
= !parser
->skipMode() && !followingFrameset
;
404 bool deferredScript
= false;
406 if ( effectiveScript
) {
407 CachedScript
* cs
= 0;
409 // forget what we just got, load from src url instead
410 if ( !currentScriptSrc
.isEmpty() && javascript
&&
411 (cs
= parser
->doc()->docLoader()->requestScript(currentScriptSrc
, scriptSrcCharset
) )) {
412 cachedScript
.enqueue(cs
);
// External script requested: park the remaining input on pendingQueue and
// continue with an empty source.
416 pendingQueue
.push(src
);
// Remember the queue length so a change can be detected afterwards.
417 int scriptCount
= cachedScript
.count();
418 setSrc(TokenizerString());
419 scriptCodeSize
= scriptCodeResync
= 0;
// Queue length unchanged: the script is still outstanding, so mark it deferred.
421 if (cachedScript
.count() == scriptCount
)
422 deferredScript
= true;
// Inline script (no src URL) with a view and javascript set: execute it now.
424 else if (currentScriptSrc
.isEmpty() && view
&& javascript
) {
425 pendingQueue
.push(src
);
426 setSrc(TokenizerString());
427 scriptCodeSize
= scriptCodeResync
= 0;
428 scriptExecution( exScript
, QString(), tagStartLineno
/*scriptStartLineno*/ );
430 // script was filtered or disallowed
431 effectiveScript
= false;
436 scriptCodeSize
= scriptCodeResync
= 0;
438 if ( !effectiveScript
)
// Decide what to do with the parked input:
//  - nothing executing and no scripts queued: append it to the current source;
//  - no scripts queued but a script is executing: feed it through write();
//  - scripts queued, not deferred, several parked entries: merge the top two.
441 if ( !m_executingScript
&& cachedScript
.isEmpty() ) {
442 src
.append(pendingQueue
.pop());
443 } else if ( cachedScript
.isEmpty() ) {
444 write( pendingQueue
.pop(), false );
445 } else if ( !deferredScript
&& pendingQueue
.count() > 1) {
446 TokenizerString t
= pendingQueue
.pop();
447 pendingQueue
.top().prepend( t
);
// While scripts are still queued and none is executing, hand the parked input
// to the prospective tokenizer (created on first use).
449 #if PROSPECTIVE_TOKENIZER_ENABLED
450 if (!cachedScript
.isEmpty() && !m_executingScript
) {
451 if (!m_prospectiveTokenizer
)
452 m_prospectiveTokenizer
= new ProspectiveTokenizer(parser
->docPtr());
453 if (!m_prospectiveTokenizer
->inProgress() && !pendingQueue
.isEmpty()) {
454 m_prospectiveTokenizer
->begin();
455 m_prospectiveTokenizer
->write(pendingQueue
.top());
// Executes script text through the part that owns the view.
//   str       - script source text
//   scriptURL - URL of the script; when it is null and a view exists, the URL
//               of the view's document is used instead
// The rest of the parameter list is cut off in this listing; 'baseLine' used
// below is presumably the remaining parameter (scriptHandler() passes
// tagStartLineno in that position) - confirm against the header.
462 void HTMLTokenizer::scriptExecution( const QString
& str
, const QString
& scriptURL
,
// Remember the current value of 'script'; its later use is not visible here.
465 bool oldscript
= script
;
469 if (scriptURL
.isNull() && view
)
470 url
= static_cast<DocumentImpl
*>(view
->part()->document().handle())->URL().url();
// The line number handed to the part is baseLine + 1.
475 view
->part()->executeScript(url
,baseLine
+1,Node(),str
);
// Consumes comment text from 'src', copying every character into scriptCode
// until the end of the comment is recognised.
//   src - input stream; read and (in lines not shown here) advanced.
// Parts of the body are missing from this listing; comments describe only
// the statements that are visible.
480 void HTMLTokenizer::parseComment(TokenizerString
&src
)
// Strict handling applies only in strict mode, outside XHTML, and outside
// script and style content.
483 bool strict
= parser
->doc()->inStrictMode() && parser
->doc()->htmlMode() != DocumentImpl::XHtml
&& !script
&& !style
;
484 int delimiterCount
= 0;
485 bool canClose
= false;
// Called once up front with the current input length, before the copy loop.
487 checkScriptBuffer(src
.length());
488 while ( src
.length() ) {
489 scriptCode
[ scriptCodeSize
++ ] = *src
;
491 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
492 qDebug("comment is now: *%s*", src
.toString().left(16).toLatin1().constData());
// Dashes are counted; at a count of two, canClose is toggled.
497 if (src
->unicode() == '-') {
499 if (delimiterCount
== 2) {
501 canClose
= !canClose
;
// A '>' is only examined when not strict, or when canClose is set.
508 if ((!strict
|| canClose
) && src
->unicode() == '>')
510 bool handleBrokenComments
= brokenComments
&& !( script
|| style
);
// Note: this local shadows the file-level scriptEnd[] marker string.
511 bool scriptEnd
=false;
// Tests whether the two characters stored before the '>' are both '-'.
514 if ( scriptCodeSize
> 2 && scriptCode
[scriptCodeSize
-3] == '-' &&
515 scriptCode
[scriptCodeSize
-2] == '-' )
519 if (canClose
|| handleBrokenComments
|| scriptEnd
){
// Outside title/script/xmp/textarea/style the comment is emitted as tokens.
521 if ( !( title
|| script
|| xmp
|| textarea
|| style
) ) {
523 scriptCode
[ scriptCodeSize
] = 0;
524 scriptCode
[ scriptCodeSize
+ 1 ] = 0;
525 currToken
.tid
= ID_COMMENT
;
// The last two stored characters are left out of the listing.
526 processListing(TokenizerString(scriptCode
, scriptCodeSize
- 2));
528 currToken
.tid
= ID_COMMENT
+ ID_CLOSE_TAG
;
533 return; // Finished parsing comment
// Advances the doctypeComment state machine over the input in 'src'.
//   src - input stream being scanned.
// The conditions that pick between the assignments inside each case are
// missing from this listing, so only the visible state changes are shown:
//   DoctypeCommentHalfBegin -> DoctypeCommentBogus or DoctypeComment
//   DoctypeComment          -> DoctypeCommentHalfEnd (or stays)
//   DoctypeCommentHalfEnd   -> DoctypeCommentEnd or back to DoctypeComment
540 void HTMLTokenizer::parseDoctypeComment(TokenizerString
&src
)
542 while (!src
.isEmpty()) {
544 switch (doctypeComment
) {
545 case DoctypeCommentHalfBegin
: {
547 // Oops, this is not a comment after all
548 doctypeComment
= DoctypeCommentBogus
;
551 // Doctype comment begins
552 doctypeComment
= DoctypeComment
;
557 case DoctypeComment
: {
559 // Perhaps this is the end of the comment
560 doctypeComment
= DoctypeCommentHalfEnd
;
563 // Keep scanning for '--'
568 case DoctypeCommentHalfEnd
: {
570 // Doctype comment ends
571 doctypeComment
= DoctypeCommentEnd
;
// Otherwise the state returns to DoctypeComment.
576 doctypeComment
= DoctypeComment
;
// This assertion always fails when reached; presumably it sits in the default
// branch - confirm against the full source.
581 assert(!"Undefined doctype comment state");
588 void HTMLTokenizer::parseDoctype(TokenizerString
&src
)
590 while (!src
.isEmpty() && doctype
) {
592 bool isWhitespace
= false;
594 if (doctypeComment
== DoctypeCommentEnd
) {
595 doctypeComment
= NoDoctypeComment
;
597 } else if (doctypeComment
== DoctypeCommentBogus
) {
598 doctypeComment
= NoDoctypeComment
;
603 if (doctypeAllowComment
) {
604 if (!doctypeComment
&& c
== '-') {
605 doctypeComment
= DoctypeCommentHalfBegin
;
608 if (doctypeComment
) {
609 parseDoctypeComment(src
);
612 isWhitespace
= c
== '\r' || c
== '\n' || c
== '\t' || c
== ' ';
616 switch (doctypeToken
.state
) {
618 doctypeToken
.state
= DoctypeBeforeName
;
624 case DoctypeBeforeName
: {
626 // Malformed. Just exit.
628 } else if (isWhitespace
) {
632 doctypeToken
.state
= DoctypeName
;
638 // Valid doctype. Emit it.
640 processDoctypeToken();
641 } else if (isWhitespace
) {
642 doctypeSearchCount
= 0; // Used now to scan for PUBLIC
643 doctypeSecondarySearchCount
= 0; // Used now to scan for SYSTEM
644 doctypeToken
.state
= DoctypeAfterName
;
646 doctypeToken
.name
.append(c
);
650 case DoctypeAfterName
: {
652 // Valid doctype. Emit it.
654 processDoctypeToken();
655 } else if (c
== '[') {
656 if(doctypeSearchCount
> 0 || doctypeSecondarySearchCount
> 0) { // is there any public/system indicator before?
657 doctypeSearchCount
= doctypeSecondarySearchCount
= 0;
658 doctypeToken
.state
= DoctypeBogus
;
660 // Found internal subset
661 doctypeToken
.state
= DoctypeInternalSubset
;
662 doctypeAllowComment
= false;
663 } else if (!isWhitespace
) {
664 if (c
.toLower() == publicStart
[doctypeSearchCount
]) {
665 doctypeSearchCount
++;
666 if(doctypeSearchCount
== 6)
667 // Found 'PUBLIC' sequence
668 doctypeToken
.state
= DoctypeBeforePublicID
;
669 } else if (doctypeSearchCount
> 0) {
670 doctypeSearchCount
= 0;
671 doctypeToken
.state
= DoctypeBogus
;
672 } else if (c
.toLower() == systemStart
[doctypeSecondarySearchCount
]) {
673 doctypeSecondarySearchCount
++;
674 if(doctypeSecondarySearchCount
== 6)
675 // Found 'SYSTEM' sequence
676 doctypeToken
.state
= DoctypeBeforeSystemID
;
678 doctypeSecondarySearchCount
= 0;
679 doctypeToken
.state
= DoctypeBogus
;
682 // Whitespace keeps us in the after name state
686 case DoctypeBeforePublicID
: {
687 if (c
== '\"' || c
== '\'') {
688 tquote
= c
== '\"' ? DoubleQuote
: SingleQuote
;
689 doctypeToken
.state
= DoctypePublicID
;
690 doctypeAllowComment
= false;
691 } else if (c
== '>') {
692 // Considered bogus. Don't process the doctype.
694 } else if (isWhitespace
) {
697 doctypeToken
.state
= DoctypeBogus
;
700 case DoctypePublicID
: {
701 if ((c
== '\"' && tquote
== DoubleQuote
) || (c
== '\'' && tquote
== SingleQuote
)) {
702 doctypeToken
.state
= DoctypeAfterPublicID
;
703 doctypeAllowComment
= true;
704 } else if (c
== '>') {
705 // Considered bogus. Don't process the doctype.
708 doctypeToken
.publicID
.append(c
);
712 case DoctypeAfterPublicID
: {
713 if (c
== '\"' || c
== '\'') {
714 tquote
= c
== '\"' ? DoubleQuote
: SingleQuote
;
715 doctypeToken
.state
= DoctypeSystemID
;
716 } else if (c
== '>') {
717 // Valid doctype. Emit it now.
719 processDoctypeToken();
720 } else if (isWhitespace
) {
722 } else if (c
== '[') {
723 // Found internal subset
724 doctypeToken
.state
= DoctypeInternalSubset
;
725 doctypeAllowComment
= false;
727 doctypeToken
.state
= DoctypeBogus
;
730 case DoctypeBeforeSystemID
: {
731 if (c
== '\"' || c
== '\'') {
732 tquote
= c
== '\"' ? DoubleQuote
: SingleQuote
;
733 doctypeToken
.state
= DoctypeSystemID
;
734 doctypeAllowComment
= false;
735 } else if (c
== '>') {
736 // Considered bogus. Don't process the doctype.
738 } else if (isWhitespace
) {
741 doctypeToken
.state
= DoctypeBogus
;
744 case DoctypeSystemID
: {
745 if ((c
== '\"' && tquote
== DoubleQuote
) || (c
== '\'' && tquote
== SingleQuote
)) {
746 doctypeToken
.state
= DoctypeAfterSystemID
;
747 doctypeAllowComment
= true;
748 } else if (c
== '>') {
749 // Considered bogus. Don't process the doctype.
752 doctypeToken
.systemID
.append(c
);
756 case DoctypeAfterSystemID
: {
758 // Valid doctype. Emit it now.
760 processDoctypeToken();
761 } else if (isWhitespace
) {
763 } else if (c
== '[') {
764 // Found internal subset
765 doctypeToken
.state
= DoctypeInternalSubset
;
766 doctypeAllowComment
= false;
768 doctypeToken
.state
= DoctypeBogus
;
772 case DoctypeInternalSubset
: {
775 doctypeToken
.state
= DoctypeAfterInternalSubset
;
776 doctypeAllowComment
= true;
778 doctypeToken
.internalSubset
.append(c
);
782 case DoctypeAfterInternalSubset
: {
784 // Valid doctype. Emit it now.
786 processDoctypeToken();
787 } else if (isWhitespace
) {
790 doctypeToken
.state
= DoctypeBogus
;
795 // Done with the bogus doctype.
798 // Just keep scanning for '>'
807 else if (dontAdvance
== 1)
809 else // double dontAdvance++, do workaround
810 doctypeComment
= DoctypeCommentBogus
;
// Consumes a server-side include from 'src', copying each character into
// scriptCode until a '>' that directly follows a '%' has been stored.
//   src - input stream; read and (in lines not shown here) advanced.
// Parts of the body are missing from this listing.
814 void HTMLTokenizer::parseServer(TokenizerString
&src
)
// Called once up front with the current input length, before the copy loop.
816 checkScriptBuffer(src
.length());
817 while ( !src
.isEmpty() ) {
818 scriptCode
[ scriptCodeSize
++ ] = *src
;
// The character just stored is '>' and the one stored before it is '%'.
819 if (src
->unicode() == '>' &&
820 scriptCodeSize
> 1 && scriptCode
[scriptCodeSize
-2] == '%') {
824 return; // Finished parsing server include
// Scans a processing instruction in 'src', tracking quote state in tquote,
// until a terminating '>' is found.
//   src - input stream; read and (in lines not shown here) advanced.
// Parts of the body are missing from this listing.
830 void HTMLTokenizer::parseProcessingInstruction(TokenizerString
&src
)
833 while ( !src
.isEmpty() )
835 unsigned char chbegin
= src
->toLatin1();
// A single quote closes an open single-quoted section; otherwise tquote
// becomes SingleQuote.
836 if(chbegin
== '\'') {
837 tquote
= tquote
== SingleQuote
? NoQuote
: SingleQuote
;
// Same handling for double quotes.
839 else if(chbegin
== '\"') {
840 tquote
= tquote
== DoubleQuote
? NoQuote
: DoubleQuote
;
843 // some crappy sites omit the "?" before it, so
844 // we look for an unquoted '>' instead. (IE compatible)
// '>' ends the instruction when outside quotes, or when oldchar is '?'.
845 else if ( chbegin
== '>' && ( !tquote
|| oldchar
== '?' ) )
847 // We got a '?>' sequence
848 processingInstruction
= false;
851 return; // Finished parsing the processing instruction
// Processes plain text from 'src' one character at a time.
//   src - input stream being consumed.
// Most of the body is missing from this listing; the visible lines show that
// each character is reduced to Latin-1 and that line breaks, as well as a
// pending skipLF, are handled in branches of their own.
858 void HTMLTokenizer::parseText(TokenizerString
&src
)
860 while ( !src
.isEmpty() )
862 // do we need to enlarge the buffer?
865 // ascii is okay because we only do ascii comparisons
866 unsigned char chbegin
= src
->toLatin1();
// A pending skipLF followed by a character other than '\n'.
868 if (skipLF
&& ( chbegin
!= '\n' ))
// Line break: either '\n' or '\r'.
878 else if (( chbegin
== '\n' ) || ( chbegin
== '\r' ))
894 void HTMLTokenizer::parseEntity(TokenizerString
&src
, QChar
*&dest
, bool start
)
900 Entity
= SearchEntity
;
903 while( !src
.isEmpty() )
905 ushort cc
= src
->unicode();
913 cBuffer
[cBufferPos
++] = cc
;
915 Entity
= NumericSearch
;
923 if(cc
== 'x' || cc
== 'X') {
924 cBuffer
[cBufferPos
++] = cc
;
926 Entity
= Hexadecimal
;
928 else if(cc
>= '0' && cc
<= '9')
931 Entity
= SearchSemicolon
;
937 int uc
= EntityChar
.unicode();
938 int ll
= qMin
<uint
>(src
.length(), 8);
940 QChar
csrc(src
->toLower());
943 if(csrc
.row() || !((cc
>= '0' && cc
<= '9') || (cc
>= 'a' && cc
<= 'f'))) {
946 uc
= uc
*16 + (cc
- ( cc
< 'a' ? '0' : 'a' - 10));
947 cBuffer
[cBufferPos
++] = cc
;
950 EntityChar
= QChar(uc
);
951 Entity
= SearchSemicolon
;
956 int uc
= EntityChar
.unicode();
957 int ll
= qMin(src
.length(), 9-cBufferPos
);
961 if(src
->row() || !(cc
>= '0' && cc
<= '9')) {
962 Entity
= SearchSemicolon
;
966 uc
= uc
* 10 + (cc
- '0');
967 cBuffer
[cBufferPos
++] = cc
;
970 EntityChar
= QChar(uc
);
971 if(cBufferPos
== 9) Entity
= SearchSemicolon
;
976 int ll
= qMin(src
.length(), 9-cBufferPos
);
981 if(csrc
.row() || !((cc
>= 'a' && cc
<= 'z') ||
982 (cc
>= '0' && cc
<= '9') || (cc
>= 'A' && cc
<= 'Z'))) {
983 Entity
= SearchSemicolon
;
987 cBuffer
[cBufferPos
++] = cc
;
990 // be IE compatible and interpret even unterminated entities
991 // outside tags, like "foo &nbspstuff bla".
992 if ( tag
== NoTag
) {
993 const entity
* e
= kde_findEntity(cBuffer
, cBufferPos
);
994 if ( e
&& e
->code
< 256 ) {
995 EntityChar
= e
->code
;
996 entityLen
= cBufferPos
;
1000 if(cBufferPos
== 9) Entity
= SearchSemicolon
;
1001 if(Entity
== SearchSemicolon
) {
1002 if(cBufferPos
> 1) {
1003 const entity
*e
= kde_findEntity(cBuffer
, cBufferPos
);
1004 // IE only accepts unterminated entities < 256,
1005 // Gecko accepts them all, but only outside tags
1006 if(e
&& ( tag
== NoTag
|| e
->code
< 256 || *src
== ';' )) {
1007 EntityChar
= e
->code
;
1008 entityLen
= cBufferPos
;
1014 case SearchSemicolon
:
1016 kDebug( 6036 ) << "ENTITY " << EntityChar
.unicode();
1018 fixUpChar(EntityChar
);
1023 if ( !EntityChar
.isNull() ) {
1025 if (entityLen
> 0 && entityLen
< cBufferPos
) {
1026 int rem
= cBufferPos
- entityLen
;
1027 src
.prepend( TokenizerString(QString::fromAscii(cBuffer
+entityLen
, rem
)) );
1029 src
.push( EntityChar
);
1032 kDebug( 6036 ) << "unknown entity!";
1035 // ignore the sequence, add it to the buffer as plaintext
1037 for(unsigned int i
= 0; i
< cBufferPos
; i
++)
1038 dest
[i
] = cBuffer
[i
];
1041 prePos
+= cBufferPos
+1;
1045 EntityChar
= QChar::Null
;
1051 void HTMLTokenizer::parseTag(TokenizerString
&src
)
1054 checkScriptBuffer( src
.length() );
1056 while ( !src
.isEmpty() )
1059 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1061 while(l
< src
.length() && (src
.toString()[l
]).toLatin1().constData() != '>')
1063 qDebug("src is now: *%s*, tquote: %d",
1064 src
.toString().left(l
).toLatin1().constData(), tquote
);
1071 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1074 if (searchCount
> 0)
1076 if (*src
== commentStart
[searchCount
])
1079 if (searchCount
== 2)
1080 doctypeSearchCount
++; // A '!' is also part of doctype, so we are moving through that still as well
1082 doctypeSearchCount
= 0;
1084 if (searchCount
== 4)
1087 kDebug( 6036 ) << "Found comment";
1089 // Found '<!--' sequence
1091 dest
= buffer
; // ignore the previous part of this tag
1096 return; // Finished parsing tag!
1098 // cuts off the high part, which is okay
1099 cBuffer
[cBufferPos
++] = src
->cell();
1104 searchCount
= 0; // Stop looking for '<!--' sequence
1107 if (doctypeSearchCount
> 0) {
1108 if((*src
).toLower() == doctypeStart
[doctypeSearchCount
]) {
1109 doctypeSearchCount
++;
1110 cBuffer
[cBufferPos
++] = src
->cell();
1112 if(doctypeSearchCount
== 9) {
1113 // Found '<!DOCTYPE' sequence
1115 doctypeAllowComment
= true;
1116 doctypeComment
= NoDoctypeComment
;
1117 doctypeToken
.reset();
1125 doctypeSearchCount
= 0; // Stop looking for '<!DOCTYPE' sequence
1128 bool finish
= false;
1129 unsigned int ll
= qMin(src
.length(), CBUFLEN
-cBufferPos
);
1131 ushort curchar
= src
->unicode();
1132 if(curchar
<= ' ' || curchar
== '>' ) {
1136 // this is a nasty performance trick. will work for the A-Z
1137 // characters, but not for others. if it contains one,
1140 cBuffer
[cBufferPos
++] = cc
| 0x20;
1144 // Disadvantage: we add the possible rest of the tag
1145 // as attribute names. ### judge if this causes problems
1146 if(finish
|| CBUFLEN
== cBufferPos
) {
1148 char* ptr
= cBuffer
;
1149 unsigned int len
= cBufferPos
;
1150 cBuffer
[cBufferPos
] = '\0';
1151 if ((cBufferPos
> 0) && (*ptr
== '/'))
1161 // Accept empty xml tags like <br/>
1162 if(len
> 1 && ptr
[len
-1] == '/' ) {
1164 // if it's like <br/> and not like <input/ value=foo>, take it as flat
1166 currToken
.flat
= true;
1169 uint tagID
= khtml::getTagID(ptr
, len
);
1171 DOMString
tagName(ptr
);
1172 DocumentImpl
*doc
= parser
->docPtr();
1173 if (Element::khtmlValidQualifiedName(tagName
))
1174 tagID
= doc
->getId(NodeImpl::ElementId
, tagName
.implementation(), false, false);
1176 QByteArray
tmp(ptr
, len
+1);
1177 kDebug( 6036 ) << "Unknown tag: \"" << tmp
.data() << "\"";
1182 QByteArray
tmp(ptr
, len
+1);
1183 kDebug( 6036 ) << "found tag id=" << tagID
<< ": " << tmp
.data();
1185 currToken
.tid
= beginTag
? tagID
: tagID
+ ID_CLOSE_TAG
;
1188 tag
= SearchAttribute
;
1193 case SearchAttribute
:
1195 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1196 qDebug("SearchAttribute");
1198 bool atespace
= false;
1200 while(!src
.isEmpty()) {
1201 curchar
= src
->unicode();
1203 if(curchar
== '<' || curchar
== '>')
1205 else if(atespace
&& (curchar
== '\'' || curchar
== '"'))
1212 tag
= AttributeName
;
1224 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1225 qDebug("AttributeName");
1228 int ll
= qMin(src
.length(), CBUFLEN
-cBufferPos
);
1231 curchar
= src
->unicode();
1232 if(curchar
<= '>') {
1233 if(curchar
<= ' ' || curchar
== '=' || curchar
== '>') {
1235 cBuffer
[cBufferPos
] = '\0';
1236 a
= khtml::getAttrID(cBuffer
, cBufferPos
);
1239 // did we just get "/>" or e.g. "checked/>"?
1240 if (curchar
== '>' && cBufferPos
>=1 && cBuffer
[cBufferPos
-1] == '/') {
1241 currToken
.flat
= true;
1243 a
= khtml::getAttrID(cBuffer
, cBufferPos
-1);
1246 attrName
= QLatin1String(QByteArray(cBuffer
, cBufferPos
+1).data());
1252 if (!a
|| (cBufferPos
&& *cBuffer
== '!'))
1253 kDebug( 6036 ) << "Unknown attribute: *" << QByteArray(cBuffer
, cBufferPos
+1).data() << "*";
1255 kDebug( 6036 ) << "Known attribute: " << QByteArray(cBuffer
, cBufferPos
+1).data();
1262 cBuffer
[cBufferPos
++] =
1263 ( curchar
>= 'A' && curchar
<= 'Z' ) ? curchar
| 0x20 : curchar
;
1266 if ( cBufferPos
== CBUFLEN
) {
1267 cBuffer
[cBufferPos
] = '\0';
1268 attrName
= QLatin1String(QByteArray(cBuffer
, cBufferPos
+1).data());
1277 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1278 qDebug("SearchEqual");
1281 bool atespace
= false;
1282 while(!src
.isEmpty()) {
1283 curchar
= src
->unicode();
1285 if(curchar
== '=') {
1287 kDebug(6036) << "found equal";
1292 else if(atespace
&& (curchar
== '\'' || curchar
== '"'))
1300 currToken
.addAttribute(parser
->docPtr(), buffer
, attrName
, v
);
1302 tag
= SearchAttribute
;
1314 while(!src
.isEmpty()) {
1315 curchar
= src
->unicode();
1317 if(( curchar
== '\'' || curchar
== '\"' )) {
1318 tquote
= curchar
== '\"' ? DoubleQuote
: SingleQuote
;
1332 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1333 qDebug("QuotedValue");
1336 while(!src
.isEmpty()) {
1339 curchar
= src
->unicode();
1340 if(curchar
<= '\'' && !src
.escaped()) {
1341 // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
1342 if ( curchar
== '&' )
1345 parseEntity(src
, dest
, true);
1348 else if ( (tquote
== SingleQuote
&& curchar
== '\'') ||
1349 (tquote
== DoubleQuote
&& curchar
== '\"') )
1351 // some <input type=hidden> rely on trailing spaces. argh
1352 while(dest
> buffer
+1 && (*(dest
-1) == '\n' || *(dest
-1) == '\r'))
1353 dest
--; // remove trailing newlines
1354 DOMString
v(buffer
+1, dest
-buffer
-1);
1355 currToken
.addAttribute(parser
->docPtr(), buffer
, attrName
, v
);
1358 tag
= SearchAttribute
;
1371 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1375 while(!src
.isEmpty()) {
1377 curchar
= src
->unicode();
1378 if(curchar
<= '>' && !src
.escaped()) {
1380 if ( curchar
== '&' )
1383 parseEntity(src
, dest
, true);
1386 // no quotes. Every space means end of value
1387 // '/' does not delimit in IE!
1388 if ( curchar
<= ' ' || curchar
== '>' )
1390 DOMString
v(buffer
+1, dest
-buffer
-1);
1391 currToken
.addAttribute(parser
->docPtr(), buffer
, attrName
, v
);
1393 tag
= SearchAttribute
;
1405 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1406 qDebug("SearchEnd");
1408 while(!src
.isEmpty()) {
1409 if(*src
== '<' || *src
== '>')
1413 currToken
.flat
= true;
1417 if(src
.isEmpty() && *src
!= '<' && *src
!= '>') break;
1419 searchCount
= 0; // Stop looking for '<!--' sequence
1425 if ( !currToken
.tid
) //stop if tag is unknown
1428 uint tagID
= currToken
.tid
;
1429 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
1430 kDebug( 6036 ) << "appending Tag: " << tagID
;
1432 // If the tag requires an end tag it cannot be flat,
1433 // unless we are using the HTML parser to parse XHTML
1434 // The only exception is SCRIPT and priority 0 tokens.
1435 if (tagID
< ID_CLOSE_TAG
&& tagID
!= ID_SCRIPT
&&
1436 DOM::endTagRequirement(tagID
) == DOM::REQUIRED
&&
1437 parser
->doc()->htmlMode() != DocumentImpl::XHtml
)
1438 currToken
.flat
= false;
1440 bool beginTag
= !currToken
.flat
&& (tagID
< ID_CLOSE_TAG
);
1442 if(tagID
>= ID_CLOSE_TAG
)
1443 tagID
-= ID_CLOSE_TAG
;
1444 else if ( !brokenScript
&& tagID
== ID_SCRIPT
) {
1445 DOMStringImpl
* a
= 0;
1446 bool foundTypeAttribute
= false;
1447 scriptSrc
= scriptSrcCharset
= QString();
1448 if ( currToken
.attrs
&& /* potentially have a ATTR_SRC ? */
1449 view
&& /* are we a regular tokenizer or just for innerHTML ? */
1450 parser
->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
1452 if ( ( a
= currToken
.attrs
->getValue( ATTR_SRC
) ) )
1453 scriptSrc
= parser
->doc()->completeURL(khtml::parseURL( DOMString(a
) ).string() );
1454 if ( ( a
= currToken
.attrs
->getValue( ATTR_CHARSET
) ) )
1455 scriptSrcCharset
= DOMString(a
).string().trimmed();
1456 if ( scriptSrcCharset
.isEmpty() && view
)
1457 scriptSrcCharset
= parser
->doc()->view()->part()->encoding();
1458 /* Check type before language, since language is deprecated */
1459 if ((a
= currToken
.attrs
->getValue(ATTR_TYPE
)) != 0 && !DOMString(a
).string().isEmpty())
1460 foundTypeAttribute
= true;
1462 a
= currToken
.attrs
->getValue(ATTR_LANGUAGE
);
1466 if( foundTypeAttribute
) {
1468 Mozilla 1.5 doesn't accept the text/javascript1.x formats, but WinIE 6 does.
1469 Mozilla 1.5 doesn't accept text/jscript, text/ecmascript, and text/livescript, but WinIE 6 does.
1470 Mozilla 1.5 accepts application/x-javascript, WinIE 6 doesn't.
1471 Mozilla 1.5 allows leading and trailing whitespace, but WinIE 6 doesn't.
1472 Mozilla 1.5 and WinIE 6 both accept the empty string, but neither accept a whitespace-only string.
1473 We want to accept all the values that either of these browsers accept, but not other values.
1475 QString type
= DOMString(a
).string().trimmed().toLower();
1476 if( type
.compare("text/javascript") != 0 &&
1477 type
.compare("text/javascript1.0") != 0 &&
1478 type
.compare("text/javascript1.1") != 0 &&
1479 type
.compare("text/javascript1.2") != 0 &&
1480 type
.compare("text/javascript1.3") != 0 &&
1481 type
.compare("text/javascript1.4") != 0 &&
1482 type
.compare("text/javascript1.5") != 0 &&
1483 type
.compare("text/jscript") != 0 &&
1484 type
.compare("text/ecmascript") != 0 &&
1485 type
.compare("text/livescript") != 0 &&
1486 type
.compare("application/x-javascript") != 0 &&
1487 type
.compare("application/x-ecmascript") != 0 &&
1488 type
.compare("application/javascript") != 0 &&
1489 type
.compare("application/ecmascript") != 0 )
1493 Mozilla 1.5 doesn't accept jscript or ecmascript, but WinIE 6 does.
1494 Mozilla 1.5 accepts javascript1.0, javascript1.4, and javascript1.5, but WinIE 6 accepts only 1.1 - 1.3.
1495 Neither Mozilla 1.5 nor WinIE 6 accept leading or trailing whitespace.
1496 We want to accept all the values that either of these browsers accept, but not other values.
1498 QString lang
= DOMString(a
).string();
1499 lang
= lang
.toLower();
1500 if( lang
.compare("") != 0 &&
1501 lang
.compare("javascript") != 0 &&
1502 lang
.compare("javascript1.0") != 0 &&
1503 lang
.compare("javascript1.1") != 0 &&
1504 lang
.compare("javascript1.2") != 0 &&
1505 lang
.compare("javascript1.3") != 0 &&
1506 lang
.compare("javascript1.4") != 0 &&
1507 lang
.compare("javascript1.5") != 0 &&
1508 lang
.compare("ecmascript") != 0 &&
1509 lang
.compare("livescript") != 0 &&
1510 lang
.compare("jscript") )
1517 if ( parser
->selectMode() && beginTag
)
1518 discard
= AllDiscard
;
1524 discard
= LFDiscard
;
1532 searchStopper
= scriptEnd
;
1533 searchStopperLen
= 8;
1537 else if (tagID
< ID_CLOSE_TAG
) // Handle <script src="foo"/>
1542 searchStopper
= styleEnd
;
1543 searchStopperLen
= 7;
1550 searchStopper
= textareaEnd
;
1551 searchStopperLen
= 10;
1553 discard
= NoneDiscard
;
1559 searchStopper
= titleEnd
;
1560 searchStopperLen
= 7;
1567 searchStopper
= xmpEnd
;
1568 searchStopperLen
= 5;
1577 plaintext
= beginTag
;
1580 return; // Finished parsing tag!
// Writes the whitespace remembered in `pending` (line feed, space or tab) to
// the output position `dest`, then resets `pending` to NonePending.
// NOTE(review): this listing omits lines of the original function (the
// embedded line numbers jump), so which branch each statement belongs to is
// not fully visible here.
1587 void HTMLTokenizer::addPending()
1589 if ( select
&& !(comment
|| script
))
1593 else if ( textarea
)
// These cases copy the pending character verbatim and keep prePos in step:
// a line feed resets it to 0, a space adds one, and a tab advances it to the
// next multiple of TAB_SIZE.
1596 case LFPending
: *dest
++ = QLatin1Char('\n'); prePos
= 0; break;
1597 case SpacePending
: *dest
++ = QLatin1Char(' '); ++prePos
; break;
1598 case TabPending
: *dest
++ = QLatin1Char('\t'); prePos
+= TAB_SIZE
- (prePos
% TAB_SIZE
); break;
1610 // Insert a breaking space
1611 *dest
++ = QLatin1Char(' ');
1616 *dest
= QLatin1Char('\n');
// Here a tab is expanded into p spaces, where p is the distance from prePos
// to the next multiple of TAB_SIZE.
1622 p
= TAB_SIZE
- ( prePos
% TAB_SIZE
);
1623 for ( int x
= 0; x
< p
; x
++ )
1624 *dest
++ = QLatin1Char(' ');
// Whatever was pending has been handled; clear the marker.
1634 pending
= NonePending
;
// Accepts a chunk of input for tokenizing.
//   str        - the text handed to the tokenizer.
//   appendData - in the visible code this selects how deferred text is queued
//                and whether the prospective tokenizer is fed.
// If a script is executing (with appendData set) or cachedScript still holds
// entries, str is stored in pendingQueue instead of being parsed. The main
// loop runs while src is non-empty and dispatches on the tokenizer's state
// flags and on the current character cc.
// NOTE(review): this listing omits lines of the original function (the
// embedded line numbers jump), so early returns and block boundaries are not
// all visible here.
1637 void HTMLTokenizer::write( const TokenizerString
&str
, bool appendData
)
1640 kDebug( 6036 ) << this << " Tokenizer::write(\"" << str
.toString() << "\"," << appendData
<< ")";
// Deferral path: taken while a script runs (and appendData is set) or while
// cachedScript is non-empty.
1646 if ( ( m_executingScript
&& appendData
) || cachedScript
.count() ) {
1647 // don't parse; we will do this later
// An empty queue gets str pushed as its first entry; otherwise str is
// appended to the bottom entry when appendData is set. The append to the top
// entry at 1653 is presumably the `else` case (line 1652 is not shown) -
// confirm against the full source.
1648 if (pendingQueue
.isEmpty())
1649 pendingQueue
.push(str
);
1650 else if (appendData
)
1651 pendingQueue
.bottom().append(str
);
1653 pendingQueue
.top().append(str
);
1654 #if PROSPECTIVE_TOKENIZER_ENABLED
// Appended data is also passed to the prospective tokenizer while it is in
// progress.
1655 if (m_prospectiveTokenizer
&& m_prospectiveTokenizer
->inProgress() && appendData
)
1656 m_prospectiveTokenizer
->write(str
);
1673 // parseEntity(src, dest);
1675 while ( !src
.isEmpty() )
1679 // do we need to enlarge the buffer?
// cc is the numeric code of the character src currently points at.
1682 ushort cc
= src
->unicode();
1684 if (skipLF
&& (cc
!= '\n'))
1692 parseEntity( src
, dest
);
1693 else if ( plaintext
)
1707 else if (doctypeComment
&& doctypeComment
!= DoctypeCommentEnd
&& doctypeComment
!= DoctypeCommentBogus
)
1708 parseDoctypeComment(src
);
1713 else if (processingInstruction
)
1714 parseProcessingInstruction(src
);
1717 else if ( startTag
)
1720 bool endTag
= false;
1728 // <!-- comment --> or <!DOCTYPE ...>
1729 searchCount
= 1; // Look for '<!--' sequence to start comment...
1730 doctypeSearchCount
= 1; // ... or for '<!DOCTYPE' sequence to start doctype
1735 // xml processing instruction
1736 processingInstruction
= true;
1738 parseProcessingInstruction(src
);
1744 if (!brokenServer
) {
1745 // <% server stuff, handle as comment %>
1751 // else fall through
// An ASCII letter right after '<' begins a tag name.
1754 if( ((cc
>= 'a') && (cc
<= 'z')) || ((cc
>= 'A') && (cc
<= 'Z')))
1756 // Start of a Start-Tag
1771 // According to SGML any LF immediately after a starttag, or
1772 // immediately before an endtag should be ignored.
1773 // ### Gecko and MSIE though only ignores LF immediately after
1774 // starttags and only for PRE elements -- asj (28/06-2005)
1779 pending
= NonePending
;
1781 // Cancel unused discards
1782 discard
= NoneDiscard
;
1783 // if (!endTag) discard = LFDiscard;
// An unescaped '&' starts an entity.
1791 else if ( cc
== '&' && !src
.escaped())
1796 discard
= NoneDiscard
;
1797 parseEntity(src
, dest
, true);
// An unescaped '<' starts markup; record the line it begins on.
1799 else if ( cc
== '<' && !src
.escaped())
1801 tagStartLineno
= lineno
+src
.lineCount();
1803 discard
= NoneDiscard
;
// Line-break characters: the current discard mode is consulted first, and the
// visible assignments record the break as LFPending.
1806 else if (( cc
== '\n' ) || ( cc
== '\r' ))
1808 if (discard
== SpaceDiscard
)
1809 discard
= NoneDiscard
;
1811 if (discard
== LFDiscard
) {
1813 discard
= NoneDiscard
;
1815 else if (discard
== AllDiscard
)
1821 if (select
&& !script
) {
1822 pending
= LFPending
;
1826 pending
= LFPending
;
1830 /* Check for MS-DOS CRLF sequence */
// Space and tab: the current discard mode is consulted first, and the visible
// assignments record the character as SpacePending or TabPending.
1837 else if (( cc
== ' ' ) || ( cc
== '\t' ))
1839 if(discard
== LFDiscard
)
1840 discard
= NoneDiscard
;
1842 if(discard
== SpaceDiscard
) {
1844 discard
= NoneDiscard
;
1846 else if(discard
== AllDiscard
)
1851 if (select
&& !script
) {
1853 pending
= SpacePending
;
1858 pending
= SpacePending
;
1860 pending
= TabPending
;
1871 discard
= NoneDiscard
;
// If no further data is expected, no external script is outstanding and no
// script is executing, finish up now.
1883 if (noMoreData
&& cachedScript
.isEmpty() && !m_executingScript
)
1884 end(); // this actually causes us to be deleted
// Timer callback. Reacts only when the event's timer id equals
// m_autoCloseTimer and cachedScript is empty.
// NOTE(review): the statements executed inside the condition are not shown in
// this listing.
1887 void HTMLTokenizer::timerEvent( QTimerEvent
*e
)
1889 if ( e
->timerId() == m_autoCloseTimer
&& cachedScript
.isEmpty() ) {
// Stops the current auto-close timer and clears its id; a new timer with a
// 100 ms interval can then be started.
// NOTE(review): the line guarding startTimer (1897) is not shown in this
// listing - presumably it tests b; confirm against the full source.
1894 void HTMLTokenizer::setAutoClose( bool b
) {
1895 killTimer( m_autoCloseTimer
);
1896 m_autoCloseTimer
= 0;
1898 m_autoCloseTimer
= startTimer(100);
// Ends tokenizing and announces it through the finishedParsing() signal.
// When buffer is already null only the signal is emitted in that branch;
// otherwise buffer and scriptCode are released and the script-buffer
// counters are zeroed before the signal is emitted.
// NOTE(review): lines between the visible statements are omitted in this
// listing (for example whatever follows the first emit).
1901 void HTMLTokenizer::end()
1903 if ( buffer
== 0 ) {
1904 emit
finishedParsing();
1908 // parseTag is using the buffer for different matters
1913 KHTML_DELETE_QCHAR_VEC(buffer
);
1916 KHTML_DELETE_QCHAR_VEC(scriptCode
);
1919 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
1921 emit
finishedParsing();
// Called when no more data will arrive (see the comment at 1966). Stops the
// auto-close timer, deals with text still buffered in scriptCode because a
// title, script, comment or server section was never closed, and finally
// calls end() unless external scripts are outstanding, a script is executing
// or the tokenizer is on hold.
// NOTE(review): this listing omits lines of the original function, so the
// conditions selecting among the broken* flags and the `food` assignments
// are not all visible.
1924 void HTMLTokenizer::finish()
1926 if ( m_autoCloseTimer
) {
1927 killTimer( m_autoCloseTimer
);
1928 m_autoCloseTimer
= 0;
1930 // do this as long as we don't find matching comment ends
1931 while((title
|| script
|| comment
|| server
) && scriptCode
&& scriptCodeSize
)
1933 // we've found an unmatched comment start
1935 brokenComments
= true;
1937 brokenServer
= true;
1939 brokenScript
= true;
// checkScriptBuffer() presumably guarantees room for the two terminating
// zero entries written next - confirm against its definition.
1941 checkScriptBuffer();
1942 scriptCode
[ scriptCodeSize
] = 0;
1943 scriptCode
[ scriptCodeSize
+ 1 ] = 0;
1946 if (title
|| style
|| script
)
1947 food
.setUnicode(scriptCode
, scriptCodeSize
);
1950 food
+= QString(scriptCode
, scriptCodeSize
);
// Keep only the text after the first '>' in the buffered data. If there is
// no '>', indexOf yields -1 and the whole buffer is copied.
1953 pos
= QString::fromRawData(scriptCode
, scriptCodeSize
).indexOf('>');
1954 food
.setUnicode(scriptCode
+pos
+1, scriptCodeSize
-pos
-1); // deep copy
1956 KHTML_DELETE_QCHAR_VEC(scriptCode
);
1958 scriptCodeSize
= scriptCodeMaxSize
= scriptCodeResync
= 0;
1962 comment
= title
= server
= script
= false;
1963 if ( !food
.isEmpty() )
1966 // this indicates we will not receive any more data... but if we are waiting on
1967 // an external script to load, we can't finish parsing until that is done
1969 if (cachedScript
.isEmpty() && !m_executingScript
&& !onHold
)
1970 end(); // this actually causes us to be deleted
// Hands the current token (currToken) to the parser. Text accumulated between
// buffer and dest is attached to the token first, and m_executingScript is
// raised around the parser call so re-entrant input is deferred.
// The script proxy, when the view provides one, is told which source line
// event handlers created by this token come from.
// NOTE(review): this listing omits lines of the original function (null
// checks on jsProxy, debug guards, resets), so not every guard is visible.
1973 void HTMLTokenizer::processToken()
1975 KJSProxy
*jsProxy
= view
? view
->part()->jScript() : 0L;
1977 jsProxy
->setEventHandlerLineno(tagStartLineno
+1);
1978 if ( dest
> buffer
)
1982 qDebug( "unexpected token id: %d, str: *%s*", currToken
.tid
,QString::fromRawData( buffer
, dest
-buffer
).toLatin1().constData() );
// Wrap the characters in [buffer, dest) in a new DOMStringImpl and take a
// reference on it; unless the token is a comment it becomes a text token.
1987 currToken
.text
= new DOMStringImpl( buffer
, dest
- buffer
);
1988 currToken
.text
->ref();
1989 if (currToken
.tid
!= ID_COMMENT
)
1990 currToken
.tid
= ID_TEXT
;
1992 else if(!currToken
.tid
) {
1995 jsProxy
->setEventHandlerLineno(lineno
+src
.lineCount()+1);
// Debug output: token name, text and attribute list.
2002 QString name
= QString( getTagName(currToken
.tid
) );
2005 text
= QString::fromRawData(currToken
.text
->s
, currToken
.text
->l
);
2007 kDebug( 6036 ) << "Token --> " << name
<< " id = " << currToken
.tid
;
2009 kDebug( 6036 ) << "Token is FLAT!";
2011 kDebug( 6036 ) << "text: \"" << text
<< "\"";
2012 unsigned long l
= currToken
.attrs
? currToken
.attrs
->length() : 0;
2014 kDebug( 6036 ) << "Attributes: " << l
;
2015 for (unsigned long i
= 0; i
< l
; ++i
) {
2016 NodeImpl::Id tid
= currToken
.attrs
->idAt(i
);
2017 DOMString value
= currToken
.attrs
->valueAt(i
);
2018 kDebug( 6036 ) << " " << tid
<< " " << parser
->doc()->getDocument()->getName(NodeImpl::AttributeId
, tid
).string()
2019 << "=\"" << value
.string() << "\"" << endl
;
2025 // In some cases, parseToken() can cause javascript code to be executed
2026 // (for example, when setting an attribute that causes an event handler
2027 // to be created). So we need to protect against re-entrancy into the parser
2028 m_executingScript
++;
2030 // pass the token over to the parser, the parser DOES NOT delete the token
2031 parser
->parseToken(&currToken
);
2033 m_executingScript
--;
// A flat non-text token cancels any pending discard unless the parser is in
// its noSpaces() state.
2035 if ( currToken
.flat
&& currToken
.tid
!= ID_TEXT
&& !parser
->noSpaces() )
2036 discard
= NoneDiscard
;
2040 jsProxy
->setEventHandlerLineno(1);
// Runs QString::simplified() over the doctype token's public and system
// identifiers, then passes the doctype token to the parser.
2043 void HTMLTokenizer::processDoctypeToken()
2045 // kDebug( 6036 ) << "Process DoctypeToken (name: " << doctypeToken.name << ", publicID: " << doctypeToken.publicID << ", systemID: " << doctypeToken.systemID;
2046 doctypeToken
.publicID
= doctypeToken
.publicID
.simplified();
2047 doctypeToken
.systemID
= doctypeToken
.systemID
.simplified();
2048 parser
->parseDoctypeToken(&doctypeToken
);
// Destructor: deletes the prospective tokenizer owned through
// m_prospectiveTokenizer.
// NOTE(review): other cleanup lines of the original destructor, if any, are
// not shown in this listing.
2052 HTMLTokenizer::~HTMLTokenizer()
2055 delete m_prospectiveTokenizer
;
// Grows the output buffer to max(size*2, size+len) entries.
//   len - minimum number of additional entries required.
// The write position dest is saved as an offset before the reallocation and
// re-pointed into the new buffer afterwards, because the buffer may move.
// NOTE(review): the line updating `size` is not shown in this listing
// (line 2067 is omitted) - confirm against the full source.
2060 void HTMLTokenizer::enlargeBuffer(int len
)
2062 int newsize
= qMax(size
*2, size
+len
);
2063 int oldoffs
= (dest
- buffer
);
2065 buffer
= KHTML_REALLOC_QCHAR_VEC(buffer
, newsize
);
2066 dest
= buffer
+ oldoffs
;
// Grows the script buffer to max(scriptCodeMaxSize*2, scriptCodeMaxSize+len)
// entries and records the new capacity in scriptCodeMaxSize.
//   len - minimum number of additional entries required.
2070 void HTMLTokenizer::enlargeScriptBuffer(int len
)
2072 int newsize
= qMax(scriptCodeMaxSize
*2, scriptCodeMaxSize
+len
);
2073 scriptCode
= KHTML_REALLOC_QCHAR_VEC(scriptCode
, newsize
);
2074 scriptCodeMaxSize
= newsize
;
// Notification that a cached object finished loading (the argument itself is
// unused). While the script at the head of cachedScript is loaded, it is
// dequeued and executed with its source text and URL. Afterwards the entries
// of pendingQueue are merged into one and written back to the tokenizer.
// Requires cachedScript to be non-empty on entry (asserted).
// NOTE(review): this listing omits lines of the original function (the
// declaration of `done` and the conditions around the pendingQueue handling),
// so not every guard is visible.
2077 void HTMLTokenizer::notifyFinished(CachedObject
* /*finishedObj*/)
2079 assert(!cachedScript
.isEmpty());
2081 while (!done
&& cachedScript
.head()->isLoaded()) {
2083 kDebug( 6036 ) << "Finished loading an external script";
2085 CachedScript
* cs
= cachedScript
.dequeue();
2086 DOMString scriptSource
= cs
->script();
2088 kDebug( 6036 ) << "External script is:" << endl
<< scriptSource
.string();
2090 setSrc(TokenizerString());
2092 // make sure we forget about the script before we execute the new one
2093 // infinite recursion might happen otherwise
2094 QString
cachedScriptUrl( cs
->url().string() );
2097 scriptExecution( scriptSource
.string(), cachedScriptUrl
);
// Stop once every queued external script has been handled.
2099 done
= cachedScript
.isEmpty();
2101 // 'script' is true when we are called synchronously from
2102 // scriptHandler(). In that case scriptHandler() will take care
2103 // of 'scriptOutput'.
// Collapse pendingQueue to a single entry: each popped entry is prepended to
// the new top, and the remaining entry is then written with appendData false.
2105 while (pendingQueue
.count() > 1) {
2106 TokenizerString t
= pendingQueue
.pop();
2107 pendingQueue
.top().prepend( t
);
2110 write(pendingQueue
.pop(), false);
2112 // we might be deleted at this point, do not
2113 // access any members.
// Returns true while cachedScript holds at least one entry (its count is
// converted to bool).
2118 bool HTMLTokenizer::isWaitingForScripts() const
2120 return cachedScript
.count();
// Returns true while the m_executingScript counter is above zero.
2123 bool HTMLTokenizer::isExecutingScript() const
2125 return (m_executingScript
> 0);
// Switches the tokenizer's input. The line count of the current src is added
// to lineno first, and the line counter of src is reset afterwards.
//   source - the new input.
// NOTE(review): the line that stores `source` in `src` is not shown in this
// listing (line 2131 is omitted) - confirm against the full source.
2128 void HTMLTokenizer::setSrc(const TokenizerString
& source
)
2130 lineno
+= src
.lineCount();
2132 src
.resetLineCount();
2135 void HTMLTokenizer::setOnHold(bool _onHold
)
2137 if (onHold
== _onHold
) return;