1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
// A tokenTest describes a single tokenizer test case: an HTML input and the
// expected token stream.
//
// NOTE(review): this declaration was damaged in extraction; the field names
// are restored from their uses below (tt.desc, tt.html, tt.golden) — confirm
// against the upstream x/net/html token_test.go.
type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}
// NOTE(review): this test table was badly damaged in extraction. Original
// source line numbers are fused into the text, most entries (the
// desc/html/golden triples) are missing, and HTML entities inside string
// literals have been decoded (e.g. `&lt;` became `<`, `&amp;` became `&`),
// so the surviving literals no longer match the originals byte-for-byte.
// Do not hand-repair individual entries: restore this table wholesale from
// the upstream golang.org/x/net/html token_test.go — TODO confirm revision.
25 var tokenTests
= []tokenTest
{
31 // A single text node. The tokenizer should not break text nodes on whitespace,
32 // nor should it normalize whitespace within a text node.
44 // A start, self-closing and end tag. The tokenizer does not care if the start
45 // and end tokens don't match; that is the job of the parser.
51 // Angle brackets that aren't a tag.
100 "a<<$<b>$>>c",
104 "if x<0 and y < 0 then x*y>0",
105 "if x<0 and y < 0 then x*y>0",
107 // EOF in a tag name.
133 // Some malformed tags that are missing a '>'.
177 `<p id="0" <="" p="">`,
184 // Raw text and RCDATA.
187 "<script><a></b></script>",
188 "<script>$<a></b>$</script>",
191 "unfinished script end tag",
193 "<script>$a</SCR",
196 "broken script end tag",
197 "<SCRIPT>a</SCR ipt>",
198 "<script>$a</SCR ipt>",
201 "EOF in script end tag",
203 "<script>$a</SCRipt",
207 "<SCRIPT>a</SCRiptx",
208 "<script>$a</SCRiptx",
211 "' ' completes script end tag",
212 "<SCRIPT>a</SCRipt ",
216 "'>' completes script end tag",
217 "<SCRIPT>a</SCRipt>",
218 "<script>$a$</script>",
221 "self-closing script end tag",
222 "<SCRIPT>a</SCRipt/>",
223 "<script>$a$</script>",
227 "<SCRIPT>a</SCRipt<script>",
228 "<script>$a</SCRipt<script>",
231 "script end tag after unfinished",
232 "<SCRIPT>a</SCRipt</script>",
233 "<script>$a</SCRipt$</script>",
236 "script/style mismatched tags",
238 "<script>$a</style>",
241 "style element with entity",
243 "<style>$&apos;",
247 "<textarea><div></textarea>",
248 "<textarea>$<div>$</textarea>",
251 "title with tag and entity",
252 "<title><b>K&R C</b></title>",
253 "<title>$<b>K&R C</b>$</title>",
262 "DOCTYPE with no space",
267 "DOCTYPE with two spaces",
272 "looks like DOCTYPE but isn't",
274 "<!--DOCUMENT html-->",
281 // XML processing instructions.
283 "XML processing instruction",
290 "abc<b><!-- skipme --></b>def",
291 "abc$<b>$<!-- skipme -->$</b>$def",
358 // An attribute with a backslash.
364 // Entities, tag name and attribute key lower-casing, and whitespace
365 // normalization within a tag.
368 "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>",
369 `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`,
371 // A nonexistent entity. Tokenizing and converting back to a string should
372 // escape the "&" to become "&".
375 `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
376 `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
379 "entity without semicolon",
380 `¬it;∉<a b="q=z&=5¬ice=hello¬=world">`,
381 `¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`,
384 "entity with digits",
389 // http://dev.w3.org/html5/spec/Overview.html#attributes-0
392 `<input disabled FOO>`,
393 `<input disabled="" foo="">`,
396 "Empty attribute, whitespace",
397 `<input disabled FOO >`,
398 `<input disabled="" foo="">`,
401 "Unquoted attribute value",
402 `<input value=yes FOO=BAR>`,
403 `<input value="yes" foo="BAR">`,
406 "Unquoted attribute value, spaces",
407 `<input value = yes FOO = BAR>`,
408 `<input value="yes" foo="BAR">`,
411 "Unquoted attribute value, trailing space",
412 `<input value=yes FOO=BAR >`,
413 `<input value="yes" foo="BAR">`,
416 "Single-quoted attribute value",
417 `<input value='yes' FOO='BAR'>`,
418 `<input value="yes" foo="BAR">`,
421 "Single-quoted attribute value, trailing space",
422 `<input value='yes' FOO='BAR' >`,
423 `<input value="yes" foo="BAR">`,
426 "Double-quoted attribute value",
427 `<input value="I'm an attribute" FOO="BAR">`,
428 `<input value="I'm an attribute" foo="BAR">`,
431 "Attribute name characters",
432 `<meta http-equiv="content-type">`,
433 `<meta http-equiv="content-type">`,
437 `a<P V="0 1" w='2' X=3 y>z`,
438 `a$<p v="0 1" w="2" x="3" y="">$z`,
441 "Attributes with a solitary single quote",
442 `<p id=can't><p id=won't>`,
443 `<p id="can't">$<p id="won't">`,
447 func TestTokenizer(t
*testing
.T
) {
449 for _
, tt
:= range tokenTests
{
450 z
:= NewTokenizer(strings
.NewReader(tt
.html
))
452 for i
, s
:= range strings
.Split(tt
.golden
, "$") {
453 if z
.Next() == ErrorToken
{
454 t
.Errorf("%s token %d: want %q got error %v", tt
.desc
, i
, s
, z
.Err())
457 actual
:= z
.Token().String()
459 t
.Errorf("%s token %d: want %q got %q", tt
.desc
, i
, s
, actual
)
465 if z
.Err() != io
.EOF
{
466 t
.Errorf("%s: want EOF got %q", tt
.desc
, z
.Err())
// An unescapeTest describes a single UnescapeString test case.
//
// NOTE(review): this declaration was damaged in extraction; the field names
// are restored from their uses in TestUnescape (tt.desc, tt.html,
// tt.unescaped) — confirm against the upstream x/net/html token_test.go.
type unescapeTest struct {
	// A short description of the test case.
	desc string
	// The HTML text, with entities.
	html string
	// The unescaped text.
	unescaped string
}
// NOTE(review): this test table was badly damaged in extraction. Original
// source line numbers are fused into the text, most desc/html/unescaped
// triples are missing, and HTML entities in the surviving string literals
// have been decoded (e.g. a numeric reference like `&#955;` now appears as
// the literal character), so html inputs and unescaped outputs can no
// longer be told apart reliably. Restore this table wholesale from the
// upstream golang.org/x/net/html token_test.go — TODO confirm revision.
480 var unescapeTests
= []unescapeTest
{
481 // Handle no entities.
487 // Handle simple named entities.
493 // Handle hitting the end of the string.
499 // Handle entities with two codepoints.
503 "text \u22db\ufe00 blah",
505 // Handle decimal numeric entities.
511 // Handle hexadecimal numeric entities.
514 "Lambda = λ = λ ",
517 // Handle numeric early termination.
520 "&# &#x €43 © = ©f = ©",
521 "&# &#x €43 © = ©f = ©",
523 // Handle numeric ISO-8859-1 entity replacements.
525 "numericReplacements",
531 func TestUnescape(t
*testing
.T
) {
532 for _
, tt
:= range unescapeTests
{
533 unescaped
:= UnescapeString(tt
.html
)
534 if unescaped
!= tt
.unescaped
{
535 t
.Errorf("TestUnescape %s: want %q, got %q", tt
.desc
, tt
.unescaped
, unescaped
)
540 func TestUnescapeEscape(t
*testing
.T
) {
550 `"<&>"`,
551 `3&5==1 && 0<1, "0<1", a+acute=á`,
552 `The special characters are: <, >, &, ' and "`,
554 for _
, s
:= range ss
{
555 if got
:= UnescapeString(EscapeString(s
)); got
!= s
{
556 t
.Errorf("got %q want %q", got
, s
)
561 func TestBufAPI(t
*testing
.T
) {
562 s
:= "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
563 z
:= NewTokenizer(bytes
.NewBufferString(s
))
564 var result bytes
.Buffer
571 if z
.Err() != io
.EOF
{
577 result
.Write(z
.Text())
579 case StartTagToken
, EndTagToken
:
581 if len(tn
) == 1 && tn
[0] == 'a' {
582 if tt
== StartTagToken
{
591 v
:= string(result
.Bytes())
593 t
.Errorf("TestBufAPI: want %q got %q", u
, v
)
597 func TestConvertNewlines(t
*testing
.T
) {
598 testCases
:= map[string]string{
599 "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
600 "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
601 "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
612 "\r\r\n\n": "\n\n\n",
613 "\r\r\r\n": "\n\n\n",
617 for in
, want
:= range testCases
{
618 if got
:= string(convertNewlines([]byte(in
))); got
!= want
{
619 t
.Errorf("input %q: got %q, want %q", in
, got
, want
)
630 func benchmarkTokenizer(b
*testing
.B
, level
int) {
631 buf
, err
:= ioutil
.ReadFile("testdata/go1.html")
633 b
.Fatalf("could not read testdata/go1.html: %v", err
)
635 b
.SetBytes(int64(len(buf
)))
639 for i
:= 0; i
< b
.N
; i
++ {
640 z
:= NewTokenizer(bytes
.NewBuffer(buf
))
643 if tt
== ErrorToken
{
644 if err
:= z
.Err(); err
!= nil && err
!= io
.EOF
{
645 b
.Fatalf("tokenizer error: %v", err
)
651 // Calling z.Raw just returns the raw bytes of the token. It does
652 // not unescape < to <, or lower-case tag names and attribute keys.
655 // Caling z.Text, z.TagName and z.TagAttr returns []byte values
656 // whose contents may change on the next call to z.Next.
658 case TextToken
, CommentToken
, DoctypeToken
:
660 case StartTagToken
, SelfClosingTagToken
:
661 _
, more
:= z
.TagName()
663 _
, _
, more
= z
.TagAttr()
669 // Calling z.Token converts []byte values to strings whose validity
670 // extend beyond the next call to z.Next.
677 func BenchmarkRawLevelTokenizer(b
*testing
.B
) { benchmarkTokenizer(b
, rawLevel
) }
678 func BenchmarkLowLevelTokenizer(b
*testing
.B
) { benchmarkTokenizer(b
, lowLevel
) }
679 func BenchmarkHighLevelTokenizer(b
*testing
.B
) { benchmarkTokenizer(b
, highLevel
) }