// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html
import (
	"bytes"
	"io"
	"io/ioutil"
	"runtime"
	"strings"
	"testing"
)
// A tokenTest is one tokenizer test case: an input HTML string and the
// expected sequence of token string representations.
type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}
25 var tokenTests = []tokenTest{
27 "empty",
28 "",
29 "",
31 // A single text node. The tokenizer should not break text nodes on whitespace,
32 // nor should it normalize whitespace within a text node.
34 "text",
35 "foo bar",
36 "foo bar",
38 // An entity.
40 "entity",
41 "one < two",
42 "one < two",
44 // A start, self-closing and end tag. The tokenizer does not care if the start
45 // and end tokens don't match; that is the job of the parser.
47 "tags",
48 "<a>b<c/>d</e>",
49 "<a>$b$<c/>$d$</e>",
51 // Angle brackets that aren't a tag.
53 "not a tag #0",
54 "<",
55 "&lt;",
58 "not a tag #1",
59 "</",
60 "&lt;/",
63 "not a tag #2",
64 "</>",
65 "",
68 "not a tag #3",
69 "a</>b",
70 "a$b",
73 "not a tag #4",
74 "</ >",
75 "<!-- -->",
78 "not a tag #5",
79 "</.",
80 "<!--.-->",
83 "not a tag #6",
84 "</.>",
85 "<!--.-->",
88 "not a tag #7",
89 "a < b",
90 "a &lt; b",
93 "not a tag #8",
94 "<.>",
95 "&lt;.&gt;",
98 "not a tag #9",
99 "a<<<b>>>c",
100 "a&lt;&lt;$<b>$&gt;&gt;c",
103 "not a tag #10",
104 "if x<0 and y < 0 then x*y>0",
105 "if x&lt;0 and y &lt; 0 then x*y&gt;0",
107 // EOF in a tag name.
109 "tag name eof #0",
110 "<a",
114 "tag name eof #1",
115 "<a ",
119 "tag name eof #2",
120 "a<b",
121 "a",
124 "tag name eof #3",
125 "<a><b",
126 "<a>",
129 "tag name eof #4",
130 `<a x`,
133 // Some malformed tags that are missing a '>'.
135 "malformed tag #0",
136 `<p</p>`,
137 `<p< p="">`,
140 "malformed tag #1",
141 `<p </p>`,
142 `<p <="" p="">`,
145 "malformed tag #2",
146 `<p id`,
150 "malformed tag #3",
151 `<p id=`,
155 "malformed tag #4",
156 `<p id=>`,
157 `<p id="">`,
160 "malformed tag #5",
161 `<p id=0`,
165 "malformed tag #6",
166 `<p id=0</p>`,
167 `<p id="0&lt;/p">`,
170 "malformed tag #7",
171 `<p id="0</p>`,
175 "malformed tag #8",
176 `<p id="0"</p>`,
177 `<p id="0" <="" p="">`,
180 "malformed tag #9",
181 `<p></p id`,
182 `<p>`,
184 // Raw text and RCDATA.
186 "basic raw text",
187 "<script><a></b></script>",
188 "<script>$&lt;a&gt;&lt;/b&gt;$</script>",
191 "unfinished script end tag",
192 "<SCRIPT>a</SCR",
193 "<script>$a&lt;/SCR",
196 "broken script end tag",
197 "<SCRIPT>a</SCR ipt>",
198 "<script>$a&lt;/SCR ipt&gt;",
201 "EOF in script end tag",
202 "<SCRIPT>a</SCRipt",
203 "<script>$a&lt;/SCRipt",
206 "scriptx end tag",
207 "<SCRIPT>a</SCRiptx",
208 "<script>$a&lt;/SCRiptx",
211 "' ' completes script end tag",
212 "<SCRIPT>a</SCRipt ",
213 "<script>$a",
216 "'>' completes script end tag",
217 "<SCRIPT>a</SCRipt>",
218 "<script>$a$</script>",
221 "self-closing script end tag",
222 "<SCRIPT>a</SCRipt/>",
223 "<script>$a$</script>",
226 "nested script tag",
227 "<SCRIPT>a</SCRipt<script>",
228 "<script>$a&lt;/SCRipt&lt;script&gt;",
231 "script end tag after unfinished",
232 "<SCRIPT>a</SCRipt</script>",
233 "<script>$a&lt;/SCRipt$</script>",
236 "script/style mismatched tags",
237 "<script>a</style>",
238 "<script>$a&lt;/style&gt;",
241 "style element with entity",
242 "<style>&apos;",
243 "<style>$&amp;apos;",
246 "textarea with tag",
247 "<textarea><div></textarea>",
248 "<textarea>$&lt;div&gt;$</textarea>",
251 "title with tag and entity",
252 "<title><b>K&amp;R C</b></title>",
253 "<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
255 // DOCTYPE tests.
257 "Proper DOCTYPE",
258 "<!DOCTYPE html>",
259 "<!DOCTYPE html>",
262 "DOCTYPE with no space",
263 "<!doctypehtml>",
264 "<!DOCTYPE html>",
267 "DOCTYPE with two spaces",
268 "<!doctype html>",
269 "<!DOCTYPE html>",
272 "looks like DOCTYPE but isn't",
273 "<!DOCUMENT html>",
274 "<!--DOCUMENT html-->",
277 "DOCTYPE at EOF",
278 "<!DOCtype",
279 "<!DOCTYPE >",
281 // XML processing instructions.
283 "XML processing instruction",
284 "<?xml?>",
285 "<!--?xml?-->",
287 // Comments.
289 "comment0",
290 "abc<b><!-- skipme --></b>def",
291 "abc$<b>$<!-- skipme -->$</b>$def",
294 "comment1",
295 "a<!-->z",
296 "a$<!---->$z",
299 "comment2",
300 "a<!--->z",
301 "a$<!---->$z",
304 "comment3",
305 "a<!--x>-->z",
306 "a$<!--x>-->$z",
309 "comment4",
310 "a<!--x->-->z",
311 "a$<!--x->-->$z",
314 "comment5",
315 "a<!>z",
316 "a$<!---->$z",
319 "comment6",
320 "a<!->z",
321 "a$<!----->$z",
324 "comment7",
325 "a<!---<>z",
326 "a$<!---<>z-->",
329 "comment8",
330 "a<!--z",
331 "a$<!--z-->",
334 "comment9",
335 "a<!--z-",
336 "a$<!--z-->",
339 "comment10",
340 "a<!--z--",
341 "a$<!--z-->",
344 "comment11",
345 "a<!--z---",
346 "a$<!--z--->",
349 "comment12",
350 "a<!--z----",
351 "a$<!--z---->",
354 "comment13",
355 "a<!--x--!>z",
356 "a$<!--x-->$z",
358 // An attribute with a backslash.
360 "backslash",
361 `<p id="a\"b">`,
362 `<p id="a\" b"="">`,
364 // Entities, tag name and attribute key lower-casing, and whitespace
365 // normalization within a tag.
367 "tricky",
368 "<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
369 `<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
371 // A nonexistent entity. Tokenizing and converting back to a string should
372 // escape the "&" to become "&amp;".
374 "noSuchEntity",
375 `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
376 `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
379 "entity without semicolon",
380 `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
381 `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
384 "entity with digits",
385 "&frac12;",
386 "½",
388 // Attribute tests:
389 // http://dev.w3.org/html5/spec/Overview.html#attributes-0
391 "Empty attribute",
392 `<input disabled FOO>`,
393 `<input disabled="" foo="">`,
396 "Empty attribute, whitespace",
397 `<input disabled FOO >`,
398 `<input disabled="" foo="">`,
401 "Unquoted attribute value",
402 `<input value=yes FOO=BAR>`,
403 `<input value="yes" foo="BAR">`,
406 "Unquoted attribute value, spaces",
407 `<input value = yes FOO = BAR>`,
408 `<input value="yes" foo="BAR">`,
411 "Unquoted attribute value, trailing space",
412 `<input value=yes FOO=BAR >`,
413 `<input value="yes" foo="BAR">`,
416 "Single-quoted attribute value",
417 `<input value='yes' FOO='BAR'>`,
418 `<input value="yes" foo="BAR">`,
421 "Single-quoted attribute value, trailing space",
422 `<input value='yes' FOO='BAR' >`,
423 `<input value="yes" foo="BAR">`,
426 "Double-quoted attribute value",
427 `<input value="I'm an attribute" FOO="BAR">`,
428 `<input value="I&#39;m an attribute" foo="BAR">`,
431 "Attribute name characters",
432 `<meta http-equiv="content-type">`,
433 `<meta http-equiv="content-type">`,
436 "Mixed attributes",
437 `a<P V="0 1" w='2' X=3 y>z`,
438 `a$<p v="0 1" w="2" x="3" y="">$z`,
441 "Attributes with a solitary single quote",
442 `<p id=can't><p id=won't>`,
443 `<p id="can&#39;t">$<p id="won&#39;t">`,
447 func TestTokenizer(t *testing.T) {
448 loop:
449 for _, tt := range tokenTests {
450 z := NewTokenizer(strings.NewReader(tt.html))
451 if tt.golden != "" {
452 for i, s := range strings.Split(tt.golden, "$") {
453 if z.Next() == ErrorToken {
454 t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
455 continue loop
457 actual := z.Token().String()
458 if s != actual {
459 t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
460 continue loop
464 z.Next()
465 if z.Err() != io.EOF {
466 t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
// An unescapeTest is one UnescapeString test case: input HTML text and the
// expected unescaped result.
type unescapeTest struct {
	// A short description of the test case.
	desc string
	// The HTML text.
	html string
	// The unescaped text.
	unescaped string
}
480 var unescapeTests = []unescapeTest{
481 // Handle no entities.
483 "copy",
484 "A\ttext\nstring",
485 "A\ttext\nstring",
487 // Handle simple named entities.
489 "simple",
490 "&amp; &gt; &lt;",
491 "& > <",
493 // Handle hitting the end of the string.
495 "stringEnd",
496 "&amp &amp",
497 "& &",
499 // Handle entities with two codepoints.
501 "multiCodepoint",
502 "text &gesl; blah",
503 "text \u22db\ufe00 blah",
505 // Handle decimal numeric entities.
507 "decimalEntity",
508 "Delta = &#916; ",
509 "Delta = Δ ",
511 // Handle hexadecimal numeric entities.
513 "hexadecimalEntity",
514 "Lambda = &#x3bb; = &#X3Bb ",
515 "Lambda = λ = λ ",
517 // Handle numeric early termination.
519 "numericEnds",
520 "&# &#x &#128;43 &copy = &#169f = &#xa9",
521 "&# &#x €43 © = ©f = ©",
523 // Handle numeric ISO-8859-1 entity replacements.
525 "numericReplacements",
526 "Footnote&#x87;",
527 "Footnote‡",
531 func TestUnescape(t *testing.T) {
532 for _, tt := range unescapeTests {
533 unescaped := UnescapeString(tt.html)
534 if unescaped != tt.unescaped {
535 t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped)
540 func TestUnescapeEscape(t *testing.T) {
541 ss := []string{
543 `abc def`,
544 `a & b`,
545 `a&amp;b`,
546 `a &amp b`,
547 `&quot;`,
548 `"`,
549 `"<&>"`,
550 `&quot;&lt;&amp;&gt;&quot;`,
551 `3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
552 `The special characters are: <, >, &, ' and "`,
554 for _, s := range ss {
555 if got := UnescapeString(EscapeString(s)); got != s {
556 t.Errorf("got %q want %q", got, s)
561 func TestBufAPI(t *testing.T) {
562 s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
563 z := NewTokenizer(bytes.NewBufferString(s))
564 var result bytes.Buffer
565 depth := 0
566 loop:
567 for {
568 tt := z.Next()
569 switch tt {
570 case ErrorToken:
571 if z.Err() != io.EOF {
572 t.Error(z.Err())
574 break loop
575 case TextToken:
576 if depth > 0 {
577 result.Write(z.Text())
579 case StartTagToken, EndTagToken:
580 tn, _ := z.TagName()
581 if len(tn) == 1 && tn[0] == 'a' {
582 if tt == StartTagToken {
583 depth++
584 } else {
585 depth--
590 u := "14567"
591 v := string(result.Bytes())
592 if u != v {
593 t.Errorf("TestBufAPI: want %q got %q", u, v)
597 func TestConvertNewlines(t *testing.T) {
598 testCases := map[string]string{
599 "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
600 "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
601 "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
602 "": "",
603 "\n": "\n",
604 "\n\r": "\n\n",
605 "\r": "\n",
606 "\r\n": "\n",
607 "\r\n\n": "\n\n",
608 "\r\n\r": "\n\n",
609 "\r\n\r\n": "\n\n",
610 "\r\r": "\n\n",
611 "\r\r\n": "\n\n",
612 "\r\r\n\n": "\n\n\n",
613 "\r\r\r\n": "\n\n\n",
614 "\r \n": "\n \n",
615 "xyz": "xyz",
617 for in, want := range testCases {
618 if got := string(convertNewlines([]byte(in))); got != want {
619 t.Errorf("input %q: got %q, want %q", in, got, want)
// Benchmark API levels: rawLevel reads only raw bytes, lowLevel uses the
// []byte accessors, and highLevel allocates full Token values.
const (
	rawLevel = iota
	lowLevel
	highLevel
)
630 func benchmarkTokenizer(b *testing.B, level int) {
631 buf, err := ioutil.ReadFile("testdata/go1.html")
632 if err != nil {
633 b.Fatalf("could not read testdata/go1.html: %v", err)
635 b.SetBytes(int64(len(buf)))
636 runtime.GC()
637 b.ReportAllocs()
638 b.ResetTimer()
639 for i := 0; i < b.N; i++ {
640 z := NewTokenizer(bytes.NewBuffer(buf))
641 for {
642 tt := z.Next()
643 if tt == ErrorToken {
644 if err := z.Err(); err != nil && err != io.EOF {
645 b.Fatalf("tokenizer error: %v", err)
647 break
649 switch level {
650 case rawLevel:
651 // Calling z.Raw just returns the raw bytes of the token. It does
652 // not unescape &lt; to <, or lower-case tag names and attribute keys.
653 z.Raw()
654 case lowLevel:
655 // Caling z.Text, z.TagName and z.TagAttr returns []byte values
656 // whose contents may change on the next call to z.Next.
657 switch tt {
658 case TextToken, CommentToken, DoctypeToken:
659 z.Text()
660 case StartTagToken, SelfClosingTagToken:
661 _, more := z.TagName()
662 for more {
663 _, _, more = z.TagAttr()
665 case EndTagToken:
666 z.TagName()
668 case highLevel:
669 // Calling z.Token converts []byte values to strings whose validity
670 // extend beyond the next call to z.Next.
671 z.Token()
677 func BenchmarkRawLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, rawLevel) }
678 func BenchmarkLowLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, lowLevel) }
679 func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }