1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
// A tokenTest describes a single tokenizer test case: an HTML input and the
// expected token stream.
//
// NOTE(review): this declaration was damaged in extraction; the field names
// are restored from their uses below (tt.desc, tt.html, tt.golden) — confirm
// against the upstream x/net/html token_test.go.
type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}
// NOTE(review): this test table was badly damaged in extraction. Original
// source line numbers are fused into the text, most entries (the
// desc/html/golden triples) are missing, and HTML entities inside string
// literals have been decoded (e.g. `&lt;` became `<`, `&amp;` became `&`),
// so the surviving literals no longer match the originals byte-for-byte.
// Do not hand-repair individual entries: restore this table wholesale from
// the upstream golang.org/x/net/html token_test.go — TODO confirm revision.
25 var tokenTests
= []tokenTest
{
31 // A single text node. The tokenizer should not break text nodes on whitespace,
32 // nor should it normalize whitespace within a text node.
44 // A start, self-closing and end tag. The tokenizer does not care if the start
45 // and end tokens don't match; that is the job of the parser.
51 // Angle brackets that aren't a tag.
100 "a<<$<b>$>>c",
104 "if x<0 and y < 0 then x*y>0",
105 "if x<0 and y < 0 then x*y>0",
107 // EOF in a tag name.
133 // Some malformed tags that are missing a '>'.
177 `<p id="0" <="" p="">`,
184 // Raw text and RCDATA.
187 "<script><a></b></script>",
188 "<script>$<a></b>$</script>",
191 "unfinished script end tag",
193 "<script>$a</SCR",
196 "broken script end tag",
197 "<SCRIPT>a</SCR ipt>",
198 "<script>$a</SCR ipt>",
201 "EOF in script end tag",
203 "<script>$a</SCRipt",
207 "<SCRIPT>a</SCRiptx",
208 "<script>$a</SCRiptx",
211 "' ' completes script end tag",
212 "<SCRIPT>a</SCRipt ",
216 "'>' completes script end tag",
217 "<SCRIPT>a</SCRipt>",
218 "<script>$a$</script>",
221 "self-closing script end tag",
222 "<SCRIPT>a</SCRipt/>",
223 "<script>$a$</script>",
227 "<SCRIPT>a</SCRipt<script>",
228 "<script>$a</SCRipt<script>",
231 "script end tag after unfinished",
232 "<SCRIPT>a</SCRipt</script>",
233 "<script>$a</SCRipt$</script>",
236 "script/style mismatched tags",
238 "<script>$a</style>",
241 "style element with entity",
243 "<style>$&apos;",
247 "<textarea><div></textarea>",
248 "<textarea>$<div>$</textarea>",
251 "title with tag and entity",
252 "<title><b>K&R C</b></title>",
253 "<title>$<b>K&R C</b>$</title>",
262 "DOCTYPE with no space",
267 "DOCTYPE with two spaces",
272 "looks like DOCTYPE but isn't",
274 "<!--DOCUMENT html-->",
281 // XML processing instructions.
283 "XML processing instruction",
290 "abc<b><!-- skipme --></b>def",
291 "abc$<b>$<!-- skipme -->$</b>$def",
358 // An attribute with a backslash.
364 // Entities, tag name and attribute key lower-casing, and whitespace
365 // normalization within a tag.
368 "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>",
369 `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`,
371 // A nonexistent entity. Tokenizing and converting back to a string should
372 // escape the "&" to become "&".
375 `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
376 `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
379 "entity without semicolon",
380 `¬it;∉<a b="q=z&=5¬ice=hello¬=world">`,
381 `¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`,
384 "entity with digits",
389 // http://dev.w3.org/html5/spec/Overview.html#attributes-0
392 `<input disabled FOO>`,
393 `<input disabled="" foo="">`,
396 "Empty attribute, whitespace",
397 `<input disabled FOO >`,
398 `<input disabled="" foo="">`,
401 "Unquoted attribute value",
402 `<input value=yes FOO=BAR>`,
403 `<input value="yes" foo="BAR">`,
406 "Unquoted attribute value, spaces",
407 `<input value = yes FOO = BAR>`,
408 `<input value="yes" foo="BAR">`,
411 "Unquoted attribute value, trailing space",
412 `<input value=yes FOO=BAR >`,
413 `<input value="yes" foo="BAR">`,
416 "Single-quoted attribute value",
417 `<input value='yes' FOO='BAR'>`,
418 `<input value="yes" foo="BAR">`,
421 "Single-quoted attribute value, trailing space",
422 `<input value='yes' FOO='BAR' >`,
423 `<input value="yes" foo="BAR">`,
426 "Double-quoted attribute value",
427 `<input value="I'm an attribute" FOO="BAR">`,
428 `<input value="I'm an attribute" foo="BAR">`,
431 "Attribute name characters",
432 `<meta http-equiv="content-type">`,
433 `<meta http-equiv="content-type">`,
437 `a<P V="0 1" w='2' X=3 y>z`,
438 `a$<p v="0 1" w="2" x="3" y="">$z`,
441 "Attributes with a solitary single quote",
442 `<p id=can't><p id=won't>`,
443 `<p id="can't">$<p id="won't">`,
447 func TestTokenizer(t
*testing
.T
) {
449 for _
, tt
:= range tokenTests
{
450 z
:= NewTokenizer(strings
.NewReader(tt
.html
))
452 for i
, s
:= range strings
.Split(tt
.golden
, "$") {
453 if z
.Next() == ErrorToken
{
454 t
.Errorf("%s token %d: want %q got error %v", tt
.desc
, i
, s
, z
.Err())
457 actual
:= z
.Token().String()
459 t
.Errorf("%s token %d: want %q got %q", tt
.desc
, i
, s
, actual
)
465 if z
.Err() != io
.EOF
{
466 t
.Errorf("%s: want EOF got %q", tt
.desc
, z
.Err())
// An unescapeTest describes a single UnescapeString test case.
//
// NOTE(review): this declaration was damaged in extraction; the field names
// are restored from their uses in TestUnescape (tt.desc, tt.html,
// tt.unescaped) — confirm against the upstream x/net/html token_test.go.
type unescapeTest struct {
	// A short description of the test case.
	desc string
	// The HTML text, with entities.
	html string
	// The unescaped text.
	unescaped string
}
// NOTE(review): this test table was badly damaged in extraction. Original
// source line numbers are fused into the text, most desc/html/unescaped
// triples are missing, and HTML entities in the surviving string literals
// have been decoded (e.g. a numeric reference like `&#955;` now appears as
// the literal character), so html inputs and unescaped outputs can no
// longer be told apart reliably. Restore this table wholesale from the
// upstream golang.org/x/net/html token_test.go — TODO confirm revision.
480 var unescapeTests
= []unescapeTest
{
481 // Handle no entities.
487 // Handle simple named entities.
493 // Handle hitting the end of the string.
499 // Handle entities with two codepoints.
503 "text \u22db\ufe00 blah",
505 // Handle decimal numeric entities.
511 // Handle hexadecimal numeric entities.
514 "Lambda = λ = λ ",
517 // Handle numeric early termination.
520 "&# &#x €43 © = ©f = ©",
521 "&# &#x €43 © = ©f = ©",
523 // Handle numeric ISO-8859-1 entity replacements.
525 "numericReplacements",
531 func TestUnescape(t
*testing
.T
) {
532 for _
, tt
:= range unescapeTests
{
533 unescaped
:= UnescapeString(tt
.html
)
534 if unescaped
!= tt
.unescaped
{
535 t
.Errorf("TestUnescape %s: want %q, got %q", tt
.desc
, tt
.unescaped
, unescaped
)
540 func TestUnescapeEscape(t
*testing
.T
) {
550 `"<&>"`,
551 `3&5==1 && 0<1, "0<1", a+acute=á`,
552 `The special characters are: <, >, &, ' and "`,
554 for _
, s
:= range ss
{
555 if got
:= UnescapeString(EscapeString(s
)); got
!= s
{
556 t
.Errorf("got %q want %q", got
, s
)
561 func TestBufAPI(t
*testing
.T
) {
562 s
:= "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
563 z
:= NewTokenizer(bytes
.NewBufferString(s
))
564 var result bytes
.Buffer
571 if z
.Err() != io
.EOF
{
577 result
.Write(z
.Text())
579 case StartTagToken
, EndTagToken
:
581 if len(tn
) == 1 && tn
[0] == 'a' {
582 if tt
== StartTagToken
{
591 v
:= string(result
.Bytes())
593 t
.Errorf("TestBufAPI: want %q got %q", u
, v
)
597 func TestConvertNewlines(t
*testing
.T
) {
598 testCases
:= map[string]string{
599 "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
600 "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
601 "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
612 "\r\r\n\n": "\n\n\n",
613 "\r\r\r\n": "\n\n\n",
617 for in
, want
:= range testCases
{
618 if got
:= string(convertNewlines([]byte(in
))); got
!= want
{
619 t
.Errorf("input %q: got %q, want %q", in
, got
, want
)
630 func benchmarkTokenizer(b
*testing
.B
, level
int) {
631 buf
, err
:= ioutil
.ReadFile("testdata/go1.html")
633 b
.Fatalf("could not read testdata/go1.html: %v", err
)
635 b
.SetBytes(int64(len(buf
)))
639 for i
:= 0; i
< b
.N
; i
++ {
640 z
:= NewTokenizer(bytes
.NewBuffer(buf
))
643 if tt
== ErrorToken
{
644 if err
:= z
.Err(); err
!= nil && err
!= io
.EOF
{
645 b
.Fatalf("tokenizer error: %v", err
)
651 // Calling z.Raw just returns the raw bytes of the token. It does
652 // not unescape < to <, or lower-case tag names and attribute keys.
655 // Caling z.Text, z.TagName and z.TagAttr returns []byte values
656 // whose contents may change on the next call to z.Next.
658 case TextToken
, CommentToken
, DoctypeToken
:
660 case StartTagToken
, SelfClosingTagToken
:
661 _
, more
:= z
.TagName()
663 _
, _
, more
= z
.TagAttr()
669 // Calling z.Token converts []byte values to strings whose validity
670 // extend beyond the next call to z.Next.
677 func BenchmarkRawLevelTokenizer(b
*testing
.B
) { benchmarkTokenizer(b
, rawLevel
) }
678 func BenchmarkLowLevelTokenizer(b
*testing
.B
) { benchmarkTokenizer(b
, lowLevel
) }
679 func BenchmarkHighLevelTokenizer(b
*testing
.B
) { benchmarkTokenizer(b
, highLevel
) }