// Merge from mainline (167278:168000).
// [official-gcc/graphite-test-results.git] / libgo / go / html / token_test.go
// blob 5759476eab433313516c510a4b2fa4bfc64b6381
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
5 package html
7 import (
8 "bytes"
9 "os"
10 "testing"
// tokenTest describes a single tokenizer test case: an HTML input and the
// string form of every token the tokenizer is expected to produce for it.
type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens.
	tokens []string
}
22 var tokenTests = []tokenTest{
23 // A single text node. The tokenizer should not break text nodes on whitespace,
24 // nor should it normalize whitespace within a text node.
26 "text",
27 "foo bar",
28 []string{
29 "foo bar",
32 // An entity.
34 "entity",
35 "one < two",
36 []string{
37 "one < two",
40 // A start, self-closing and end tag. The tokenizer does not care if the start
41 // and end tokens don't match; that is the job of the parser.
43 "tags",
44 "<a>b<c/>d</e>",
45 []string{
46 "<a>",
47 "b",
48 "<c/>",
49 "d",
50 "</e>",
53 // An attribute with a backslash.
55 "backslash",
56 `<p id="a\"b">`,
57 []string{
58 `<p id="a&quot;b">`,
61 // Entities, tag name and attribute key lower-casing, and whitespace
62 // normalization within a tag.
64 "tricky",
65 "<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
66 []string{
67 `<p id="a&quot;B" foo="bar">`,
68 "<em>",
69 "te&lt;&amp;;xt",
70 "</em>",
71 "</p>",
74 // A non-existant entity. Tokenizing and converting back to a string should
75 // escape the "&" to become "&amp;".
77 "noSuchEntity",
78 `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
79 []string{
80 `<a b="c&amp;noSuchEntity;d">`,
81 "&lt;&amp;alsoDoesntExist;&amp;",
86 func TestTokenizer(t *testing.T) {
87 loop:
88 for _, tt := range tokenTests {
89 z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
90 for i, s := range tt.tokens {
91 if z.Next() == Error {
92 t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
93 continue loop
95 actual := z.Token().String()
96 if s != actual {
97 t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
98 continue loop
101 z.Next()
102 if z.Error() != os.EOF {
103 t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String())
108 func TestUnescapeEscape(t *testing.T) {
109 ss := []string{
111 `abc def`,
112 `a & b`,
113 `a&amp;b`,
114 `a &amp b`,
115 `&quot;`,
116 `"`,
117 `"<&>"`,
118 `&quot;&lt;&amp;&gt;&quot;`,
119 `3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
121 for _, s := range ss {
122 if s != UnescapeString(EscapeString(s)) {
123 t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
128 func TestBufAPI(t *testing.T) {
129 s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
130 z := NewTokenizer(bytes.NewBuffer([]byte(s)))
131 result := bytes.NewBuffer(nil)
132 depth := 0
133 loop:
134 for {
135 tt := z.Next()
136 switch tt {
137 case Error:
138 if z.Error() != os.EOF {
139 t.Error(z.Error())
141 break loop
142 case Text:
143 if depth > 0 {
144 result.Write(z.Text())
146 case StartTag, EndTag:
147 tn, _ := z.TagName()
148 if len(tn) == 1 && tn[0] == 'a' {
149 if tt == StartTag {
150 depth++
151 } else {
152 depth--
157 u := "14567"
158 v := string(result.Bytes())
159 if u != v {
160 t.Errorf("TestBufAPI: want %q got %q", u, v)