support/htmlentities.rb

   1 #
   2 # HTML entity encoding and decoding for Ruby
   3 #
   4 # Author::  Paul BATTLEY (pbattley @ gmail.com)
   5 # Version:: 2.2
   6 # Date::    2005-11-07
   7 #
   8 # == About
   9 #
  10 # This library extends the String class to allow encoding and decoding of
  11 # HTML/XML entities from/to their corresponding UTF-8 codepoints.
  12 #
  13 # == Licence
  14 #
  15 # Copyright (c) 2005 Paul Battley
  16 #
  17 # Usage of the works is permitted provided that this instrument is retained
  18 # with the works, so that any entity that uses the works is notified of this
  19 # instrument.
  20 #
  21 # DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
  22 #
  23
  24 module HTMLEntities
  25
  26     VERSION = '2.2'
  27
  28     #
  29     # MAP is a hash of all the HTML entities I could discover, as taken
  30     # from the w3schools page on the subject:
  31     # http://www.w3schools.com/html/html_entitiesref.asp
  32     # The format is 'entity name' => codepoint where entity name is given
  33     # without the surrounding ampersand and semicolon.
  34     #
  35     MAP = {
  36         'quot'      => 34,
  37         'apos'      => 39,
  38         'amp'       => 38,
  39         'lt'        => 60,
  40         'gt'        => 62,
  41         'nbsp'      => 160,
  42         'iexcl'     => 161,
  43         'curren'    => 164,
  44         'cent'      => 162,
  45         'pound'     => 163,
  46         'yen'       => 165,
  47         'brvbar'    => 166,
  48         'sect'      => 167,
  49         'uml'       => 168,
  50         'copy'      => 169,
  51         'ordf'      => 170,
  52         'laquo'     => 171,
  53         'not'       => 172,
  54         'shy'       => 173,
  55         'reg'       => 174,
  56         'trade'     => 8482,
  57         'macr'      => 175,
  58         'deg'       => 176,
  59         'plusmn'    => 177,
  60         'sup2'      => 178,
  61         'sup3'      => 179,
  62         'acute'     => 180,
  63         'micro'     => 181,
  64         'para'      => 182,
  65         'middot'    => 183,
  66         'cedil'     => 184,
  67         'sup1'      => 185,
  68         'ordm'      => 186,
  69         'raquo'     => 187,
  70         'frac14'    => 188,
  71         'frac12'    => 189,
  72         'frac34'    => 190,
  73         'iquest'    => 191,
  74         'times'     => 215,
  75         'divide'    => 247,
  76         'Agrave'    => 192,
  77         'Aacute'    => 193,
  78         'Acirc'     => 194,
  79         'Atilde'    => 195,
  80         'Auml'      => 196,
  81         'Aring'     => 197,
  82         'AElig'     => 198,
  83         'Ccedil'    => 199,
  84         'Egrave'    => 200,
  85         'Eacute'    => 201,
  86         'Ecirc'     => 202,
  87         'Euml'      => 203,
  88         'Igrave'    => 204,
  89         'Iacute'    => 205,
  90         'Icirc'     => 206,
  91         'Iuml'      => 207,
  92         'ETH'       => 208,
  93         'Ntilde'    => 209,
  94         'Ograve'    => 210,
  95         'Oacute'    => 211,
  96         'Ocirc'     => 212,
  97         'Otilde'    => 213,
  98         'Ouml'      => 214,
  99         'Oslash'    => 216,
 100         'Ugrave'    => 217,
 101         'Uacute'    => 218,
 102         'Ucirc'     => 219,
 103         'Uuml'      => 220,
 104         'Yacute'    => 221,
 105         'THORN'     => 222,
 106         'szlig'     => 223,
 107         'agrave'    => 224,
 108         'aacute'    => 225,
 109         'acirc'     => 226,
 110         'atilde'    => 227,
 111         'auml'      => 228,
 112         'aring'     => 229,
 113         'aelig'     => 230,
 114         'ccedil'    => 231,
 115         'egrave'    => 232,
 116         'eacute'    => 233,
 117         'ecirc'     => 234,
 118         'euml'      => 235,
 119         'igrave'    => 236,
 120         'iacute'    => 237,
 121         'icirc'     => 238,
 122         'iuml'      => 239,
 123         'eth'       => 240,
 124         'ntilde'    => 241,
 125         'ograve'    => 242,
 126         'oacute'    => 243,
 127         'ocirc'     => 244,
 128         'otilde'    => 245,
 129         'ouml'      => 246,
 130         'oslash'    => 248,
 131         'ugrave'    => 249,
 132         'uacute'    => 250,
 133         'ucirc'     => 251,
 134         'uuml'      => 252,
 135         'yacute'    => 253,
 136         'thorn'     => 254,
 137         'yuml'      => 255,
 138         'OElig'     => 338,
 139         'oelig'     => 339,
 140         'Scaron'    => 352,
 141         'scaron'    => 353,
 142         'Yuml'      => 376,
 143         'circ'      => 710,
 144         'tilde'     => 732,
 145         'ensp'      => 8194,
 146         'emsp'      => 8195,
 147         'thinsp'    => 8201,
 148         'zwnj'      => 8204,
 149         'zwj'       => 8205,
 150         'lrm'       => 8206,
 151         'rlm'       => 8207,
 152         'ndash'     => 8211,
 153         'mdash'     => 8212,
 154         'lsquo'     => 8216,
 155         'rsquo'     => 8217,
 156         'sbquo'     => 8218,
 157         'ldquo'     => 8220,
 158         'rdquo'     => 8221,
 159         'bdquo'     => 8222,
 160         'dagger'    => 8224,
 161         'Dagger'    => 8225,
 162         'hellip'    => 8230,
 163         'permil'    => 8240,
 164         'lsaquo'    => 8249,
 165         'rsaquo'    => 8250,
 166         'euro'      => 8364
 167     }
 168
 169     MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
 170     MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
 171
 172     # Precompile the regexp
 173     NAMED_ENTITY_REGEXP =
 174         /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
 175
 176     # Reverse map for converting characters to named entities
 177     REVERSE_MAP = MAP.invert
 178
 179     BASIC_ENTITY_REGEXP = /[<>'"&]/
 180
 181     UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
 182
 183 end
 184
 185 class String
 186
 187     # Because there's no need to make the user worry about the order here,
 188     # let's handle it.
 189     ENCODE_ENTITIES_COMMAND_ORDER = {
 190         :basic => 0,
 191         :named => 1,
 192         :decimal => 2,
 193         :hexadecimal => 3
 194     }
 195
 196     #
 197     # Decode XML and HTML 4.01 entities in a string into their UTF-8
 198     # equivalents.  Obviously, if your string is not already in UTF-8, you'd
 199     # better convert it before using this method, or the output will be mixed
 200     # up.
 201     # Unknown named entities are not converted
 202     #
 203     def decode_entities
 204         return gsub(HTMLEntities::NAMED_ENTITY_REGEXP) {
 205             HTMLEntities::MAP.has_key?($1) ? [HTMLEntities::MAP[$1]].pack('U') : $&
 206         }.gsub(/&#([0-9]{1,7});/) {
 207             [$1.to_i].pack('U')
 208         }.gsub(/&#x([0-9a-f]{1,6});/i) {
 209             [$1.to_i(16)].pack('U')
 210         }
 211     end
 212
 213     #
 214     # Encode codepoints into their corresponding entities.  Various operations
 215     # are possible, and may be specified in order:
 216     #
 217     # :basic :: Convert the five XML entities ('"<>&)
 218     # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
 219     # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
 220     # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. # &#x12ab;)
 221     #
 222     # You can specify the commands in any order, but they will be executed in
 223     # the order listed above to ensure that entity ampersands are not
 224     # clobbered and that named entities are replaced before numeric ones.
 225     #
 226     # If no instructions are specified, :basic will be used.
 227     #
 228     # Examples:
 229     #   str.encode_entities - XML-safe
 230     #   str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
 231     #   str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
 232     #   non-ASCII characters replaced with their named entity where possible, and
 233     #   decimal equivalents otherwise.
 234     #
 235     # Note: It is the program's responsibility to ensure that the string
 236     # contains valid UTF-8 before calling this method.
 237     #
 238     def encode_entities(*instructions)
 239         str = nil
 240         if (instructions.empty?)
 241             instructions = [:basic]
 242         else
 243             instructions.each do |instr|
 244                 unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
 245                     raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
 246                 end
 247             end
 248             instructions.sort! { |a,b|
 249                 ENCODE_ENTITIES_COMMAND_ORDER[a] <=>
 250                 ENCODE_ENTITIES_COMMAND_ORDER[b]
 251             }
 252         end
 253         instructions.each do |instruction|
 254             case instruction
 255             when :basic
 256                 # Handled as basic ASCII
 257                 str = (str || self).gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
 258                     # It's safe to use the simpler [0] here because we know
 259                     # that the basic entities are ASCII.
 260                     '&' << HTMLEntities::REVERSE_MAP[$&[0]] << ';'
 261                 }
 262             when :named
 263                 # Test everything except printable ASCII
 264                 str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
 265                     cp = $&.unpack('U')[0]
 266                     (e = HTMLEntities::REVERSE_MAP[cp]) ?  "&#{e};" : $&
 267                 }
 268             when :decimal
 269                 str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
 270                     "&##{$&.unpack('U')[0]};"
 271                 }
 272             when :hexadecimal
 273                 str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
 274                     "&#x#{$&.unpack('U')[0].to_s(16)};"
 275                 }
 276             end
 277         end
 278         return str
 279     end
 280
 281 end