2 # HTML entity encoding and decoding for Ruby
4 # Author:: Paul BATTLEY (pbattley @ gmail.com)
10 # This library extends the String class to allow encoding and decoding of
11 # HTML/XML entities from/to their corresponding UTF-8 codepoints.
15 # Copyright (c) 2005 Paul Battley
17 # Usage of the works is permitted provided that this instrument is retained
18 # with the works, so that any entity that uses the works is notified of this
21 # DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
29 # MAP is a hash of all the HTML entities I could discover, as taken
30 # from the w3schools page on the subject:
31 # http://www.w3schools.com/html/html_entitiesref.asp
32 # The format is 'entity name' => codepoint where entity name is given
33 # without the surrounding ampersand and semicolon.
# Shortest and longest entity-name lengths among MAP's keys. They bound the
# repetition count in NAMED_ENTITY_REGEXP below, so the pattern never tries
# a name that is shorter or longer than any known entity.
169 MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
170 MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
172 # Precompile the regexp
# Matches '&', then captures a run of letters (case-insensitive) whose length
# lies between MIN_LENGTH and MAX_LENGTH, then ';'. Capture group 1 is the
# entity name without the surrounding ampersand and semicolon.
# NOTE(review): the character class is letters only, so an entity name that
# contains a digit (e.g. sup2, frac12, there4 -- if MAP defines such names)
# can never match here. MAP is not visible from this excerpt; confirm.
173 NAMED_ENTITY_REGEXP =
174 /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
176 # Reverse map for converting characters to named entities
# MAP is 'entity name' => codepoint, so the inverted hash is
# codepoint => 'entity name'.
# NOTE(review): Hash#invert keeps only one key per value, so if two entity
# names ever share a codepoint only one of them survives here.
177 REVERSE_MAP = MAP.invert
# Matches a single one of the five XML special characters: < > ' " &
179 BASIC_ENTITY_REGEXP = /[<>'"&]/
# Matches either one byte in the range 0x00-0x1f, or a lead byte 0xc0-0xfd
# followed by one or more continuation bytes 0x80-0xbf. Despite the name, the
# first alternative means ASCII control characters are matched as well.
# NOTE(review): the pattern is written as raw byte ranges and presumably
# relies on byte-wise regexp matching (Ruby 1.8 era); confirm its behaviour
# on the Ruby version actually in use.
181 UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
187 # Because there's no need to make the user worry about the order here,
189 ENCODE_ENTITIES_COMMAND_ORDER = {
197 # Decode XML and HTML 4.01 entities in a string into their UTF-8
198 # equivalents. Obviously, if your string is not already in UTF-8, you'd
199 # better convert it before using this method, or the output will be mixed
201 # Unknown named entities are not converted
204 return gsub(HTMLEntities::NAMED_ENTITY_REGEXP) {
205 HTMLEntities::MAP.has_key?($1) ? [HTMLEntities::MAP[$1]].pack('U') : $&
206 }.gsub(/&#([0-9]{1,7});/) {
208 }.gsub(/&#x([0-9a-f]{1,6});/i) {
209 [$1.to_i(16)].pack('U')
214 # Encode codepoints into their corresponding entities. Various operations
215 # are possible, and may be specified in order:
217 # :basic :: Convert the five XML entities ('"<>&)
218 # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
219 # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
220 # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. &#x12ab;)
222 # You can specify the commands in any order, but they will be executed in
223 # the order listed above to ensure that entity ampersands are not
224 # clobbered and that named entities are replaced before numeric ones.
226 # If no instructions are specified, :basic will be used.
229 # str.encode_entities - XML-safe
230 # str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
231 # str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
232 # non-ASCII characters replaced with their named entity where possible, and
233 # decimal equivalents otherwise.
235 # Note: It is the program's responsibility to ensure that the string
236 # contains valid UTF-8 before calling this method.
238 def encode_entities(*instructions)
240 if (instructions.empty?)
241 instructions = [:basic]
243 instructions.each do |instr|
244 unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
245 raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
248 instructions.sort! { |a,b|
249 ENCODE_ENTITIES_COMMAND_ORDER[a] <=>
250 ENCODE_ENTITIES_COMMAND_ORDER[b]
253 instructions.each do |instruction|
256 # Handled as basic ASCII
257 str = (str || self).gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
258 # It's safe to use the simpler [0] here because we know
259 # that the basic entities are ASCII.
260 '&' << HTMLEntities::REVERSE_MAP[$&[0]] << ';'
263 # Test everything except printable ASCII
264 str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
265 cp = $&.unpack('U')[0]
266 (e = HTMLEntities::REVERSE_MAP[cp]) ? "&#{e};" : $&
269 str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
270 "&##{$&.unpack('U')[0]};"
273 str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
274 "&#x#{$&.unpack('U')[0].to_s(16)};"