Deal with old libxml incompatibilities.
[htmlpurifier.git] / library / HTMLPurifier / EntityParser.php
blobc372b5a6a6c0f9641cf94bb3feb97996898aca2c
1 <?php
3 // If we want to implement error collecting here, we'll need to use some
4 // sort of global data (probably trigger_error), because it's impossible to
5 // pass $config or $context to the callback functions.
/**
 * Handles referencing and dereferencing character entities
 */
class HTMLPurifier_EntityParser
{

    /**
     * Reference to entity lookup table. Populated lazily from
     * HTMLPurifier_EntityLookup::instance() the first time a named entity
     * has to be resolved.
     * @type HTMLPurifier_EntityLookup
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for entities in text.
     * @type string
     */
    protected $_textEntitiesRegex;

    /**
     * Callback regex string for entities in attributes.
     * @type string
     */
    protected $_attrEntitiesRegex;

    /**
     * Regex matching an ampersand immediately followed by one of the
     * semicolon-optional entity names. It carries three empty capture
     * groups so that the name lands in group 4, which is where
     * entityCallback() expects a semicolon-optional name.
     * @type string
     */
    protected $_semiOptionalPrefixRegex;

    /**
     * Builds the three regular expressions used by entityCallback().
     */
    public function __construct()
    {
        // From
        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

        // NB: three empty captures to put the fourth match in the right
        // place
        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

        // Shared by the text and attribute regexes: groups 1-3.
        $numeric_and_strict_named =
            '/&(?:' .
            // hex
            '[#]x([a-fA-F0-9]+);?|' .
            // dec
            '[#]0*(\d+);?|' .
            // string (mandatory semicolon)
            // NB: order matters: match semicolon preferentially
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|';

        $this->_textEntitiesRegex =
            $numeric_and_strict_named .
            // string (optional semicolon)
            "($semi_optional)" .
            ')/';

        $this->_attrEntitiesRegex =
            $numeric_and_strict_named .
            // string (optional semicolon)
            // don't match if trailing is equals or alphanumeric (URL
            // like)
            "($semi_optional)(?![=;A-Za-z0-9])" .
            ')/';
    }

    /**
     * Substitute entities with the parsed equivalents. Use this on
     * textual data in an HTML document (as opposed to attributes.)
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteTextEntities($string)
    {
        return preg_replace_callback(
            $this->_textEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Substitute entities with the parsed equivalents. Use this on
     * attribute contents in documents.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteAttrEntities($string)
    {
        return preg_replace_callback(
            $this->_attrEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteTextEntities() and
     * substituteAttrEntities() that does the work.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                       one of index 1 (hex value), 2 (dec value),
     *                       3 (name terminated by a semicolon) or
     *                       4 (semicolon-optional name) set.
     * @return string Replacement string; the original match when the entity
     *                cannot be resolved.
     */
    protected function entityCallback($matches)
    {
        $entity = $matches[0];
        // PCRE leaves out trailing groups that did not take part in the
        // match, so probe each index explicitly instead of silencing the
        // resulting notice with "@" (custom error handlers still see it).
        $hex_part = isset($matches[1]) ? $matches[1] : '';
        $dec_part = isset($matches[2]) ? $matches[2] : '';
        if (!empty($matches[3])) {
            $named_part = $matches[3];
        } else {
            $named_part = isset($matches[4]) ? $matches[4] : '';
        }

        if ($hex_part !== '') {
            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
        }
        if ($dec_part !== '') {
            return HTMLPurifier_Encoder::unichr((int) $dec_part);
        }

        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$named_part])) {
            return $this->_entity_lookup->table[$named_part];
        }

        // exact match didn't match anything, so test if
        // any of the semicolon optional match the prefix.
        // Test that this is an EXACT match is important to
        // prevent infinite loop
        if (!empty($matches[3])) {
            return preg_replace_callback(
                $this->_semiOptionalPrefixRegex,
                array($this, 'entityCallback'),
                $entity
            );
        }
        return $entity;
    }

    // LEGACY CODE BELOW

    /**
     * Callback regex string for parsing entities.
     * @type string
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
        //     1. hex             2. dec      3. string (XML style)

    /**
     * Decimal to parsed string conversion table for special entities.
     * @type array
     */
    protected $_special_dec2str =
        array(
            34 => '"',
            38 => '&',
            39 => "'",
            60 => '<',
            62 => '>'
        );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     * @type array
     */
    protected $_special_ent2dec =
        array(
            'quot' => 34,
            'amp' => 38,
            'lt' => 60,
            'gt' => 62
        );

    /**
     * Substitutes non-special entities with their parsed equivalents. Since
     * running this whenever you have parsed character is t3h 5uck, we run
     * it before everything else.
     *
     * @param string $string String to have non-special entities parsed.
     * @return string Parsed string.
     */
    public function substituteNonSpecialEntities($string)
    {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                       either index 1, 2 or 3 set with a hex value, dec
     *                       value, or string (respectively).
     * @return string Replacement string; the original match for the special
     *                entities and for names missing from the lookup table.
     */
    protected function nonSpecialEntityCallback($matches)
    {
        // replaces all but big five
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            // abort for special characters
            if (isset($this->_special_dec2str[$code])) {
                return $entity;
            }
            return HTMLPurifier_Encoder::unichr($code);
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) {
            return $entity;
        }
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$name])) {
            return $this->_entity_lookup->table[$name];
        }
        return $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @notice We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param string $string String to have special entities parsed.
     * @return string Parsed string.
     */
    public function substituteSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param array $matches PCRE-style matches array, with 0 the entire match,
     *                       and either index 1, 2 or 3 set with a hex value,
     *                       dec value, or string (respectively).
     * @return string Replacement string; the original match for anything
     *                that is not one of the special entities.
     */
    protected function specialEntityCallback($matches)
    {
        $entity = $matches[0];
        $is_num = isset($entity[1]) && $entity[1] === '#';
        if ($is_num) {
            $is_hex = isset($entity[2]) && $entity[2] === 'x';
            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
            return isset($this->_special_dec2str[$int]) ?
                $this->_special_dec2str[$int] :
                $entity;
        }

        $name = $matches[3];
        return isset($this->_special_ent2dec[$name]) ?
            $this->_special_dec2str[$this->_special_ent2dec[$name]] :
            $entity;
    }
}
285 // vim: et sw=4 sts=4