[3.0.0] Convert all $context calls away from references
[htmlpurifier.git] / library / HTMLPurifier / Lexer / DirectLex.php
blobaf333015ee14e9c55c64292cce21cb6c0c2303ce
1 <?php
3 require_once 'HTMLPurifier/Lexer.php';
// Registers the Core.DirectLexLineNumberSyncInterval directive: an int that
// defaults to 0. The trailing string is the user-facing documentation of the
// directive.
HTMLPurifier_ConfigSchema::define(
    'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
<p>
Specifies the number of tokens the DirectLex line number tracking
implementations should process before attempting to resynchronize the
current line count by manually counting all previous new-lines. When
at 0, this functionality is disabled. Lower values will decrease
performance, and this is only strictly necessary if the counting
algorithm is buggy (in which case you should report it as a bug).
This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
not being used. This directive has been available since 2.0.0.
</p>
');
19 /**
20 * Our in-house implementation of a parser.
22 * A pure PHP parser, DirectLex has absolutely no dependencies, making
23 * it a reasonably good default for PHP4. Written with efficiency in mind,
24 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
25 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
27 * @todo Reread XML spec and document differences.
29 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
32 /**
33 * Whitespace characters for str(c)spn.
35 protected $_whitespace = "\x20\x09\x0D\x0A";
/**
 * Callback for the script CDATA fudge: HTML-escapes the contents of a
 * script element while leaving its opening and closing tags untouched.
 * @param $matches Regex match array in the form
 *                 array(whole match, opening tag, contents, closing tag)
 * @return string Opening tag, escaped contents and closing tag, joined
 */
protected function scriptCallback($matches) {
    $opening_tag = $matches[1];
    $escaped     = htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8');
    $closing_tag = $matches[3];
    return $opening_tag . $escaped . $closing_tag;
}
/**
 * Splits a string of HTML into a flat array of token objects (text,
 * comment, start, end and empty tags).
 *
 * While the lexer runs, the current line number (or false when line
 * number tracking is off) is registered in the context as 'CurrentLine';
 * the entry is destroyed again on every return path.
 *
 * @param $html String of HTML to tokenize
 * @param $config Configuration object; HTML.Trusted, Core.MaintainLineNumbers,
 *                Core.CollectErrors and Core.DirectLexLineNumberSyncInterval
 *                are read from it
 * @param $context Context object; 'ErrorCollector' is fetched from it when
 *                 Core.CollectErrors is on
 * @return Array of token objects, or an empty array when the infinite
 *         loop guard trips
 */
public function tokenizeHTML($html, $config, $context) {

    // special normalization for script tags without any armor
    // our "armor" heuristic is a < sign any number of whitespaces after
    // the first script tag
    if ($config->get('HTML', 'Trusted')) {
        $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'), $html);
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0; // our location in the text
    $inside_tag = false; // whether or not we're parsing the inside of a tag
    $array = array(); // result array

    $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');

    if ($maintain_line_numbers === null) {
        // automatically determine line numbering by checking
        // if error collection is on
        $maintain_line_numbers = $config->get('Core', 'CollectErrors');
    }

    if ($maintain_line_numbers) $current_line = 1;
    else $current_line = false;
    $context->register('CurrentLine', $current_line);
    $nl = "\n";
    // how often to manually recalculate. This will ALWAYS be right,
    // but it's pretty wasteful. Set to 0 to turn off
    $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');

    $e = false;
    if ($config->get('Core', 'CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // infinite loop protection
    // has to be pretty big, since html docs can be big
    // we allow two hundred thousand tags... more than enough?
    // NOTE: this is also used for synchronization, so watch out
    $loops = 0;

    while(true) {

        // infinite loop protection
        if (++$loops > 200000) {
            // unregister the line counter first; otherwise it would stay
            // registered in the context after this early exit
            $context->destroy('CurrentLine');
            return array();
        }

        // recalculate lines
        if (
            $maintain_line_numbers && // line number tracking is on
            $synchronize_interval &&  // synchronization is on
            $cursor > 0 &&            // cursor is further than zero
            $loops % $synchronize_interval === 0 // time to synchronize!
        ) {
            $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // triggers on "<b>asdf</b>" but not "asdf <b></b>"
        // special case to set up context
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // We are not inside tag and there still is another tag to parse
            $token = new
                HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr(
                            $html, $cursor, $position_next_lt - $cursor
                        )
                    )
                );
            if ($maintain_line_numbers) {
                $token->line = $current_line;
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor  = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // We are not inside tag but there are no more tags
            // If we're already at the end, break
            if ($cursor === strlen($html)) break;
            // Create Text of rest of string
            $token = new
                HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr(
                            $html, $cursor
                        )
                    )
                );
            if ($maintain_line_numbers) $token->line = $current_line;
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are in tag and it is well formed
            // Grab the internals of the tag
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // there's nothing to process!
                // NOTE(review): this token is never appended to $array and
                // $inside_tag stays true, so whatever follows "<>" is lexed
                // as tag contents -- confirm that this is intended
                $token = new HTMLPurifier_Token_Text('<');
                $cursor++;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // somehow, we attempted to access beyond the end of
                // the string, defense-in-depth, reported by Nate Abele
                break;
            }

            // Check if it's a comment
            if (
                substr($segment, 0, 3) === '!--'
            ) {
                // re-determine segment length, looking for -->
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // uh oh, we have a comment that extends to
                    // infinity. Can't be helped: set comment
                    // end position to end of string
                    if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                $token = new
                    HTMLPurifier_Token_Comment(
                        substr(
                            $segment, 3, $strlen_segment - 3
                        )
                    );
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                // skip the closing "-->" unless the comment ran to the end
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // Check if it's an end tag
            $is_end_tag = (strpos($segment,'/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check leading character is alpha, if not, we may
            // have accidentally grabbed an emoticon. Translate into
            // text and go our merry way
            if (!ctype_alpha($segment[0])) {
                // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                $token = new
                    HTMLPurifier_Token_Text(
                        '<' .
                        $this->parseData(
                            $segment
                        ) .
                        '>'
                    );
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            }

            // Check if it is explicitly self closing, if so, remove
            // trailing slash. Remember, we could have a tag like <br>, so
            // any later token processing scripts must convert improperly
            // classified EmptyTags from StartTags.
            $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // Check if there are any attributes
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                // no whitespace in the segment: it is just a tag name
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->line = $current_line;
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Grab out all the data
            $type = substr($segment, 0, $position_first_space);
            $attribute_string =
                trim(
                    substr(
                        $segment, $position_first_space
                    )
                );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                                $attribute_string
                              , $config, $context
                          );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->line = $current_line;
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // inside tag, but there's no ending > sign
            if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
            $token = new
                HTMLPurifier_Token_Text(
                    '<' .
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
            if ($maintain_line_numbers) $token->line = $current_line;
            // no cursor scroll? Hmm...
            $array[] = $token;
            break;
        }
        // not reached: every branch above either continues or breaks
        break;
    }

    $context->destroy('CurrentLine');
    return $array;
}
/**
 * Counts occurrences of a needle within a slice of the haystack.
 *
 * On PHP 5.1 and later the offset and length are handed straight to
 * substr_count(); on older versions the slice is cut out with substr()
 * first. The version check runs once and is cached in a static.
 *
 * @param $haystack String to search in
 * @param $needle Substring to count
 * @param $offset Position in $haystack at which the slice starts
 * @param $length Length of the slice
 * @return Number of occurrences of $needle inside the slice
 */
protected function substrCount($haystack, $needle, $offset, $length) {
    static $needs_emulation;
    if ($needs_emulation === null) {
        $needs_emulation = version_compare(PHP_VERSION, '5.1', '<');
    }
    if (!$needs_emulation) {
        return substr_count($haystack, $needle, $offset, $length);
    }
    $slice = substr($haystack, $offset, $length);
    return substr_count($slice, $needle);
}
/**
 * Takes the inside of an HTML tag and makes an assoc array of attributes.
 *
 * A string without whitespace and with at most one equal sign is handled
 * by a fast path; everything else goes through a cursor-based scanner.
 * Values from both paths are passed through parseData().
 *
 * @param $string Inside of tag excluding name.
 * @param $config Configuration object; Core.CollectErrors is read from it
 * @param $context Context object; 'ErrorCollector' is fetched from it when
 *                 error collection is on
 * @return Assoc array of attributes (name => value); boolean attributes
 *         map to their own name. An empty array is returned when the
 *         loop guard trips.
 */
public function parseAttributeString($string, $config, $context) {
    $string = (string) $string; // quick typecast

    if ($string == '') return array(); // no attributes

    $e = false;
    if ($config->get('Core', 'CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // let's see if we can abort as quickly as possible
    // one equal sign, no whitespace => one attribute
    $num_equal = substr_count($string, '=');
    // test against the whole whitespace set (not just U+0020), and count
    // whitespace at offset 0 as well
    $has_space = strcspn($string, $this->_whitespace) < strlen($string);
    if ($num_equal === 0 && !$has_space) {
        // bool attribute
        return array($string => $string);
    } elseif ($num_equal === 1 && !$has_space) {
        // only one attribute
        list($key, $quoted_value) = explode('=', $string);
        $quoted_value = trim($quoted_value);
        if (!$key) {
            if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            return array();
        }
        // strict comparison so that a value of "0" is not mistaken for
        // an empty one
        if ($quoted_value === '') return array($key => '');
        $first_char = $quoted_value[0];
        $last_char  = $quoted_value[strlen($quoted_value)-1];

        $same_quote = ($first_char == $last_char);
        $open_quote = ($first_char == '"' || $first_char == "'");

        if ( $same_quote && $open_quote) {
            // well behaved
            $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
        } else {
            // not well behaved
            if ($open_quote) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                $value = substr($quoted_value, 1);
            } else {
                $value = $quoted_value;
            }
        }
        if ($value === false) $value = '';
        // run the value through parseData(), as the general loop below does
        return array($key => $this->parseData($value));
    }

    // setup loop environment
    $array  = array(); // return assoc array of attributes
    $cursor = 0; // current position in string (moves forward)
    $size   = strlen($string); // size of the string (stays the same)

    // if we have unquoted attributes, the parser expects a terminating
    // space, so let's guarantee that there's always a terminating space.
    $string .= ' ';

    // infinite loop protection
    $loops = 0;
    while(true) {

        // infinite loop protection
        if (++$loops > 1000) {
            trigger_error('Infinite loop detected in attribute parsing', E_USER_WARNING);
            return array();
        }

        if ($cursor >= $size) {
            break;
        }

        // scroll past leading whitespace
        $cursor += strspn($string, $this->_whitespace, $cursor);
        if ($cursor >= $size) {
            // only whitespace was left, so there is no further key
            break;
        }

        // grab the key

        $key_begin = $cursor; //we're currently at the start of the key

        // scroll past all characters that are the key (not whitespace or =)
        $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

        $key_end = $cursor; // now at the end of the key

        $key = substr($string, $key_begin, $key_end - $key_begin);

        if (!$key) {
            if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            // always step at least one character forward; without the
            // "1 +" the cursor stays put when whitespace follows directly
            $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
            continue; // empty key
        }

        // scroll past all whitespace
        $cursor += strspn($string, $this->_whitespace, $cursor);

        if ($cursor >= $size) {
            $array[$key] = $key;
            break;
        }

        // if the next character is an equal sign, we've got a regular
        // pair, otherwise, it's a bool attribute
        $first_char = $string[$cursor];

        if ($first_char == '=') {
            // key="value"

            $cursor++;
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                // nothing but the padding space followed the equal sign
                $array[$key] = '';
                break;
            }

            // we might be in front of a quote right now

            $char = $string[$cursor];

            if ($char == '"' || $char == "'") {
                // it's quoted, end bound is $char
                $cursor++;
                $value_begin = $cursor;
                $cursor = strpos($string, $char, $cursor);
                $value_end = $cursor;
            } else {
                // it's not quoted, end bound is whitespace
                $value_begin = $cursor;
                $cursor += strcspn($string, $this->_whitespace, $cursor);
                $value_end = $cursor;
            }

            // we reached a premature end
            if ($cursor === false) {
                $cursor = $size;
                $value_end = $cursor;
            }

            $value = substr($string, $value_begin, $value_end - $value_begin);
            if ($value === false) $value = '';
            $array[$key] = $this->parseData($value);
            $cursor++;

        } else {
            // boolattr
            if ($key !== '') {
                $array[$key] = $key;
            } else {
                // purely theoretical
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
            }

        }
    }
    return $array;
}