3 require_once 'HTMLPurifier/Lexer.php';
5 HTMLPurifier_ConfigSchema
::define(
6 'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
8 Specifies the number of tokens the DirectLex line number tracking
9 implementations should process before attempting to resyncronize the
10 current line count by manually counting all previous new-lines. When
11 at 0, this functionality is disabled. Lower values will decrease
12 performance, and this is only strictly necessary if the counting
13 algorithm is buggy (in which case you should report it as a bug).
14 This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
15 not being used. This directive has been available since 2.0.0.
20 * Our in-house implementation of a parser.
22 * A pure PHP parser, DirectLex has absolutely no dependencies, making
23 * it a reasonably good default for PHP4. Written with efficiency in mind,
24 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
25 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
27 * @todo Reread XML spec and document differences.
29 class HTMLPurifier_Lexer_DirectLex
extends HTMLPurifier_Lexer
33 * Whitespace characters for str(c)spn.
35 protected $_whitespace = "\x20\x09\x0D\x0A";
/**
 * preg_replace_callback() handler used for the script CDATA fudge.
 *
 * Leaves the opening and closing tags exactly as matched and runs the
 * text captured between them through htmlspecialchars().
 *
 * @param $matches Regex match array; this method reads [1] (opening tag),
 *                 [2] (contents) and [3] (closing tag).
 * @return Opening tag, escaped contents and closing tag, concatenated.
 */
protected function scriptCallback($matches) {
    $openingTag  = $matches[1];
    // ENT_COMPAT: double quotes are converted, single quotes are left alone
    $escapedBody = htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8');
    $closingTag  = $matches[3];
    return $openingTag . $escapedBody . $closingTag;
}
45 public function tokenizeHTML($html, $config, $context) {
47 // special normalization for script tags without any armor
48 // our "armor" heuristic is a < sign any number of whitespaces after
49 // the first script tag
50 if ($config->get('HTML', 'Trusted')) {
51 $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
52 array($this, 'scriptCallback'), $html);
55 $html = $this->normalize($html, $config, $context);
57 $cursor = 0; // our location in the text
58 $inside_tag = false; // whether or not we're parsing the inside of a tag
59 $array = array(); // result array
61 $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
63 if ($maintain_line_numbers === null) {
64 // automatically determine line numbering by checking
65 // if error collection is on
66 $maintain_line_numbers = $config->get('Core', 'CollectErrors');
69 if ($maintain_line_numbers) $current_line = 1;
70 else $current_line = false;
71 $context->register('CurrentLine', $current_line);
73 // how often to manually recalculate. This will ALWAYS be right,
74 // but it's pretty wasteful. Set to 0 to turn off
75 $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
78 if ($config->get('Core', 'CollectErrors')) {
79 $e =& $context->get('ErrorCollector');
82 // infinite loop protection
83 // has to be pretty big, since html docs can be big
84 // we allow two hundred thousand tags... more than enough?
85 // NOTE: this is also used for synchronization, so watch out
90 // infinite loop protection
91 if (++
$loops > 200000) return array();
95 $maintain_line_numbers && // line number tracking is on
96 $synchronize_interval && // synchronization is on
97 $cursor > 0 && // cursor is further than zero
98 $loops %
$synchronize_interval === 0 // time to synchronize!
100 $current_line = 1 +
$this->substrCount($html, $nl, 0, $cursor);
103 $position_next_lt = strpos($html, '<', $cursor);
104 $position_next_gt = strpos($html, '>', $cursor);
106 // triggers on "<b>asdf</b>" but not "asdf <b></b>"
107 // special case to set up context
108 if ($position_next_lt === $cursor) {
113 if (!$inside_tag && $position_next_lt !== false) {
114 // We are not inside tag and there still is another tag to parse
116 HTMLPurifier_Token_Text(
119 $html, $cursor, $position_next_lt - $cursor
123 if ($maintain_line_numbers) {
124 $token->line
= $current_line;
125 $current_line +
= $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
128 $cursor = $position_next_lt +
1;
131 } elseif (!$inside_tag) {
132 // We are not inside tag but there are no more tags
133 // If we're already at the end, break
134 if ($cursor === strlen($html)) break;
135 // Create Text of rest of string
137 HTMLPurifier_Token_Text(
144 if ($maintain_line_numbers) $token->line
= $current_line;
147 } elseif ($inside_tag && $position_next_gt !== false) {
148 // We are in tag and it is well formed
149 // Grab the internals of the tag
150 $strlen_segment = $position_next_gt - $cursor;
152 if ($strlen_segment < 1) {
153 // there's nothing to process!
154 $token = new HTMLPurifier_Token_Text('<');
159 $segment = substr($html, $cursor, $strlen_segment);
161 if ($segment === false) {
162 // somehow, we attempted to access beyond the end of
163 // the string, defense-in-depth, reported by Nate Abele
167 // Check if it's a comment
169 substr($segment, 0, 3) === '!--'
171 // re-determine segment length, looking for -->
172 $position_comment_end = strpos($html, '-->', $cursor);
173 if ($position_comment_end === false) {
174 // uh oh, we have a comment that extends to
175 // infinity. Can't be helped: set comment
176 // end position to end of string
177 if ($e) $e->send(E_WARNING
, 'Lexer: Unclosed comment');
178 $position_comment_end = strlen($html);
183 $strlen_segment = $position_comment_end - $cursor;
184 $segment = substr($html, $cursor, $strlen_segment);
186 HTMLPurifier_Token_Comment(
188 $segment, 3, $strlen_segment - 3
191 if ($maintain_line_numbers) {
192 $token->line
= $current_line;
193 $current_line +
= $this->substrCount($html, $nl, $cursor, $strlen_segment);
196 $cursor = $end ?
$position_comment_end : $position_comment_end +
3;
201 // Check if it's an end tag
202 $is_end_tag = (strpos($segment,'/') === 0);
204 $type = substr($segment, 1);
205 $token = new HTMLPurifier_Token_End($type);
206 if ($maintain_line_numbers) {
207 $token->line
= $current_line;
208 $current_line +
= $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
212 $cursor = $position_next_gt +
1;
216 // Check leading character is alphabetic (ctype_alpha); if not, we may
217 // have accidentally grabbed an emoticon. Translate into
218 // text and go our merry way
219 if (!ctype_alpha($segment[0])) {
220 // XML: $segment[0] !== '_' && $segment[0] !== ':'
221 if ($e) $e->send(E_NOTICE
, 'Lexer: Unescaped lt');
223 HTMLPurifier_Token_Text(
230 if ($maintain_line_numbers) {
231 $token->line
= $current_line;
232 $current_line +
= $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
235 $cursor = $position_next_gt +
1;
240 // Check if it is explicitly self closing, if so, remove
241 // trailing slash. Remember, we could have a tag like <br>, so
242 // any later token processing scripts must convert improperly
243 // classified EmptyTags from StartTags.
244 $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
245 if ($is_self_closing) {
247 $segment = substr($segment, 0, $strlen_segment);
250 // Check if there are any attributes
251 $position_first_space = strcspn($segment, $this->_whitespace
);
253 if ($position_first_space >= $strlen_segment) {
254 if ($is_self_closing) {
255 $token = new HTMLPurifier_Token_Empty($segment);
257 $token = new HTMLPurifier_Token_Start($segment);
259 if ($maintain_line_numbers) {
260 $token->line
= $current_line;
261 $current_line +
= $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
265 $cursor = $position_next_gt +
1;
269 // Grab out all the data
270 $type = substr($segment, 0, $position_first_space);
274 $segment, $position_first_space
277 if ($attribute_string) {
278 $attr = $this->parseAttributeString(
286 if ($is_self_closing) {
287 $token = new HTMLPurifier_Token_Empty($type, $attr);
289 $token = new HTMLPurifier_Token_Start($type, $attr);
291 if ($maintain_line_numbers) {
292 $token->line
= $current_line;
293 $current_line +
= $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
296 $cursor = $position_next_gt +
1;
300 // inside tag, but there's no ending > sign
301 if ($e) $e->send(E_WARNING
, 'Lexer: Missing gt');
303 HTMLPurifier_Token_Text(
306 substr($html, $cursor)
309 if ($maintain_line_numbers) $token->line
= $current_line;
310 // no cursor scroll? Hmm...
317 $context->destroy('CurrentLine');
/**
 * PHP 4 compatible substr_count() that honours an offset and a length.
 *
 * Counts how often $needle occurs in the part of $haystack that starts
 * at $offset and spans $length bytes. The PHP version comparison is done
 * once and remembered in a static variable for later calls.
 *
 * @param $haystack String to search in.
 * @param $needle   Substring to count.
 * @param $offset   Start of the window within $haystack.
 * @param $length   Size of the window.
 * @return Number of occurrences of $needle inside the window.
 */
protected function substrCount($haystack, $needle, $offset, $length) {
    static $needsFallback = null;
    if ($needsFallback === null) {
        $needsFallback = version_compare(PHP_VERSION, '5.1', '<');
    }
    if (!$needsFallback) {
        // PHP 5.1 and later: substr_count() takes the window directly
        return substr_count($haystack, $needle, $offset, $length);
    }
    // older PHP: cut the window out first, then count inside it
    $window = substr($haystack, $offset, $length);
    return substr_count($window, $needle);
}
338 * Takes the inside of an HTML tag and makes an assoc array of attributes.
340 * @param $string Inside of tag excluding name.
341 * @return Assoc array of attributes.
343 public function parseAttributeString($string, $config, $context) {
344 $string = (string) $string; // quick typecast
346 if ($string == '') return array(); // no attributes
349 if ($config->get('Core', 'CollectErrors')) {
350 $e =& $context->get('ErrorCollector');
353 // let's see if we can abort as quickly as possible
354 // one equal sign, no spaces => one attribute
355 $num_equal = substr_count($string, '=');
356 $has_space = strpos($string, ' ');
357 if ($num_equal === 0 && !$has_space) {
359 return array($string => $string);
360 } elseif ($num_equal === 1 && !$has_space) {
361 // only one attribute
362 list($key, $quoted_value) = explode('=', $string);
363 $quoted_value = trim($quoted_value);
365 if ($e) $e->send(E_ERROR
, 'Lexer: Missing attribute key');
368 if (!$quoted_value) return array($key => '');
369 $first_char = @$quoted_value[0];
370 $last_char = @$quoted_value[strlen($quoted_value)-1];
372 $same_quote = ($first_char == $last_char);
373 $open_quote = ($first_char == '"' ||
$first_char == "'");
375 if ( $same_quote && $open_quote) {
377 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
381 if ($e) $e->send(E_ERROR
, 'Lexer: Missing end quote');
382 $value = substr($quoted_value, 1);
384 $value = $quoted_value;
387 if ($value === false) $value = '';
388 return array($key => $value);
391 // setup loop environment
392 $array = array(); // return assoc array of attributes
393 $cursor = 0; // current position in string (moves forward)
394 $size = strlen($string); // size of the string (stays the same)
396 // if we have unquoted attributes, the parser expects a terminating
397 // space, so let's guarantee that there's always a terminating space.
400 // infinite loop protection
404 // infinite loop protection
405 if (++
$loops > 1000) {
406 trigger_error('Infinite loop detected in attribute parsing', E_USER_WARNING
);
410 if ($cursor >= $size) {
414 $cursor +
= ($value = strspn($string, $this->_whitespace
, $cursor));
417 $key_begin = $cursor; //we're currently at the start of the key
419 // scroll past all characters that are the key (not whitespace or =)
420 $cursor +
= strcspn($string, $this->_whitespace
. '=', $cursor);
422 $key_end = $cursor; // now at the end of the key
424 $key = substr($string, $key_begin, $key_end - $key_begin);
427 if ($e) $e->send(E_ERROR
, 'Lexer: Missing attribute key');
428 $cursor +
= strcspn($string, $this->_whitespace
, $cursor +
1); // prevent infinite loop
429 continue; // empty key
432 // scroll past all whitespace
433 $cursor +
= strspn($string, $this->_whitespace
, $cursor);
435 if ($cursor >= $size) {
440 // if the next character is an equal sign, we've got a regular
441 // pair, otherwise, it's a bool attribute
442 $first_char = @$string[$cursor];
444 if ($first_char == '=') {
448 $cursor +
= strspn($string, $this->_whitespace
, $cursor);
450 if ($cursor === false) {
455 // we might be in front of a quote right now
457 $char = @$string[$cursor];
459 if ($char == '"' ||
$char == "'") {
460 // it's quoted, end bound is $char
462 $value_begin = $cursor;
463 $cursor = strpos($string, $char, $cursor);
464 $value_end = $cursor;
466 // it's not quoted, end bound is whitespace
467 $value_begin = $cursor;
468 $cursor +
= strcspn($string, $this->_whitespace
, $cursor);
469 $value_end = $cursor;
472 // we reached a premature end
473 if ($cursor === false) {
475 $value_end = $cursor;
478 $value = substr($string, $value_begin, $value_end - $value_begin);
479 if ($value === false) $value = '';
480 $array[$key] = $this->parseData($value);
488 // purely theoretical
489 if ($e) $e->send(E_ERROR
, 'Lexer: Missing attribute key');