library/HTMLPurifier/Lexer/DirectLex.php

   1 <?php
   2
   3 require_once 'HTMLPurifier/Lexer.php';
   4
   5 HTMLPurifier_ConfigSchema::define(
   6     'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
   7 <p>
   8   Specifies the number of tokens the DirectLex line number tracking
   9   implementations should process before attempting to resyncronize the
  10   current line count by manually counting all previous new-lines. When
  11   at 0, this functionality is disabled. Lower values will decrease
  12   performance, and this is only strictly necessary if the counting
  13   algorithm is buggy (in which case you should report it as a bug).
  14   This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
  15   not being used. This directive has been available since 2.0.0.
  16 </p>
  17 ');
  18
  19 /**
  20  * Our in-house implementation of a parser.
  21  *
  22  * A pure PHP parser, DirectLex has absolutely no dependencies, making
  23  * it a reasonably good default for PHP4.  Written with efficiency in mind,
  24  * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
  25  * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  26  *
  27  * @todo Reread XML spec and document differences.
  28  */
  29 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  30 {
  31
  32     /**
  33      * Whitespace characters for str(c)spn.
  34      */
  35     protected $_whitespace = "\x20\x09\x0D\x0A";
  36
  37     /**
  38      * Callback function for script CDATA fudge
  39      * @param $matches, in form of array(opening tag, contents, closing tag)
  40      */
  41     protected function scriptCallback($matches) {
  42         return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  43     }
  44
  45     public function tokenizeHTML($html, $config, $context) {
  46
  47         // special normalization for script tags without any armor
  48         // our "armor" heurstic is a < sign any number of whitespaces after
  49         // the first script tag
  50         if ($config->get('HTML', 'Trusted')) {
  51             $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  52                 array($this, 'scriptCallback'), $html);
  53         }
  54
  55         $html = $this->normalize($html, $config, $context);
  56
  57         $cursor = 0; // our location in the text
  58         $inside_tag = false; // whether or not we're parsing the inside of a tag
  59         $array = array(); // result array
  60
  61         $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
  62
  63         if ($maintain_line_numbers === null) {
  64             // automatically determine line numbering by checking
  65             // if error collection is on
  66             $maintain_line_numbers = $config->get('Core', 'CollectErrors');
  67         }
  68
  69         if ($maintain_line_numbers) $current_line = 1;
  70         else $current_line = false;
  71         $context->register('CurrentLine', $current_line);
  72         $nl = "\n";
  73         // how often to manually recalculate. This will ALWAYS be right,
  74         // but it's pretty wasteful. Set to 0 to turn off
  75         $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
  76
  77         $e = false;
  78         if ($config->get('Core', 'CollectErrors')) {
  79             $e =& $context->get('ErrorCollector');
  80         }
  81
  82         // infinite loop protection
  83         // has to be pretty big, since html docs can be big
  84         // we're allow two hundred thousand tags... more than enough?
  85         // NOTE: this is also used for synchronization, so watch out
  86         $loops = 0;
  87
  88         while(true) {
  89
  90             // infinite loop protection
  91             if (++$loops > 200000) return array();
  92
  93             // recalculate lines
  94             if (
  95                 $maintain_line_numbers && // line number tracking is on
  96                 $synchronize_interval &&  // synchronization is on
  97                 $cursor > 0 &&            // cursor is further than zero
  98                 $loops % $synchronize_interval === 0 // time to synchronize!
  99             ) {
 100                 $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
 101             }
 102
 103             $position_next_lt = strpos($html, '<', $cursor);
 104             $position_next_gt = strpos($html, '>', $cursor);
 105
 106             // triggers on "<b>asdf</b>" but not "asdf <b></b>"
 107             // special case to set up context
 108             if ($position_next_lt === $cursor) {
 109                 $inside_tag = true;
 110                 $cursor++;
 111             }
 112
 113             if (!$inside_tag && $position_next_lt !== false) {
 114                 // We are not inside tag and there still is another tag to parse
 115                 $token = new
 116                     HTMLPurifier_Token_Text(
 117                         $this->parseData(
 118                             substr(
 119                                 $html, $cursor, $position_next_lt - $cursor
 120                             )
 121                         )
 122                     );
 123                 if ($maintain_line_numbers) {
 124                     $token->line = $current_line;
 125                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
 126                 }
 127                 $array[] = $token;
 128                 $cursor  = $position_next_lt + 1;
 129                 $inside_tag = true;
 130                 continue;
 131             } elseif (!$inside_tag) {
 132                 // We are not inside tag but there are no more tags
 133                 // If we're already at the end, break
 134                 if ($cursor === strlen($html)) break;
 135                 // Create Text of rest of string
 136                 $token = new
 137                     HTMLPurifier_Token_Text(
 138                         $this->parseData(
 139                             substr(
 140                                 $html, $cursor
 141                             )
 142                         )
 143                     );
 144                 if ($maintain_line_numbers) $token->line = $current_line;
 145                 $array[] = $token;
 146                 break;
 147             } elseif ($inside_tag && $position_next_gt !== false) {
 148                 // We are in tag and it is well formed
 149                 // Grab the internals of the tag
 150                 $strlen_segment = $position_next_gt - $cursor;
 151
 152                 if ($strlen_segment < 1) {
 153                     // there's nothing to process!
 154                     $token = new HTMLPurifier_Token_Text('<');
 155                     $cursor++;
 156                     continue;
 157                 }
 158
 159                 $segment = substr($html, $cursor, $strlen_segment);
 160
 161                 if ($segment === false) {
 162                     // somehow, we attempted to access beyond the end of
 163                     // the string, defense-in-depth, reported by Nate Abele
 164                     break;
 165                 }
 166
 167                 // Check if it's a comment
 168                 if (
 169                     substr($segment, 0, 3) === '!--'
 170                 ) {
 171                     // re-determine segment length, looking for -->
 172                     $position_comment_end = strpos($html, '-->', $cursor);
 173                     if ($position_comment_end === false) {
 174                         // uh oh, we have a comment that extends to
 175                         // infinity. Can't be helped: set comment
 176                         // end position to end of string
 177                         if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
 178                         $position_comment_end = strlen($html);
 179                         $end = true;
 180                     } else {
 181                         $end = false;
 182                     }
 183                     $strlen_segment = $position_comment_end - $cursor;
 184                     $segment = substr($html, $cursor, $strlen_segment);
 185                     $token = new
 186                         HTMLPurifier_Token_Comment(
 187                             substr(
 188                                 $segment, 3, $strlen_segment - 3
 189                             )
 190                         );
 191                     if ($maintain_line_numbers) {
 192                         $token->line = $current_line;
 193                         $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
 194                     }
 195                     $array[] = $token;
 196                     $cursor = $end ? $position_comment_end : $position_comment_end + 3;
 197                     $inside_tag = false;
 198                     continue;
 199                 }
 200
 201                 // Check if it's an end tag
 202                 $is_end_tag = (strpos($segment,'/') === 0);
 203                 if ($is_end_tag) {
 204                     $type = substr($segment, 1);
 205                     $token = new HTMLPurifier_Token_End($type);
 206                     if ($maintain_line_numbers) {
 207                         $token->line = $current_line;
 208                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 209                     }
 210                     $array[] = $token;
 211                     $inside_tag = false;
 212                     $cursor = $position_next_gt + 1;
 213                     continue;
 214                 }
 215
 216                 // Check leading character is alnum, if not, we may
 217                 // have accidently grabbed an emoticon. Translate into
 218                 // text and go our merry way
 219                 if (!ctype_alpha($segment[0])) {
 220                     // XML:  $segment[0] !== '_' && $segment[0] !== ':'
 221                     if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
 222                     $token = new
 223                         HTMLPurifier_Token_Text(
 224                             '<' .
 225                             $this->parseData(
 226                                 $segment
 227                             ) .
 228                             '>'
 229                         );
 230                     if ($maintain_line_numbers) {
 231                         $token->line = $current_line;
 232                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 233                     }
 234                     $array[] = $token;
 235                     $cursor = $position_next_gt + 1;
 236                     $inside_tag = false;
 237                     continue;
 238                 }
 239
 240                 // Check if it is explicitly self closing, if so, remove
 241                 // trailing slash. Remember, we could have a tag like <br>, so
 242                 // any later token processing scripts must convert improperly
 243                 // classified EmptyTags from StartTags.
 244                 $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
 245                 if ($is_self_closing) {
 246                     $strlen_segment--;
 247                     $segment = substr($segment, 0, $strlen_segment);
 248                 }
 249
 250                 // Check if there are any attributes
 251                 $position_first_space = strcspn($segment, $this->_whitespace);
 252
 253                 if ($position_first_space >= $strlen_segment) {
 254                     if ($is_self_closing) {
 255                         $token = new HTMLPurifier_Token_Empty($segment);
 256                     } else {
 257                         $token = new HTMLPurifier_Token_Start($segment);
 258                     }
 259                     if ($maintain_line_numbers) {
 260                         $token->line = $current_line;
 261                         $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 262                     }
 263                     $array[] = $token;
 264                     $inside_tag = false;
 265                     $cursor = $position_next_gt + 1;
 266                     continue;
 267                 }
 268
 269                 // Grab out all the data
 270                 $type = substr($segment, 0, $position_first_space);
 271                 $attribute_string =
 272                     trim(
 273                         substr(
 274                             $segment, $position_first_space
 275                         )
 276                     );
 277                 if ($attribute_string) {
 278                     $attr = $this->parseAttributeString(
 279                                     $attribute_string
 280                                   , $config, $context
 281                               );
 282                 } else {
 283                     $attr = array();
 284                 }
 285
 286                 if ($is_self_closing) {
 287                     $token = new HTMLPurifier_Token_Empty($type, $attr);
 288                 } else {
 289                     $token = new HTMLPurifier_Token_Start($type, $attr);
 290                 }
 291                 if ($maintain_line_numbers) {
 292                     $token->line = $current_line;
 293                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 294                 }
 295                 $array[] = $token;
 296                 $cursor = $position_next_gt + 1;
 297                 $inside_tag = false;
 298                 continue;
 299             } else {
 300                 // inside tag, but there's no ending > sign
 301                 if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
 302                 $token = new
 303                     HTMLPurifier_Token_Text(
 304                         '<' .
 305                         $this->parseData(
 306                             substr($html, $cursor)
 307                         )
 308                     );
 309                 if ($maintain_line_numbers) $token->line = $current_line;
 310                 // no cursor scroll? Hmm...
 311                 $array[] = $token;
 312                 break;
 313             }
 314             break;
 315         }
 316
 317         $context->destroy('CurrentLine');
 318         return $array;
 319     }
 320
 321     /**
 322      * PHP 4 compatible substr_count that implements offset and length
 323      */
 324     protected function substrCount($haystack, $needle, $offset, $length) {
 325         static $oldVersion;
 326         if ($oldVersion === null) {
 327             $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
 328         }
 329         if ($oldVersion) {
 330             $haystack = substr($haystack, $offset, $length);
 331             return substr_count($haystack, $needle);
 332         } else {
 333             return substr_count($haystack, $needle, $offset, $length);
 334         }
 335     }
 336
 337     /**
 338      * Takes the inside of an HTML tag and makes an assoc array of attributes.
 339      *
 340      * @param $string Inside of tag excluding name.
 341      * @returns Assoc array of attributes.
 342      */
 343     public function parseAttributeString($string, $config, $context) {
 344         $string = (string) $string; // quick typecast
 345
 346         if ($string == '') return array(); // no attributes
 347
 348         $e = false;
 349         if ($config->get('Core', 'CollectErrors')) {
 350             $e =& $context->get('ErrorCollector');
 351         }
 352
 353         // let's see if we can abort as quickly as possible
 354         // one equal sign, no spaces => one attribute
 355         $num_equal = substr_count($string, '=');
 356         $has_space = strpos($string, ' ');
 357         if ($num_equal === 0 && !$has_space) {
 358             // bool attribute
 359             return array($string => $string);
 360         } elseif ($num_equal === 1 && !$has_space) {
 361             // only one attribute
 362             list($key, $quoted_value) = explode('=', $string);
 363             $quoted_value = trim($quoted_value);
 364             if (!$key) {
 365                 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 366                 return array();
 367             }
 368             if (!$quoted_value) return array($key => '');
 369             $first_char = @$quoted_value[0];
 370             $last_char  = @$quoted_value[strlen($quoted_value)-1];
 371
 372             $same_quote = ($first_char == $last_char);
 373             $open_quote = ($first_char == '"' || $first_char == "'");
 374
 375             if ( $same_quote && $open_quote) {
 376                 // well behaved
 377                 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
 378             } else {
 379                 // not well behaved
 380                 if ($open_quote) {
 381                     if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
 382                     $value = substr($quoted_value, 1);
 383                 } else {
 384                     $value = $quoted_value;
 385                 }
 386             }
 387             if ($value === false) $value = '';
 388             return array($key => $value);
 389         }
 390
 391         // setup loop environment
 392         $array  = array(); // return assoc array of attributes
 393         $cursor = 0; // current position in string (moves forward)
 394         $size   = strlen($string); // size of the string (stays the same)
 395
 396         // if we have unquoted attributes, the parser expects a terminating
 397         // space, so let's guarantee that there's always a terminating space.
 398         $string .= ' ';
 399
 400         // infinite loop protection
 401         $loops = 0;
 402         while(true) {
 403
 404             // infinite loop protection
 405             if (++$loops > 1000) {
 406                 trigger_error('Infinite loop detected in attribute parsing', E_USER_WARNING);
 407                 return array();
 408             }
 409
 410             if ($cursor >= $size) {
 411                 break;
 412             }
 413
 414             $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
 415             // grab the key
 416
 417             $key_begin = $cursor; //we're currently at the start of the key
 418
 419             // scroll past all characters that are the key (not whitespace or =)
 420             $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
 421
 422             $key_end = $cursor; // now at the end of the key
 423
 424             $key = substr($string, $key_begin, $key_end - $key_begin);
 425
 426             if (!$key) {
 427                 if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 428                 $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
 429                 continue; // empty key
 430             }
 431
 432             // scroll past all whitespace
 433             $cursor += strspn($string, $this->_whitespace, $cursor);
 434
 435             if ($cursor >= $size) {
 436                 $array[$key] = $key;
 437                 break;
 438             }
 439
 440             // if the next character is an equal sign, we've got a regular
 441             // pair, otherwise, it's a bool attribute
 442             $first_char = @$string[$cursor];
 443
 444             if ($first_char == '=') {
 445                 // key="value"
 446
 447                 $cursor++;
 448                 $cursor += strspn($string, $this->_whitespace, $cursor);
 449
 450                 if ($cursor === false) {
 451                     $array[$key] = '';
 452                     break;
 453                 }
 454
 455                 // we might be in front of a quote right now
 456
 457                 $char = @$string[$cursor];
 458
 459                 if ($char == '"' || $char == "'") {
 460                     // it's quoted, end bound is $char
 461                     $cursor++;
 462                     $value_begin = $cursor;
 463                     $cursor = strpos($string, $char, $cursor);
 464                     $value_end = $cursor;
 465                 } else {
 466                     // it's not quoted, end bound is whitespace
 467                     $value_begin = $cursor;
 468                     $cursor += strcspn($string, $this->_whitespace, $cursor);
 469                     $value_end = $cursor;
 470                 }
 471
 472                 // we reached a premature end
 473                 if ($cursor === false) {
 474                     $cursor = $size;
 475                     $value_end = $cursor;
 476                 }
 477
 478                 $value = substr($string, $value_begin, $value_end - $value_begin);
 479                 if ($value === false) $value = '';
 480                 $array[$key] = $this->parseData($value);
 481                 $cursor++;
 482
 483             } else {
 484                 // boolattr
 485                 if ($key !== '') {
 486                     $array[$key] = $key;
 487                 } else {
 488                     // purely theoretical
 489                     if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
 490                 }
 491
 492             }
 493         }
 494         return $array;
 495     }
 496
 497 }
 498