library/HTMLPurifier/Lexer/DirectLex.php

   1 <?php
   2
   3 require_once 'HTMLPurifier/Lexer.php';
   4
   5 /**
   6  * Our in-house implementation of a parser.
   7  *
   8  * A pure PHP parser, DirectLex has absolutely no dependencies, making
   9  * it a reasonably good default for PHP4.  Written with efficiency in mind,
  10  * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
  11  * pales in comparison to HTMLPurifier_Lexer_DOMLex.  It will support UTF-8
  12  * completely eventually.
  13  *
  14  * @todo Reread XML spec and document differences.
  15  * @todo Add support for CDATA sections.
  16  * @todo Determine correct behavior in outputting comment data. (preserve dashes?)
  17  * @todo Optimize main function tokenizeHTML().
  18  * @todo Less than sign (<) being prohibited (even as entity) in attr-values?
  19  */
  20 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  21 {
  22
  23     /**
  24      * Most common entity to raw value conversion table for special entities.
  25      * @protected
  26      */
  27     var $_special_entity2str =
  28             array(
  29                     '&quot;' => '"',
  30                     '&amp;'  => '&',
  31                     '&lt;'   => '<',
  32                     '&gt;'   => '>',
  33                     '&#39;'  => "'",
  34                     '&#039;' => "'",
  35                     '&#x27;' => "'"
  36             );
  37
  38     /**
  39      * Parses special entities into the proper characters.
  40      *
  41      * This string will translate escaped versions of the special characters
  42      * into the correct ones.
  43      *
  44      * @warning
  45      * You should be able to treat the output of this function as
  46      * completely parsed, but that's only because all other entities should
  47      * have been handled previously in substituteNonSpecialEntities()
  48      *
  49      * @param $string String character data to be parsed.
  50      * @returns Parsed character data.
  51      */
  52     function parseData($string) {
  53
  54         // subtracts amps that cannot possibly be escaped
  55         $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
  56             ($string[strlen($string)-1] === '&' ? 1 : 0);
  57
  58         if (!$num_amp) return $string; // abort if no entities
  59         $num_esc_amp = substr_count($string, '&amp;');
  60         $string = strtr($string, $this->_special_entity2str);
  61
  62         // code duplication for sake of optimization, see above
  63         $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
  64             ($string[strlen($string)-1] === '&' ? 1 : 0);
  65
  66         if ($num_amp_2 <= $num_esc_amp) return $string;
  67
  68         // hmm... now we have some uncommon entities. Use the callback.
  69         $string = $this->_encoder->substituteSpecialEntities($string);
  70         return $string;
  71     }
  72
  73     /**
  74      * Whitespace characters for str(c)spn.
  75      * @protected
  76      */
  77     var $_whitespace = "\x20\x09\x0D\x0A";
  78
  79     function tokenizeHTML($string, $config = null) {
  80
  81         if (!$config) $config = HTMLPurifier_Config::createDefault();
  82
  83         // some quick checking (if empty, return empty)
  84         $string = @ (string) $string;
  85         if ($string == '') return array();
  86
  87         if ($config->get('Core', 'AcceptFullDocuments')) {
  88             $string = $this->extractBody($string);
  89         }
  90
  91         $cursor = 0; // our location in the text
  92         $inside_tag = false; // whether or not we're parsing the inside of a tag
  93         $array = array(); // result array
  94
  95         // escape CDATA
  96         $string = $this->escapeCDATA($string);
  97
  98         // expand entities THAT AREN'T THE BIG FIVE
  99         $string = $this->_encoder->substituteNonSpecialEntities($string);
 100
 101         // clean it into wellformed UTF-8 string
 102         $string = $this->_encoder->cleanUTF8($string);
 103
 104         // infinite loop protection
 105         // has to be pretty big, since html docs can be big
 106         // we're allow two hundred thousand tags... more than enough?
 107         $loops = 0;
 108
 109         while(true) {
 110
 111             // infinite loop protection
 112             if (++$loops > 200000) return array();
 113
 114             $position_next_lt = strpos($string, '<', $cursor);
 115             $position_next_gt = strpos($string, '>', $cursor);
 116
 117             // triggers on "<b>asdf</b>" but not "asdf <b></b>"
 118             if ($position_next_lt === $cursor) {
 119                 $inside_tag = true;
 120                 $cursor++;
 121             }
 122
 123             if (!$inside_tag && $position_next_lt !== false) {
 124                 // We are not inside tag and there still is another tag to parse
 125                 $array[] = new
 126                     HTMLPurifier_Token_Text(
 127                         $this->parseData(
 128                             substr(
 129                                 $string, $cursor, $position_next_lt - $cursor
 130                             )
 131                         )
 132                     );
 133                 $cursor  = $position_next_lt + 1;
 134                 $inside_tag = true;
 135                 continue;
 136             } elseif (!$inside_tag) {
 137                 // We are not inside tag but there are no more tags
 138                 // If we're already at the end, break
 139                 if ($cursor === strlen($string)) break;
 140                 // Create Text of rest of string
 141                 $array[] = new
 142                     HTMLPurifier_Token_Text(
 143                         $this->parseData(
 144                             substr(
 145                                 $string, $cursor
 146                             )
 147                         )
 148                     );
 149                 break;
 150             } elseif ($inside_tag && $position_next_gt !== false) {
 151                 // We are in tag and it is well formed
 152                 // Grab the internals of the tag
 153                 $strlen_segment = $position_next_gt - $cursor;
 154                 $segment = substr($string, $cursor, $strlen_segment);
 155
 156                 // Check if it's a comment
 157                 if (
 158                     substr($segment, 0, 3) == '!--' &&
 159                     substr($segment, $strlen_segment-2, 2) == '--'
 160                 ) {
 161                     $array[] = new
 162                         HTMLPurifier_Token_Comment(
 163                             substr(
 164                                 $segment, 3, $strlen_segment - 5
 165                             )
 166                         );
 167                     $inside_tag = false;
 168                     $cursor = $position_next_gt + 1;
 169                     continue;
 170                 }
 171
 172                 // Check if it's an end tag
 173                 $is_end_tag = (strpos($segment,'/') === 0);
 174                 if ($is_end_tag) {
 175                     $type = substr($segment, 1);
 176                     $array[] = new HTMLPurifier_Token_End($type);
 177                     $inside_tag = false;
 178                     $cursor = $position_next_gt + 1;
 179                     continue;
 180                 }
 181
 182                 // Check if it is explicitly self closing, if so, remove
 183                 // trailing slash. Remember, we could have a tag like <br>, so
 184                 // any later token processing scripts must convert improperly
 185                 // classified EmptyTags from StartTags.
 186                 $is_self_closing= (strpos($segment,'/') === $strlen_segment-1);
 187                 if ($is_self_closing) {
 188                     $strlen_segment--;
 189                     $segment = substr($segment, 0, $strlen_segment);
 190                 }
 191
 192                 // Check if there are any attributes
 193                 $position_first_space = strcspn($segment, $this->_whitespace);
 194
 195                 if ($position_first_space >= $strlen_segment) {
 196                     if ($is_self_closing) {
 197                         $array[] = new HTMLPurifier_Token_Empty($segment);
 198                     } else {
 199                         $array[] = new HTMLPurifier_Token_Start($segment);
 200                     }
 201                     $inside_tag = false;
 202                     $cursor = $position_next_gt + 1;
 203                     continue;
 204                 }
 205
 206                 // Grab out all the data
 207                 $type = substr($segment, 0, $position_first_space);
 208                 $attribute_string =
 209                     trim(
 210                         substr(
 211                             $segment, $position_first_space
 212                         )
 213                     );
 214                 if ($attribute_string) {
 215                     $attributes = $this->parseAttributeString(
 216                                         $attribute_string
 217                                   );
 218                 } else {
 219                     $attributes = array();
 220                 }
 221
 222                 if ($is_self_closing) {
 223                     $array[] = new HTMLPurifier_Token_Empty($type, $attributes);
 224                 } else {
 225                     $array[] = new HTMLPurifier_Token_Start($type, $attributes);
 226                 }
 227                 $cursor = $position_next_gt + 1;
 228                 $inside_tag = false;
 229                 continue;
 230             } else {
 231                 $array[] = new
 232                     HTMLPurifier_Token_Text(
 233                         '<' .
 234                         $this->parseData(
 235                             substr($string, $cursor)
 236                         )
 237                     );
 238                 break;
 239             }
 240             break;
 241         }
 242         return $array;
 243     }
 244
 245     /**
 246      * Takes the inside of an HTML tag and makes an assoc array of attributes.
 247      *
 248      * @param $string Inside of tag excluding name.
 249      * @returns Assoc array of attributes.
 250      */
 251     function parseAttributeString($string) {
 252         $string = (string) $string; // quick typecast
 253
 254         if ($string == '') return array(); // no attributes
 255
 256         // let's see if we can abort as quickly as possible
 257         // one equal sign, no spaces => one attribute
 258         $num_equal = substr_count($string, '=');
 259         $has_space = strpos($string, ' ');
 260         if ($num_equal === 0 && !$has_space) {
 261             // bool attribute
 262             return array($string => $string);
 263         } elseif ($num_equal === 1 && !$has_space) {
 264             // only one attribute
 265             list($key, $quoted_value) = explode('=', $string);
 266             $quoted_value = trim($quoted_value);
 267             if (!$key) return array();
 268             if (!$quoted_value) return array($key => '');
 269             $first_char = @$quoted_value[0];
 270             $last_char  = @$quoted_value[strlen($quoted_value)-1];
 271
 272             $same_quote = ($first_char == $last_char);
 273             $open_quote = ($first_char == '"' || $first_char == "'");
 274
 275             if ( $same_quote && $open_quote) {
 276                 // well behaved
 277                 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
 278             } else {
 279                 // not well behaved
 280                 if ($open_quote) {
 281                     $value = substr($quoted_value, 1);
 282                 } else {
 283                     $value = $quoted_value;
 284                 }
 285             }
 286             return array($key => $value);
 287         }
 288
 289         // setup loop environment
 290         $array  = array(); // return assoc array of attributes
 291         $cursor = 0; // current position in string (moves forward)
 292         $size   = strlen($string); // size of the string (stays the same)
 293
 294         // if we have unquoted attributes, the parser expects a terminating
 295         // space, so let's guarantee that there's always a terminating space.
 296         $string .= ' ';
 297
 298         // infinite loop protection
 299         $loops = 0;
 300
 301         while(true) {
 302
 303             // infinite loop protection
 304             if (++$loops > 1000) return array();
 305
 306             if ($cursor >= $size) {
 307                 break;
 308             }
 309
 310             $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
 311
 312             // grab the key
 313
 314             $key_begin = $cursor; //we're currently at the start of the key
 315
 316             // scroll past all characters that are the key (not whitespace or =)
 317             $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
 318
 319             $key_end = $cursor; // now at the end of the key
 320
 321             $key = substr($string, $key_begin, $key_end - $key_begin);
 322
 323             if (!$key) continue; // empty key
 324
 325             // scroll past all whitespace
 326             $cursor += strspn($string, $this->_whitespace, $cursor);
 327
 328             if ($cursor >= $size) {
 329                 $array[$key] = $key;
 330                 break;
 331             }
 332
 333             // if the next character is an equal sign, we've got a regular
 334             // pair, otherwise, it's a bool attribute
 335             $first_char = @$string[$cursor];
 336
 337             if ($first_char == '=') {
 338                 // key="value"
 339
 340                 $cursor++;
 341                 $cursor += strspn($string, $this->_whitespace, $cursor);
 342
 343                 // we might be in front of a quote right now
 344
 345                 $char = @$string[$cursor];
 346
 347                 if ($char == '"' || $char == "'") {
 348                     // it's quoted, end bound is $char
 349                     $cursor++;
 350                     $value_begin = $cursor;
 351                     $cursor = strpos($string, $char, $cursor);
 352                     $value_end = $cursor;
 353                 } else {
 354                     // it's not quoted, end bound is whitespace
 355                     $value_begin = $cursor;
 356                     $cursor += strcspn($string, $this->_whitespace, $cursor);
 357                     $value_end = $cursor;
 358                 }
 359
 360                 $value = substr($string, $value_begin, $value_end - $value_begin);
 361                 $array[$key] = $this->parseData($value);
 362                 $cursor++;
 363
 364             } else {
 365                 // boolattr
 366                 if ($key !== '') {
 367                     $array[$key] = $key;
 368                 }
 369
 370             }
 371         }
 372         return $array;
 373     }
 374
 375 }
 376
 377 ?>