library/HTMLPurifier/Lexer/DirectLex.php

   1 <?php
   2
   3 require_once 'HTMLPurifier/Lexer.php';
   4
   5 /**
   6  * Our in-house implementation of a parser.
   7  *
   8  * A pure PHP parser, DirectLex has absolutely no dependencies, making
   9  * it a reasonably good default for PHP4.  Written with efficiency in mind,
  10  * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
  11  * pales in comparison to HTMLPurifier_Lexer_DOMLex.  It will support UTF-8
  12  * completely eventually.
  13  *
  14  * @todo Reread XML spec and document differences.
  15  *
  16  * @todo Determine correct behavior in transforming comment data. (preserve dashes?)
  17  */
  18 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  19 {
  20
  21     /**
  22      * Whitespace characters for str(c)spn.
  23      * @protected
  24      */
  25     var $_whitespace = "\x20\x09\x0D\x0A";
  26
  27     function tokenizeHTML($html, $config, &$context) {
  28
  29         $html = $this->normalize($html, $config, $context);
  30
  31         $cursor = 0; // our location in the text
  32         $inside_tag = false; // whether or not we're parsing the inside of a tag
  33         $array = array(); // result array
  34
  35         // infinite loop protection
  36         // has to be pretty big, since html docs can be big
  37         // we're allow two hundred thousand tags... more than enough?
  38         $loops = 0;
  39
  40         while(true) {
  41
  42             // infinite loop protection
  43             if (++$loops > 200000) return array();
  44
  45             $position_next_lt = strpos($html, '<', $cursor);
  46             $position_next_gt = strpos($html, '>', $cursor);
  47
  48             // triggers on "<b>asdf</b>" but not "asdf <b></b>"
  49             if ($position_next_lt === $cursor) {
  50                 $inside_tag = true;
  51                 $cursor++;
  52             }
  53
  54             if (!$inside_tag && $position_next_lt !== false) {
  55                 // We are not inside tag and there still is another tag to parse
  56                 $array[] = new
  57                     HTMLPurifier_Token_Text(
  58                         $this->parseData(
  59                             substr(
  60                                 $html, $cursor, $position_next_lt - $cursor
  61                             )
  62                         )
  63                     );
  64                 $cursor  = $position_next_lt + 1;
  65                 $inside_tag = true;
  66                 continue;
  67             } elseif (!$inside_tag) {
  68                 // We are not inside tag but there are no more tags
  69                 // If we're already at the end, break
  70                 if ($cursor === strlen($html)) break;
  71                 // Create Text of rest of string
  72                 $array[] = new
  73                     HTMLPurifier_Token_Text(
  74                         $this->parseData(
  75                             substr(
  76                                 $html, $cursor
  77                             )
  78                         )
  79                     );
  80                 break;
  81             } elseif ($inside_tag && $position_next_gt !== false) {
  82                 // We are in tag and it is well formed
  83                 // Grab the internals of the tag
  84                 $strlen_segment = $position_next_gt - $cursor;
  85                 $segment = substr($html, $cursor, $strlen_segment);
  86
  87                 // Check if it's a comment
  88                 if (
  89                     substr($segment, 0, 3) == '!--' &&
  90                     substr($segment, $strlen_segment-2, 2) == '--'
  91                 ) {
  92                     $array[] = new
  93                         HTMLPurifier_Token_Comment(
  94                             substr(
  95                                 $segment, 3, $strlen_segment - 5
  96                             )
  97                         );
  98                     $inside_tag = false;
  99                     $cursor = $position_next_gt + 1;
 100                     continue;
 101                 }
 102
 103                 // Check if it's an end tag
 104                 $is_end_tag = (strpos($segment,'/') === 0);
 105                 if ($is_end_tag) {
 106                     $type = substr($segment, 1);
 107                     $array[] = new HTMLPurifier_Token_End($type);
 108                     $inside_tag = false;
 109                     $cursor = $position_next_gt + 1;
 110                     continue;
 111                 }
 112
 113                 // Check leading character is alnum, if not, we may
 114                 // have accidently grabbed an emoticon. Translate into
 115                 // text and go our merry way
 116                 if (!ctype_alnum($segment[0])) {
 117                     $array[] = new
 118                         HTMLPurifier_Token_Text(
 119                             '<' .
 120                             $this->parseData(
 121                                 $segment
 122                             ) .
 123                             '>'
 124                         );
 125                     $cursor = $position_next_gt + 1;
 126                     $inside_tag = false;
 127                     continue;
 128                 }
 129
 130                 // Check if it is explicitly self closing, if so, remove
 131                 // trailing slash. Remember, we could have a tag like <br>, so
 132                 // any later token processing scripts must convert improperly
 133                 // classified EmptyTags from StartTags.
 134                 $is_self_closing= (strpos($segment,'/') === $strlen_segment-1);
 135                 if ($is_self_closing) {
 136                     $strlen_segment--;
 137                     $segment = substr($segment, 0, $strlen_segment);
 138                 }
 139
 140                 // Check if there are any attributes
 141                 $position_first_space = strcspn($segment, $this->_whitespace);
 142
 143                 if ($position_first_space >= $strlen_segment) {
 144                     if ($is_self_closing) {
 145                         $array[] = new HTMLPurifier_Token_Empty($segment);
 146                     } else {
 147                         $array[] = new HTMLPurifier_Token_Start($segment);
 148                     }
 149                     $inside_tag = false;
 150                     $cursor = $position_next_gt + 1;
 151                     continue;
 152                 }
 153
 154                 // Grab out all the data
 155                 $type = substr($segment, 0, $position_first_space);
 156                 $attribute_string =
 157                     trim(
 158                         substr(
 159                             $segment, $position_first_space
 160                         )
 161                     );
 162                 if ($attribute_string) {
 163                     $attr = $this->parseAttributeString(
 164                                     $attribute_string
 165                                   , $config, $context
 166                               );
 167                 } else {
 168                     $attr = array();
 169                 }
 170
 171                 if ($is_self_closing) {
 172                     $array[] = new HTMLPurifier_Token_Empty($type, $attr);
 173                 } else {
 174                     $array[] = new HTMLPurifier_Token_Start($type, $attr);
 175                 }
 176                 $cursor = $position_next_gt + 1;
 177                 $inside_tag = false;
 178                 continue;
 179             } else {
 180                 $array[] = new
 181                     HTMLPurifier_Token_Text(
 182                         '<' .
 183                         $this->parseData(
 184                             substr($html, $cursor)
 185                         )
 186                     );
 187                 break;
 188             }
 189             break;
 190         }
 191         return $array;
 192     }
 193
 194     /**
 195      * Takes the inside of an HTML tag and makes an assoc array of attributes.
 196      *
 197      * @param $string Inside of tag excluding name.
 198      * @returns Assoc array of attributes.
 199      */
 200     function parseAttributeString($string, $config, &$context) {
 201         $string = (string) $string; // quick typecast
 202
 203         if ($string == '') return array(); // no attributes
 204
 205         // let's see if we can abort as quickly as possible
 206         // one equal sign, no spaces => one attribute
 207         $num_equal = substr_count($string, '=');
 208         $has_space = strpos($string, ' ');
 209         if ($num_equal === 0 && !$has_space) {
 210             // bool attribute
 211             return array($string => $string);
 212         } elseif ($num_equal === 1 && !$has_space) {
 213             // only one attribute
 214             list($key, $quoted_value) = explode('=', $string);
 215             $quoted_value = trim($quoted_value);
 216             if (!$key) return array();
 217             if (!$quoted_value) return array($key => '');
 218             $first_char = @$quoted_value[0];
 219             $last_char  = @$quoted_value[strlen($quoted_value)-1];
 220
 221             $same_quote = ($first_char == $last_char);
 222             $open_quote = ($first_char == '"' || $first_char == "'");
 223
 224             if ( $same_quote && $open_quote) {
 225                 // well behaved
 226                 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
 227             } else {
 228                 // not well behaved
 229                 if ($open_quote) {
 230                     $value = substr($quoted_value, 1);
 231                 } else {
 232                     $value = $quoted_value;
 233                 }
 234             }
 235             return array($key => $value);
 236         }
 237
 238         // setup loop environment
 239         $array  = array(); // return assoc array of attributes
 240         $cursor = 0; // current position in string (moves forward)
 241         $size   = strlen($string); // size of the string (stays the same)
 242
 243         // if we have unquoted attributes, the parser expects a terminating
 244         // space, so let's guarantee that there's always a terminating space.
 245         $string .= ' ';
 246
 247         // infinite loop protection
 248         $loops = 0;
 249
 250         while(true) {
 251
 252             // infinite loop protection
 253             if (++$loops > 1000) return array();
 254
 255             if ($cursor >= $size) {
 256                 break;
 257             }
 258
 259             $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
 260
 261             // grab the key
 262
 263             $key_begin = $cursor; //we're currently at the start of the key
 264
 265             // scroll past all characters that are the key (not whitespace or =)
 266             $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
 267
 268             $key_end = $cursor; // now at the end of the key
 269
 270             $key = substr($string, $key_begin, $key_end - $key_begin);
 271
 272             if (!$key) continue; // empty key
 273
 274             // scroll past all whitespace
 275             $cursor += strspn($string, $this->_whitespace, $cursor);
 276
 277             if ($cursor >= $size) {
 278                 $array[$key] = $key;
 279                 break;
 280             }
 281
 282             // if the next character is an equal sign, we've got a regular
 283             // pair, otherwise, it's a bool attribute
 284             $first_char = @$string[$cursor];
 285
 286             if ($first_char == '=') {
 287                 // key="value"
 288
 289                 $cursor++;
 290                 $cursor += strspn($string, $this->_whitespace, $cursor);
 291
 292                 // we might be in front of a quote right now
 293
 294                 $char = @$string[$cursor];
 295
 296                 if ($char == '"' || $char == "'") {
 297                     // it's quoted, end bound is $char
 298                     $cursor++;
 299                     $value_begin = $cursor;
 300                     $cursor = strpos($string, $char, $cursor);
 301                     $value_end = $cursor;
 302                 } else {
 303                     // it's not quoted, end bound is whitespace
 304                     $value_begin = $cursor;
 305                     $cursor += strcspn($string, $this->_whitespace, $cursor);
 306                     $value_end = $cursor;
 307                 }
 308
 309                 $value = substr($string, $value_begin, $value_end - $value_begin);
 310                 $array[$key] = $this->parseData($value);
 311                 $cursor++;
 312
 313             } else {
 314                 // boolattr
 315                 if ($key !== '') {
 316                     $array[$key] = $key;
 317                 }
 318
 319             }
 320         }
 321         return $array;
 322     }
 323
 324 }
 325
 326 ?>