From 1de30882762df9e8c7f618a5819a0cbeec96b84e Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Tue, 29 Aug 2006 19:36:40 +0000 Subject: [PATCH] Refactor encoding and entity specific processing to HTMLPurifier_Encoder. We also need to refactor the escaping to this class too. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@339 48356398-32a2-884e-a903-53898d9a118a --- docs/examples/demo.php | 2 +- library/HTMLPurifier.php | 42 -- library/HTMLPurifier/{Lexer.php => Encoder.php} | 220 +++------ library/HTMLPurifier/Lexer.php | 629 ++++++------------------ library/HTMLPurifier/Lexer/DOMLex.php | 5 +- library/HTMLPurifier/Lexer/DirectLex.php | 66 +-- library/HTMLPurifier/Lexer/PEARSax3.php | 4 +- smoketests/common.php | 2 +- tests/HTMLPurifier/EncoderTest.php | 96 ++++ tests/HTMLPurifier/Lexer/DirectLexTest.php | 7 - tests/HTMLPurifier/LexerTest.php | 73 --- tests/index.php | 1 + 12 files changed, 333 insertions(+), 814 deletions(-) copy library/HTMLPurifier/{Lexer.php => Encoder.php} (71%) rewrite library/HTMLPurifier/Lexer.php (72%) create mode 100644 tests/HTMLPurifier/EncoderTest.php diff --git a/docs/examples/demo.php b/docs/examples/demo.php index b0a80d52..07630078 100644 --- a/docs/examples/demo.php +++ b/docs/examples/demo.php @@ -62,7 +62,7 @@ if (isset($_GET['profile']) || isset($_GET['XDEBUG_PROFILE'])) { if (isset($html)) { echo htmlspecialchars( - HTMLPurifier_Lexer::cleanUTF8($html), ENT_COMPAT, 'UTF-8'); + HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8'); } ?>
diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php index c6611add..7596b0ed 100644 --- a/library/HTMLPurifier.php +++ b/library/HTMLPurifier.php @@ -28,48 +28,6 @@ require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/Strategy/Core.php'; -/* - -// Darn you fellas still using ISO-8859-1! It would be so easy for me -// to just drop the characters that can't be expressed this way, but I'm -// a stickler for code quality, so I won't do that to you. You'll have -// to wait for this functionality to be implemented later. - -HTMLPurifier_ConfigDef::define( - 'Core', 'Encoding', 'utf-8', 'istring', - 'Set this to the encoding your webpages are served as. This defines '. - 'the encoding the HTMLPurifier will convert to and from before passing '. - 'the text back to you. Note that although we offer full HTML document '. - 'parsing functionality, we ignore meta tags in such documents, because '. - 'most modern browsers have already re-encoded the file in the correct '. - 'encoding (though it did not change the meta tag). '. - 'Since browsers do not alter file uploads, '. - 'HTML from a file will fail fantastically if its real encoding is does '. - 'match the encoding passed here (which is often the case).' -); - -if ( !function_exists('iconv') ) { - - // these are the only encodings we offer native PHP support for. - // if iconv is enabled, iconv's encoding support dictates what we can - // use. - - HTMLPurifier_ConfigDef::defineAllowedValues( - 'Core', 'Encoding', array( - 'utf-8', - 'iso-8859-1' - ) - ); - HTMLPurifier_ConfigDef::defineValueAliases( - 'Core', 'Encoding', array( - 'iso8859-1' => 'iso-8859-1' - ) - ); - -} - -*/ - /** * Main library execution class. * diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Encoder.php similarity index 71% copy from library/HTMLPurifier/Lexer.php copy to library/HTMLPurifier/Encoder.php index 1dd984b6..ed02b4ef 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Encoder.php @@ -1,105 +1,23 @@ =')) { - require_once 'HTMLPurifier/Lexer/DOMLex.php'; - $lexer = new HTMLPurifier_Lexer_DOMLex(); - } else { - require_once 'HTMLPurifier/Lexer/DirectLex.php'; - $lexer = new HTMLPurifier_Lexer_DirectLex(); - } - } - return $lexer; - } + * Callback regex string for parsing entities. + * @protected + */ + var $_substituteEntitiesRegex = +'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/'; +// 1. hex 2. dec 3. string + /** * Decimal to parsed string conversion table for special entities. @@ -127,29 +45,6 @@ class HTMLPurifier_Lexer ); /** - * Most common entity to raw value conversion table for special entities. - * @protected - */ - var $_special_entity2str = - array( - '"' => '"', - '&' => '&', - '<' => '<', - '>' => '>', - ''' => "'", - ''' => "'", - ''' => "'" - ); - - /** - * Callback regex string for parsing entities. - * @protected - */ - var $_substituteEntitiesRegex = -'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/'; -// 1. hex 2. dec 3. string - - /** * Substitutes non-special entities with their parsed equivalents. Since * running this whenever you have parsed character is t3h 5uck, we run * it before everything else. @@ -258,54 +153,6 @@ class HTMLPurifier_Lexer } /** - * Contains a copy of the EntityLookup table. - * @protected - */ - var $_entity_lookup; - - /** - * Translates CDATA sections into regular sections (through escaping). - * - * @protected - * @param $string HTML string to process. - * @returns HTML with CDATA sections escaped. - */ - function escapeCDATA($string) { - return preg_replace_callback( - '//', - array('HTMLPurifier_Lexer', 'CDATACallback'), - $string - ); - } - - /** - * Callback function for escapeCDATA() that does the work. - * - * @warning Though this is public in order to let the callback happen, - * calling it directly is not recommended. - * @params $matches PCRE matches array, with index 0 the entire match - * and 1 the inside of the CDATA section. - * @returns Escaped internals of the CDATA section. - */ - function CDATACallback($matches) { - // not exactly sure why the character set is needed, but whatever - return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8'); - } - - /** - * Takes a string of HTML (fragment or document) and returns the content - */ - function extractBody($html) { - $matches = array(); - $result = preg_match('!]*>(.+?)!is', $html, $matches); - if ($result) { - return $matches[1]; - } else { - return $html; - } - } - - /** * Cleans a UTF-8 string for well-formedness and SGML validity * * It will parse according to UTF-8 and return a valid UTF8 string, with @@ -469,6 +316,51 @@ class HTMLPurifier_Lexer return $out; } + /** + * Substitutes only special entities with their parsed equivalents. + * + * @notice We try to avoid calling this function because otherwise, it + * would have to be called a lot (for every parsed section). + * + * @protected + * @param $string String to have non-special entities parsed. + * @returns Parsed string. + */ + function substituteSpecialEntities($string) { + return preg_replace_callback( + $this->_substituteEntitiesRegex, + array('HTMLPurifier_Encoder', 'specialEntityCallback'), + $string); + } + + /** + * Callback function for substituteSpecialEntities() that does the work. + * + * This callback has same syntax as nonSpecialEntityCallback(). + * + * @warning Though this is public in order to let the callback happen, + * calling it directly is not recommended. + * @param $matches PCRE-style matches array, with 0 the entire match, and + * either index 1, 2 or 3 set with a hex value, dec value, + * or string (respectively). + * @returns Replacement string. + */ + function specialEntityCallback($matches) { + $entity = $matches[0]; + $is_num = (@$matches[0][1] === '#'); + if ($is_num) { + $is_hex = (@$entity[2] === 'x'); + $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; + return isset($this->_special_dec2str[$int]) ? + $this->_special_dec2str[$int] : + $entity; + } else { + return isset($this->_special_ent2dec[$matches[3]]) ? + $this->_special_ent2dec[$matches[3]] : + $entity; + } + } + } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php dissimilarity index 72% index 1dd984b6..031e8e3d 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -1,474 +1,155 @@ -=')) { - require_once 'HTMLPurifier/Lexer/DOMLex.php'; - $lexer = new HTMLPurifier_Lexer_DOMLex(); - } else { - require_once 'HTMLPurifier/Lexer/DirectLex.php'; - $lexer = new HTMLPurifier_Lexer_DirectLex(); - } - } - return $lexer; - } - - /** - * Decimal to parsed string conversion table for special entities. - * @protected - */ - var $_special_dec2str = - array( - 34 => '"', - 38 => '&', - 39 => "'", - 60 => '<', - 62 => '>' - ); - - /** - * Stripped entity names to decimal conversion table for special entities. - * @protected - */ - var $_special_ent2dec = - array( - 'quot' => 34, - 'amp' => 38, - 'lt' => 60, - 'gt' => 62 - ); - - /** - * Most common entity to raw value conversion table for special entities. - * @protected - */ - var $_special_entity2str = - array( - '"' => '"', - '&' => '&', - '<' => '<', - '>' => '>', - ''' => "'", - ''' => "'", - ''' => "'" - ); - - /** - * Callback regex string for parsing entities. - * @protected - */ - var $_substituteEntitiesRegex = -'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/'; -// 1. hex 2. dec 3. string - - /** - * Substitutes non-special entities with their parsed equivalents. Since - * running this whenever you have parsed character is t3h 5uck, we run - * it before everything else. - * - * @protected - * @param $string String to have non-special entities parsed. - * @returns Parsed string. - */ - function substituteNonSpecialEntities($string) { - // it will try to detect missing semicolons, but don't rely on it - return preg_replace_callback( - $this->_substituteEntitiesRegex, - array($this, 'nonSpecialEntityCallback'), - $string - ); - } - - /** - * Callback function for substituteNonSpecialEntities() that does the work. - * - * @warning Though this is public in order to let the callback happen, - * calling it directly is not recommended. - * @note Based on Feyd's function at - * , - * which is in public domain. - * @note While we're going to do code point parsing anyway, a good - * optimization would be to refuse to translate code points that - * are non-SGML characters. However, this could lead to duplication. - * @param $matches PCRE matches array, with 0 the entire match, and - * either index 1, 2 or 3 set with a hex value, dec value, - * or string (respectively). - * @returns Replacement string. - * @todo Implement string translations - */ - - // +----------+----------+----------+----------+ - // | 33222222 | 22221111 | 111111 | | - // | 10987654 | 32109876 | 54321098 | 76543210 | bit - // +----------+----------+----------+----------+ - // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F - // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF - // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF - // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF - // +----------+----------+----------+----------+ - // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF) - // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes - // +----------+----------+----------+----------+ - - function nonSpecialEntityCallback($matches) { - // replaces all but big five - $entity = $matches[0]; - $is_num = (@$matches[0][1] === '#'); - if ($is_num) { - $is_hex = (@$entity[2] === 'x'); - $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; - - // abort for special characters - if (isset($this->_special_dec2str[$code])) return $entity; - - if($code > 1114111 or $code < 0 or - ($code >= 55296 and $code <= 57343) ) { - // bits are set outside the "valid" range as defined - // by UNICODE 4.1.0 - return ''; - } - - $x = $y = $z = $w = 0; - if ($code < 128) { - // regular ASCII character - $x = $code; - } else { - // set up bits for UTF-8 - $x = ($code & 63) | 128; - if ($code < 2048) { - $y = (($code & 2047) >> 6) | 192; - } else { - $y = (($code & 4032) >> 6) | 128; - if($code < 65536) { - $z = (($code >> 12) & 15) | 224; - } else { - $z = (($code >> 12) & 63) | 128; - $w = (($code >> 18) & 7) | 240; - } - } - } - // set up the actual character - $ret = ''; - if($w) $ret .= chr($w); - if($z) $ret .= chr($z); - if($y) $ret .= chr($y); - $ret .= chr($x); - - return $ret; - } else { - if (isset($this->_special_ent2dec[$matches[3]])) return $entity; - if (!$this->_entity_lookup) { - require_once 'HTMLPurifier/EntityLookup.php'; - $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); - } - if (isset($this->_entity_lookup->table[$matches[3]])) { - return $this->_entity_lookup->table[$matches[3]]; - } else { - return $entity; - } - } - } - - /** - * Contains a copy of the EntityLookup table. - * @protected - */ - var $_entity_lookup; - - /** - * Translates CDATA sections into regular sections (through escaping). - * - * @protected - * @param $string HTML string to process. - * @returns HTML with CDATA sections escaped. - */ - function escapeCDATA($string) { - return preg_replace_callback( - '//', - array('HTMLPurifier_Lexer', 'CDATACallback'), - $string - ); - } - - /** - * Callback function for escapeCDATA() that does the work. - * - * @warning Though this is public in order to let the callback happen, - * calling it directly is not recommended. - * @params $matches PCRE matches array, with index 0 the entire match - * and 1 the inside of the CDATA section. - * @returns Escaped internals of the CDATA section. - */ - function CDATACallback($matches) { - // not exactly sure why the character set is needed, but whatever - return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8'); - } - - /** - * Takes a string of HTML (fragment or document) and returns the content - */ - function extractBody($html) { - $matches = array(); - $result = preg_match('!]*>(.+?)!is', $html, $matches); - if ($result) { - return $matches[1]; - } else { - return $html; - } - } - - /** - * Cleans a UTF-8 string for well-formedness and SGML validity - * - * It will parse according to UTF-8 and return a valid UTF8 string, with - * non-SGML codepoints excluded. - * - * @warning This function can find a lot of use, so we may be moving - * it to a dedicated class. - * - * @note Just for reference, the non-SGML code points are 0 to 31 and - * 127 to 159, inclusive. However, we allow code points 9, 10 - * and 13, which are the tab, line feed and carriage return - * respectively. 128 and above the code points map to multibyte - * UTF-8 representations. - * - * @note The functionality provided by the original function could be - * implemented with iconv using 'UTF-8//IGNORE', mbstring, or - * even the PCRE modifier 'u', these do not allow us to strip - * control characters or disallowed code points, and the latter - * does not allow invalid UTF8 characters to be ignored. - * - * @note Decomposing the string into Unicode code points is necessary - * because SGML disallows the use of specific code points, not - * necessarily bytes. A naive implementation that simply strtr - * disallowed code points as bytes will break other Unicode - * characters in which using such bytes is valid. - * - * @note Code adapted from utf8ToUnicode by Henri Sivonen and - * hsivonen@iki.fi at under the - * LGPL license. Notes on what changed are inside. - */ - function cleanUTF8($str) { - $mState = 0; // cached expected number of octets after the current octet - // until the beginning of the next UTF8 character sequence - $mUcs4 = 0; // cached Unicode character - $mBytes = 1; // cached expected number of octets in the current sequence - - // original code involved an $out that was an array of Unicode - // codepoints. Instead of having to convert back into UTF-8, we've - // decided to directly append valid UTF-8 characters onto a string - // $out once they're done. $char accumulates raw bytes, while $mUcs4 - // turns into the Unicode code point, so there's some redundancy. - - $out = ''; - $char = ''; - - $len = strlen($str); - for($i = 0; $i < $len; $i++) { - $in = ord($str{$i}); - $char .= $str[$i]; // append byte to char - if (0 == $mState) { - // When mState is zero we expect either a US-ASCII character - // or a multi-octet sequence. - if (0 == (0x80 & ($in))) { - // US-ASCII, pass straight through. - if (($in <= 31 || $in == 127) && - !($in == 9 || $in == 13 || $in == 10) // save \r\t\n - ) { - // control characters, remove - } else { - $out .= $char; - } - // reset - $char = ''; - $mBytes = 1; - } elseif (0xC0 == (0xE0 & ($in))) { - // First octet of 2 octet sequence - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 0x1F) << 6; - $mState = 1; - $mBytes = 2; - } elseif (0xE0 == (0xF0 & ($in))) { - // First octet of 3 octet sequence - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 0x0F) << 12; - $mState = 2; - $mBytes = 3; - } elseif (0xF0 == (0xF8 & ($in))) { - // First octet of 4 octet sequence - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 0x07) << 18; - $mState = 3; - $mBytes = 4; - } elseif (0xF8 == (0xFC & ($in))) { - // First octet of 5 octet sequence. - // - // This is illegal because the encoded codepoint must be - // either: - // (a) not the shortest form or - // (b) outside the Unicode range of 0-0x10FFFF. - // Rather than trying to resynchronize, we will carry on - // until the end of the sequence and let the later error - // handling code catch it. - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 0x03) << 24; - $mState = 4; - $mBytes = 5; - } elseif (0xFC == (0xFE & ($in))) { - // First octet of 6 octet sequence, see comments for 5 - // octet sequence. - $mUcs4 = ($in); - $mUcs4 = ($mUcs4 & 1) << 30; - $mState = 5; - $mBytes = 6; - } else { - // Current octet is neither in the US-ASCII range nor a - // legal first octet of a multi-octet sequence. - $mState = 0; - $mUcs4 = 0; - $mBytes = 1; - $char = ''; - } - } else { - // When mState is non-zero, we expect a continuation of the - // multi-octet sequence - if (0x80 == (0xC0 & ($in))) { - // Legal continuation. - $shift = ($mState - 1) * 6; - $tmp = $in; - $tmp = ($tmp & 0x0000003F) << $shift; - $mUcs4 |= $tmp; - - if (0 == --$mState) { - // End of the multi-octet sequence. mUcs4 now contains - // the final Unicode codepoint to be output - - // Check for illegal sequences and codepoints. - - // From Unicode 3.1, non-shortest form is illegal - if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || - ((3 == $mBytes) && ($mUcs4 < 0x0800)) || - ((4 == $mBytes) && ($mUcs4 < 0x10000)) || - (4 < $mBytes) || - // From Unicode 3.2, surrogate characters = illegal - (($mUcs4 & 0xFFFFF800) == 0xD800) || - // Codepoints outside the Unicode range are illegal - ($mUcs4 > 0x10FFFF) - ) { - - } elseif (0xFEFF != $mUcs4 && // omit BOM - !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML - ) { - $out .= $char; - } - // initialize UTF8 cache (reset) - $mState = 0; - $mUcs4 = 0; - $mBytes = 1; - $char = ''; - } - } else { - // ((0xC0 & (*in) != 0x80) && (mState != 0)) - // Incomplete multi-octet sequence. - // used to result in complete fail, but we'll reset - $mState = 0; - $mUcs4 = 0; - $mBytes = 1; - $char =''; - } - } - } - return $out; - } - -} - -?> \ No newline at end of file +_encoder = new HTMLPurifier_Encoder(); + } + + var $_encoder; + + /** + * Lexes an HTML string into tokens. + * + * @param $string String HTML. + * @return HTMLPurifier_Token array representation of HTML. + */ + function tokenizeHTML($string, $config = null) { + trigger_error('Call to abstract class', E_USER_ERROR); + } + + /** + * Retrieves or sets the default Lexer as a Prototype Factory. + * + * Depending on what PHP version you are running, the abstract base + * Lexer class will determine which concrete Lexer is best for you: + * HTMLPurifier_Lexer_DirectLex for PHP 4, and HTMLPurifier_Lexer_DOMLex + * for PHP 5 and beyond. + * + * Passing the optional prototype lexer parameter will override the + * default with your own implementation. A copy/reference of the prototype + * lexer will now be returned when you request a new lexer. + * + * @note + * Though it is possible to call this factory method from subclasses, + * such usage is not recommended. + * + * @param $prototype Optional prototype lexer. + * @return Concrete lexer. + */ + function create($prototype = null) { + // we don't really care if it's a reference or a copy + static $lexer = null; + if ($prototype) { + $lexer = $prototype; + } + if (empty($lexer)) { + if (version_compare(PHP_VERSION, '5', '>=')) { + require_once 'HTMLPurifier/Lexer/DOMLex.php'; + $lexer = new HTMLPurifier_Lexer_DOMLex(); + } else { + require_once 'HTMLPurifier/Lexer/DirectLex.php'; + $lexer = new HTMLPurifier_Lexer_DirectLex(); + } + } + return $lexer; + } + + /** + * Translates CDATA sections into regular sections (through escaping). + * + * @protected + * @param $string HTML string to process. + * @returns HTML with CDATA sections escaped. + */ + function escapeCDATA($string) { + return preg_replace_callback( + '//', + array('HTMLPurifier_Lexer', 'CDATACallback'), + $string + ); + } + + /** + * Callback function for escapeCDATA() that does the work. + * + * @warning Though this is public in order to let the callback happen, + * calling it directly is not recommended. + * @params $matches PCRE matches array, with index 0 the entire match + * and 1 the inside of the CDATA section. + * @returns Escaped internals of the CDATA section. + */ + function CDATACallback($matches) { + // not exactly sure why the character set is needed, but whatever + return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8'); + } + + /** + * Takes a string of HTML (fragment or document) and returns the content + */ + function extractBody($html) { + $matches = array(); + $result = preg_match('!]*>(.+?)!is', $html, $matches); + if ($result) { + return $matches[1]; + } else { + return $html; + } + } + +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 230e694e..e408fa84 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -30,6 +30,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer public function __construct() { // setup the factory + parent::HTMLPurifier_Lexer(); $this->factory = new HTMLPurifier_TokenFactory(); } @@ -50,10 +51,10 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer // substitute non-special entities. While DOM is perfectly capable // of doing this, we need to get at the UTF-8 characters in // cleanUTF8 - $string = $this->substituteNonSpecialEntities($string); + $string = $this->_encoder->substituteNonSpecialEntities($string); // clean it into well-formed UTF-8 string - $string = $this->cleanUTF8($string); + $string = $this->_encoder->cleanUTF8($string); // preprocess string, essential for UTF-8 $string = diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index aa1250df..6951c491 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -21,6 +21,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer { /** + * Most common entity to raw value conversion table for special entities. + * @protected + */ + var $_special_entity2str = + array( + '"' => '"', + '&' => '&', + '<' => '<', + '>' => '>', + ''' => "'", + ''' => "'", + ''' => "'" + ); + + /** * Parses special entities into the proper characters. * * This string will translate escaped versions of the special characters @@ -51,7 +66,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if ($num_amp_2 <= $num_esc_amp) return $string; // hmm... now we have some uncommon entities. Use the callback. - $string = $this->substituteSpecialEntities($string); + $string = $this->_encoder->substituteSpecialEntities($string); return $string; } @@ -61,51 +76,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer */ var $_whitespace = "\x20\x09\x0D\x0A"; - /** - * Substitutes only special entities with their parsed equivalents. - * - * @notice We try to avoid calling this function because otherwise, it - * would have to be called a lot (for every parsed section). - * - * @protected - * @param $string String to have non-special entities parsed. - * @returns Parsed string. - */ - function substituteSpecialEntities($string) { - return preg_replace_callback( - $this->_substituteEntitiesRegex, - array('HTMLPurifier_Lexer_DirectLex', 'specialEntityCallback'), - $string); - } - - /** - * Callback function for substituteSpecialEntities() that does the work. - * - * This callback has same syntax as nonSpecialEntityCallback(). - * - * @warning Though this is public in order to let the callback happen, - * calling it directly is not recommended. - * @param $matches PCRE-style matches array, with 0 the entire match, and - * either index 1, 2 or 3 set with a hex value, dec value, - * or string (respectively). - * @returns Replacement string. - */ - function specialEntityCallback($matches) { - $entity = $matches[0]; - $is_num = (@$matches[0][1] === '#'); - if ($is_num) { - $is_hex = (@$entity[2] === 'x'); - $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; - return isset($this->_special_dec2str[$int]) ? - $this->_special_dec2str[$int] : - $entity; - } else { - return isset($this->_special_ent2dec[$matches[3]]) ? - $this->_special_ent2dec[$matches[3]] : - $entity; - } - } - function tokenizeHTML($string, $config = null) { if (!$config) $config = HTMLPurifier_Config::createDefault(); @@ -126,10 +96,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $string = $this->escapeCDATA($string); // expand entities THAT AREN'T THE BIG FIVE - $string = $this->substituteNonSpecialEntities($string); + $string = $this->_encoder->substituteNonSpecialEntities($string); // clean it into wellformed UTF-8 string - $string = $this->cleanUTF8($string); + $string = $this->_encoder->cleanUTF8($string); // infinite loop protection // has to be pretty big, since html docs can be big diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index a0bdfe66..c042d2f9 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -35,8 +35,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer if ($config->get('Core', 'AcceptFullDocuments')) { $string = $this->extractBody($string); } - $string = $this->substituteNonSpecialEntities($string); - $string = $this->cleanUTF8($string); + $string = $this->_encoder->substituteNonSpecialEntities($string); + $string = $this->_encoder->cleanUTF8($string); $parser=& new XML_HTMLSax3(); $parser->set_object($this); $parser->set_element_handler('openHandler','closeHandler'); diff --git a/smoketests/common.php b/smoketests/common.php index a6a8c146..e01d7500 100644 --- a/smoketests/common.php +++ b/smoketests/common.php @@ -6,7 +6,7 @@ set_include_path('../library' . PATH_SEPARATOR . get_include_path()); require_once 'HTMLPurifier.php'; function escapeHTML($string) { - $string = HTMLPurifier_Lexer::cleanUTF8($string); + $string = HTMLPurifier_Encoder::cleanUTF8($string); $string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8'); return $string; } diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php new file mode 100644 index 00000000..7b8d7998 --- /dev/null +++ b/tests/HTMLPurifier/EncoderTest.php @@ -0,0 +1,96 @@ +Encoder = new HTMLPurifier_Encoder(); + $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); + } + + function assertCleanUTF8($string, $expect = null) { + if ($expect === null) $expect = $string; + $this->assertIdentical($this->Encoder->cleanUTF8($string), $expect); + } + + function test_cleanUTF8() { + $this->assertCleanUTF8('Normal string.'); + $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters"); + $this->assertCleanUTF8("null byte: \0", 'null byte: '); + $this->assertCleanUTF8("\1\2\3\4\5\6\7", ''); + $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char + $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML + $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte + $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8 + } + + function test_substituteNonSpecialEntities() { + $char_theta = $this->_entity_lookup->table['theta']; + $this->assertIdentical($char_theta, + $this->Encoder->substituteNonSpecialEntities('θ') ); + $this->assertIdentical('"', + $this->Encoder->substituteNonSpecialEntities('"') ); + + // numeric tests, adapted from Feyd + $args = array(); + $args[] = array(1114112,false ); + $args[] = array(1114111,'F48FBFBF'); // 0x0010FFFF + $args[] = array(1048576,'F4808080'); // 0x00100000 + $args[] = array(1048575,'F3BFBFBF'); // 0x000FFFFF + $args[] = array(262144, 'F1808080'); // 0x00040000 + $args[] = array(262143, 'F0BFBFBF'); // 0x0003FFFF + $args[] = array(65536, 'F0908080'); // 0x00010000 + $args[] = array(65535, 'EFBFBF' ); // 0x0000FFFF + $args[] = array(57344, 'EE8080' ); // 0x0000E000 + $args[] = array(57343, false ); // 0x0000DFFF these are ill-formed + $args[] = array(56040, false ); // 0x0000DAE8 these are ill-formed + $args[] = array(55296, false ); // 0x0000D800 these are ill-formed + $args[] = array(55295, 'ED9FBF' ); // 0x0000D7FF + $args[] = array(53248, 'ED8080' ); // 0x0000D000 + $args[] = array(53247, 'ECBFBF' ); // 0x0000CFFF + $args[] = array(4096, 'E18080' ); // 0x00001000 + $args[] = array(4095, 'E0BFBF' ); // 0x00000FFF + $args[] = array(2048, 'E0A080' ); // 0x00000800 + $args[] = array(2047, 'DFBF' ); // 0x000007FF + $args[] = array(128, 'C280' ); // 0x00000080 invalid SGML char + $args[] = array(127, '7F' ); // 0x0000007F invalid SGML char + $args[] = array(0, '00' ); // 0x00000000 invalid SGML char + + $args[] = array(20108, 'E4BA8C' ); // 0x00004E8C + $args[] = array(77, '4D' ); // 0x0000004D + $args[] = array(66306, 'F0908C82'); // 0x00010302 + $args[] = array(1072, 'D0B0' ); // 0x00000430 + + foreach ($args as $arg) { + $string = '&#' . $arg[0] . ';' . // decimal + '&#x' . dechex($arg[0]) . ';'; // hex + $expect = ''; + if ($arg[1] !== false) { + $chars = str_split($arg[1], 2); + foreach ($chars as $char) { + $expect .= chr(hexdec($char)); + } + $expect .= $expect; // double it + } + $this->assertIdentical( + $this->Encoder->substituteNonSpecialEntities($string), + $expect, + $arg[0] . ': %s' + ); + } + + } + + function test_specialEntityCallback() { + + $this->assertIdentical("'",$this->Encoder->specialEntityCallback( + array(''', null, '39', null) )); + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php index d1e6f088..2ad14476 100644 --- a/tests/HTMLPurifier/Lexer/DirectLexTest.php +++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php @@ -11,13 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase $this->DirectLex = new HTMLPurifier_Lexer_DirectLex(); } - function test_specialEntityCallback() { - $HP =& $this->DirectLex; - - $this->assertIdentical("'",$HP->specialEntityCallback( - array(''', null, '39', null) )); - } - function test_parseData() { $HP =& $this->DirectLex; diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 09eec396..25fff13c 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -32,79 +32,6 @@ class HTMLPurifier_LexerTest extends UnitTestCase } - function assertCleanUTF8($string, $expect = null) { - if ($expect === null) $expect = $string; - $this->assertIdentical($this->Lexer->cleanUTF8($string), $expect); - } - - function test_cleanUTF8() { - $this->assertCleanUTF8('Normal string.'); - $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters"); - $this->assertCleanUTF8("null byte: \0", 'null byte: '); - $this->assertCleanUTF8("\1\2\3\4\5\6\7", ''); - $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char - $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML - $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte - $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8 - } - - function test_substituteNonSpecialEntities() { - $char_theta = $this->_entity_lookup->table['theta']; - $this->assertIdentical($char_theta, - $this->Lexer->substituteNonSpecialEntities('θ') ); - $this->assertIdentical('"', - $this->Lexer->substituteNonSpecialEntities('"') ); - - // numeric tests, adapted from Feyd - $args = array(); - $args[] = array(1114112,false ); - $args[] = array(1114111,'F48FBFBF'); // 0x0010FFFF - $args[] = array(1048576,'F4808080'); // 0x00100000 - $args[] = array(1048575,'F3BFBFBF'); // 0x000FFFFF - $args[] = array(262144, 'F1808080'); // 0x00040000 - $args[] = array(262143, 'F0BFBFBF'); // 0x0003FFFF - $args[] = array(65536, 'F0908080'); // 0x00010000 - $args[] = array(65535, 'EFBFBF' ); // 0x0000FFFF - $args[] = array(57344, 'EE8080' ); // 0x0000E000 - $args[] = array(57343, false ); // 0x0000DFFF these are ill-formed - $args[] = array(56040, false ); // 0x0000DAE8 these are ill-formed - $args[] = array(55296, false ); // 0x0000D800 these are ill-formed - $args[] = array(55295, 'ED9FBF' ); // 0x0000D7FF - $args[] = array(53248, 'ED8080' ); // 0x0000D000 - $args[] = array(53247, 'ECBFBF' ); // 0x0000CFFF - $args[] = array(4096, 'E18080' ); // 0x00001000 - $args[] = array(4095, 'E0BFBF' ); // 0x00000FFF - $args[] = array(2048, 'E0A080' ); // 0x00000800 - $args[] = array(2047, 'DFBF' ); // 0x000007FF - $args[] = array(128, 'C280' ); // 0x00000080 invalid SGML char - $args[] = array(127, '7F' ); // 0x0000007F invalid SGML char - $args[] = array(0, '00' ); // 0x00000000 invalid SGML char - - $args[] = array(20108, 'E4BA8C' ); // 0x00004E8C - $args[] = array(77, '4D' ); // 0x0000004D - $args[] = array(66306, 'F0908C82'); // 0x00010302 - $args[] = array(1072, 'D0B0' ); // 0x00000430 - - foreach ($args as $arg) { - $string = '&#' . $arg[0] . ';' . // decimal - '&#x' . dechex($arg[0]) . ';'; // hex - $expect = ''; - if ($arg[1] !== false) { - $chars = str_split($arg[1], 2); - foreach ($chars as $char) { - $expect .= chr(hexdec($char)); - } - $expect .= $expect; // double it - } - $this->assertIdentical( - $this->Lexer->substituteNonSpecialEntities($string), - $expect, - $arg[0] . ': %s' - ); - } - - } - function assertExtractBody($text, $extract = true) { $result = $this->Lexer->extractBody($text); if ($extract === true) $extract = $text; diff --git a/tests/index.php b/tests/index.php index 39ef5a6d..de57e739 100644 --- a/tests/index.php +++ b/tests/index.php @@ -86,6 +86,7 @@ $test_files[] = 'AttrTransform/BdoDirTest.php'; $test_files[] = 'AttrTransform/ImgRequiredTest.php'; $test_files[] = 'URISchemeRegistryTest.php'; $test_files[] = 'URISchemeTest.php'; +$test_files[] = 'EncoderTest.php'; if (version_compare(PHP_VERSION, '5', '>=')) { $test_files[] = 'TokenFactoryTest.php'; -- 2.11.4.GIT