From afb007d22fcce0d253519ab5b7b77fcc2e9c1afd Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 27 Mar 2011 20:35:38 +0100 Subject: [PATCH] Protect against font family innerHTML/cssText attacks. Signed-off-by: Edward Z. Yang --- NEWS | 3 + configdoc/usage.xml | 27 +++- library/HTMLPurifier/AttrDef/CSS/FontFamily.php | 141 +++++++++++++++++++-- library/HTMLPurifier/URI.php | 2 +- tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php | 38 +++--- tests/HTMLPurifier/AttrDef/CSS/FontTest.php | 4 +- tests/HTMLPurifier/AttrDef/CSSTest.php | 2 +- .../HTMLPurifier/Filter/ExtractStyleBlocksTest.php | 13 +- .../HTMLT/shift-jis-preserve-yen.htmlt | 2 +- .../HTMLPurifier/HTMLT/shift-jis-remove-yen.htmlt | 4 +- 10 files changed, 189 insertions(+), 47 deletions(-) diff --git a/NEWS b/NEWS index 7cb1117c..8dd7d144 100644 --- a/NEWS +++ b/NEWS @@ -19,6 +19,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier spaces. This constitutes a slight semantic change, which can be reverted using %Output.FixInnerHTML. Reported by Neike Taika-Tessaro and Mario Heiderich. +# Protect against cssText/innerHTML by restricting allowed characters + used in fonts further than mandated by the specification. Reported + by Neike Taika-Tessaro and Mario Heiderich. ! Added %HTML.Nofollow to add rel="nofollow" to external links. ! More types of SPL autoloaders allowed on later versions of PHP. ! Implementations for position, top, left, right, bottom, z-index diff --git a/configdoc/usage.xml b/configdoc/usage.xml index a401a9ee..5704b095 100644 --- a/configdoc/usage.xml +++ b/configdoc/usage.xml @@ -96,27 +96,32 @@ - 56 + 61 + + + + + 62 - 57 + 63 - 58 + 64 - 87 + 93 - 101 + 107 266 @@ -124,7 +129,7 @@ - 102 + 108 @@ -254,6 +259,9 @@ 64 + + 75 + @@ -288,6 +296,11 @@ 12 + + + 50 + + 18 @@ -454,7 +467,7 @@ - 45 + 53 19 diff --git a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php index c29834b6..0d9a4e12 100644 --- a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php +++ b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php @@ -6,6 +6,39 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef { + protected $mask = null; + + public function __construct() { + $this->mask = '- '; + for ($c = 'a'; $c <= 'z'; $c++) $this->mask .= $c; + for ($c = 'A'; $c <= 'Z'; $c++) $this->mask .= $c; + for ($c = '0'; $c <= '9'; $c++) $this->mask .= $c; // cast-y, but should be fine + // special bytes used by UTF-8 + for ($i = 0x80; $i <= 0xFF; $i++) { + // We don't bother excluding invalid bytes in this range, + // because the our restriction of well-formed UTF-8 will + // prevent these from ever occurring. + $this->mask .= chr($i); + } + + /* + PHP's internal strcspn implementation is + O(length of string * length of mask), making it inefficient + for large masks. However, it's still faster than + preg_match 8) + for (p = s1;;) { + spanp = s2; + do { + if (*spanp == c || p == s1_end) { + return p - s1; + } + } while (spanp++ < (s2_end - 1)); + c = *++p; + } + */ + // possible optimization: invert the mask. + } + public function validate($string, $config, $context) { static $generic_names = array( 'serif' => true, @@ -56,17 +89,103 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef // shouldn't show up regardless $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font); - // These ugly transforms don't pose a security - // risk (as \\ and \" might). We could try to be clever and - // use single-quote wrapping when there is a double quote - // present, but I have choosen not to implement that. - // (warning: this code relies on the selection of quotation - // mark below) - $font = str_replace('\\', '\\5C ', $font); - $font = str_replace('"', '\\22 ', $font); - - // complicated font, requires quoting - $final .= "\"$font\", "; // note that this will later get turned into " + // Here, there are various classes of characters which need + // to be treated differently: + // - Alphanumeric characters are essentially safe. We + // handled these above. + // - Spaces require quoting, though most parsers will do + // the right thing if there aren't any characters that + // can be misinterpreted + // - Dashes rarely occur, but they fairly unproblematic + // for parsing/rendering purposes. + // The above characters cover the majority of Western font + // names. + // - Arbitrary Unicode characters not in ASCII. Because + // most parsers give little thought to Unicode, treatment + // of these codepoints is basically uniform, even for + // punctuation-like codepoints. These characters can + // show up in non-Western pages and are supported by most + // major browsers, for example: "MS 明朝" is a + // legitimate font-name + // . See + // the CSS3 spec for more examples: + // + // You can see live samples of these on the Internet: + // + // However, most of these fonts have ASCII equivalents: + // for example, 'MS Mincho', and it's considered + // professional to use ASCII font names instead of + // Unicode font names. Thanks Takeshi Terada for + // providing this information. + // The following characters, to my knowledge, have not been + // used to name font names. + // - Single quote. While theoretically you might find a + // font name that has a single quote in its name (serving + // as an apostrophe, e.g. Dave's Scribble), I haven't + // been able to find any actual examples of this. + // Internet Explorer's cssText translation (which I + // believe is invoked by innerHTML) normalizes any + // quoting to single quotes, and fails to escape single + // quotes. (Note that this is not IE's behavior for all + // CSS properties, just some sort of special casing for + // font-family). So a single quote *cannot* be used + // safely in the font-family context if there will be an + // innerHTML/cssText translation. Note that Firefox 3.x + // does this too. + // - Double quote. In IE, these get normalized to + // single-quotes, no matter what the encoding. (Fun + // fact, in IE8, the 'content' CSS property gained + // support, where they special cased to preserve encoded + // double quotes, but still translate unadorned double + // quotes into single quotes.) So, because their + // fixpoint behavior is identical to single quotes, they + // cannot be allowed either. Firefox 3.x displays + // single-quote style behavior. + // - Backslashes are reduced by one (so \\ -> \) every + // iteration, so they cannot be used safely. This shows + // up in IE7, IE8 and FF3 + // - Semicolons, commas and backticks are handled properly. + // - The rest of the ASCII punctuation is handled properly. + // We haven't checked what browsers do to unadorned + // versions, but this is not important as long as the + // browser doesn't /remove/ surrounding quotes (as IE does + // for HTML). + // + // With these results in hand, we conclude that there are + // various levels of safety: + // - Paranoid: alphanumeric, spaces and dashes(?) + // - International: Paranoid + non-ASCII Unicode + // - Edgy: Everything except quotes, backslashes + // - NoJS: Standards compliance, e.g. sod IE. Note that + // with some judicious character escaping (since certain + // types of escaping doesn't work) this is theoretically + // OK as long as innerHTML/cssText is not called. + // We believe that international is a reasonable default + // (that we will implement now), and once we do more + // extensive research, we may feel comfortable with dropping + // it down to edgy. + + // Edgy: alphanumeric, spaces, dashes and Unicode. Use of + // str(c)spn assumes that the string was already well formed + // Unicode (which of course it is). + if (strspn($font, $this->mask) !== strlen($font)) { + continue; + } + + // Historical: + // In the absence of innerHTML/cssText, these ugly + // transforms don't pose a security risk (as \\ and \" + // might--these escapes are not supported by most browsers). + // We could try to be clever and use single-quote wrapping + // when there is a double quote present, but I have choosen + // not to implement that. (NOTE: you can reduce the amount + // of escapes by one depending on what quoting style you use) + // $font = str_replace('\\', '\\5C ', $font); + // $font = str_replace('"', '\\22 ', $font); + // $font = str_replace("'", '\\27 ', $font); + + // font possibly with spaces, requires quoting + $final .= "'$font', "; } $final = rtrim($final, ', '); if ($final === '') return false; diff --git a/library/HTMLPurifier/URI.php b/library/HTMLPurifier/URI.php index 92bff87a..efdfb2c6 100644 --- a/library/HTMLPurifier/URI.php +++ b/library/HTMLPurifier/URI.php @@ -185,7 +185,7 @@ class HTMLPurifier_URI // Reconstruct the result // One might wonder about parsing quirks from browsers after - // this reconstruction. Unfortunately, parsing behaviro depends + // this reconstruction. Unfortunately, parsing behavior depends // on what *scheme* was employed (file:///foo is handled *very* // differently than http:///foo), so unfortunately we have to // defer to the schemes to do the right thing. diff --git a/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php index 2060076b..fda8e01f 100644 --- a/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/FontFamilyTest.php @@ -8,30 +8,32 @@ class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarnes $this->def = new HTMLPurifier_AttrDef_CSS_FontFamily(); $this->assertDef('Gill, Helvetica, sans-serif'); - $this->assertDef('"Times New Roman", serif'); - $this->assertDef('\'Times New Roman\'', '"Times New Roman"'); + $this->assertDef("'Times New Roman', serif"); + $this->assertDef("\"Times New Roman\"", "'Times New Roman'"); $this->assertDef('01234'); $this->assertDef(',', false); - $this->assertDef('Times New Roman, serif', '"Times New Roman", serif'); - $this->assertDef($d = '"John\'s Font"'); - $this->assertDef("John's Font", $d); - $this->assertDef($d = "\"\xE5\xAE\x8B\xE4\xBD\x93\""); + $this->assertDef('Times New Roman, serif', "'Times New Roman', serif"); + $this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'"); $this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d); - $this->assertDef("'\\','f'", "\"\\5C \", f"); - $this->assertDef("'\\01'", "\"\""); - $this->assertDef("'\\20'", "\" \""); - $this->assertDef("\\0020", "\" \""); + $this->assertDef("'\\01'", "''"); + $this->assertDef("'\\20'", "' '"); + $this->assertDef("\\0020", "' '"); $this->assertDef("'\\000045'", "E"); $this->assertDef("','", false); - $this->assertDef("',' foobar','", "\" foobar\""); - $this->assertDef("'\\27'", "\"'\""); - $this->assertDef('"\\22"', "\"\\22 \""); - $this->assertDef('"\\""', "\"\\22 \""); - $this->assertDef('"\'"', "\"'\""); + $this->assertDef("',' foobar','", "' foobar'"); $this->assertDef("'\\000045a'", "Ea"); $this->assertDef("'\\00045 a'", "Ea"); - $this->assertDef("'\\00045 a'", "\"E a\""); + $this->assertDef("'\\00045 a'", "'E a'"); $this->assertDef("'\\\nf'", "f"); + // No longer supported, except maybe in NoJS mode (see source + // file for more explanation) + //$this->assertDef($d = '"John\'s Font"'); + //$this->assertDef("John's Font", $d); + //$this->assertDef("'\\','f'", "\"\\5C \", f"); + //$this->assertDef("'\\27'", "\"'\""); + //$this->assertDef('"\\22"', "\"\\22 \""); + //$this->assertDef('"\\""', "\"\\22 \""); + //$this->assertDef('"\'"', "\"'\""); } function testAllowed() { @@ -40,8 +42,8 @@ class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarnes $this->assertDef('serif'); $this->assertDef('sans-serif', false); $this->assertDef('serif, sans-serif', 'serif'); - $this->assertDef('Times New Roman', '"Times New Roman"'); - $this->assertDef('"Times New Roman"'); + $this->assertDef('Times New Roman', "'Times New Roman'"); + $this->assertDef("'Times New Roman'"); $this->assertDef('foo', false); } diff --git a/tests/HTMLPurifier/AttrDef/CSS/FontTest.php b/tests/HTMLPurifier/AttrDef/CSS/FontTest.php index 40559485..91870d13 100644 --- a/tests/HTMLPurifier/AttrDef/CSS/FontTest.php +++ b/tests/HTMLPurifier/AttrDef/CSS/FontTest.php @@ -11,10 +11,10 @@ class HTMLPurifier_AttrDef_CSS_FontTest extends HTMLPurifier_AttrDefHarness // hodgepodge of usage cases from W3C spec, but " -> ' $this->assertDef('12px/14px sans-serif'); $this->assertDef('80% sans-serif'); - $this->assertDef('x-large/110% "New Century Schoolbook", serif'); + $this->assertDef("x-large/110% 'New Century Schoolbook', serif"); $this->assertDef('bold italic large Palatino, serif'); $this->assertDef('normal small-caps 120%/120% fantasy'); - $this->assertDef('300 italic 1.3em/1.7em "FB Armada", sans-serif'); + $this->assertDef("300 italic 1.3em/1.7em 'FB Armada', sans-serif"); $this->assertDef('600 9px Charcoal'); $this->assertDef('600 9px/ 12px Charcoal', '600 9px/12px Charcoal'); diff --git a/tests/HTMLPurifier/AttrDef/CSSTest.php b/tests/HTMLPurifier/AttrDef/CSSTest.php index 72a17e1a..56917aec 100644 --- a/tests/HTMLPurifier/AttrDef/CSSTest.php +++ b/tests/HTMLPurifier/AttrDef/CSSTest.php @@ -62,7 +62,7 @@ class HTMLPurifier_AttrDef_CSSTest extends HTMLPurifier_AttrDefHarness $this->assertDef('width:-50px;', false); $this->assertDef('text-decoration:underline;'); $this->assertDef('font-family:sans-serif;'); - $this->assertDef('font-family:Gill, "Times New Roman", sans-serif;'); + $this->assertDef("font-family:Gill, 'Times New Roman', sans-serif;"); $this->assertDef('font:12px serif;'); $this->assertDef('border:1px solid #000;'); $this->assertDef('border-bottom:2em double #FF00FA;'); diff --git a/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php b/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php index e93d65e6..d3fb38f1 100644 --- a/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php +++ b/tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php @@ -79,10 +79,13 @@ class HTMLPurifier_Filter_ExtractStyleBlocksTest extends HTMLPurifier_Harness } function test_cleanCSS_angledBrackets() { - $this->assertCleanCSS( - ".class {\nfont-family:'';\n}", - ".class {\nfont-family:\"\\3C /style\\3E \";\n}" - ); + // [Content] No longer can smuggle in angled brackets using + // font-family; when we add support for 'content', reinstate + // this test. + //$this->assertCleanCSS( + // ".class {\nfont-family:'';\n}", + // ".class {\nfont-family:\"\\3C /style\\3E \";\n}" + //); } function test_cleanCSS_angledBrackets2() { @@ -97,6 +100,7 @@ class HTMLPurifier_Filter_ExtractStyleBlocksTest extends HTMLPurifier_Harness $this->assertCleanCSS("div {bogus:tree;}", "div {\n}"); } + /* [CONTENT] function test_cleanCSS_escapeCodes() { $this->assertCleanCSS( ".class {\nfont-family:\"\\3C /style\\3E \";\n}" @@ -109,6 +113,7 @@ class HTMLPurifier_Filter_ExtractStyleBlocksTest extends HTMLPurifier_Harness ".class {\nfont-family:\"\";\n}" ); } + */ function test_cleanCSS_scope() { $this->config->set('Filter.ExtractStyleBlocks.Scope', '#foo'); diff --git a/tests/HTMLPurifier/HTMLT/shift-jis-preserve-yen.htmlt b/tests/HTMLPurifier/HTMLT/shift-jis-preserve-yen.htmlt index 6e9640be..f22417c0 100644 --- a/tests/HTMLPurifier/HTMLT/shift-jis-preserve-yen.htmlt +++ b/tests/HTMLPurifier/HTMLT/shift-jis-preserve-yen.htmlt @@ -4,5 +4,5 @@ if (!function_exists('iconv')) return true; Core.Encoding = "Shift_JIS" Core.EscapeNonASCIICharacters = true --HTML-- -111 +111 --# vim: et sw=4 sts=4 diff --git a/tests/HTMLPurifier/HTMLT/shift-jis-remove-yen.htmlt b/tests/HTMLPurifier/HTMLT/shift-jis-remove-yen.htmlt index dd6dee6c..6c2d3bc4 100644 --- a/tests/HTMLPurifier/HTMLT/shift-jis-remove-yen.htmlt +++ b/tests/HTMLPurifier/HTMLT/shift-jis-remove-yen.htmlt @@ -3,7 +3,7 @@ if (!function_exists('iconv')) return true; --INI-- Core.Encoding = Shift_JIS --HTML-- -111 +111 --EXPECT-- -111 +111 --# vim: et sw=4 sts=4 -- 2.11.4.GIT