Fix problem where stacked AttrTransforms clobber each other.
[htmlpurifier.git] / tests / HTMLPurifier / EncoderTest.php
blob6084c39f36506e9cbed9833f78bc71414d51507c
1 <?php
/**
 * Unit tests for the static helpers of HTMLPurifier_Encoder: UTF-8 cleaning,
 * conversion to and from UTF-8, lossless ASCII escaping, detection of
 * encodings that are not ASCII-compatible, and the iconv wrapper.
 *
 * Tests read $this->config and $this->context, which are not assigned in this
 * class (presumably provided by HTMLPurifier_Harness).
 */
class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
{

    /**
     * HTMLPurifier_EntityLookup instance obtained in setUp().
     */
    protected $_entity_lookup;

    /**
     * Fetches the entity lookup instance, then runs the parent fixture setup.
     */
    public function setUp() {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        parent::setUp();
    }

    /**
     * Asserts that HTMLPurifier_Encoder::cleanUTF8() turns $string into
     * $expect, both when called with one argument (failure label 'iconv')
     * and when called with true as second argument (failure label 'PHP').
     *
     * @param string      $string Input passed to cleanUTF8().
     * @param string|null $expect Expected output; null means the input is
     *                            expected to come back unchanged.
     */
    public function assertCleanUTF8($string, $expect = null) {
        if ($expect === null) $expect = $string;
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string), $expect, 'iconv: %s');
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string, true), $expect, 'PHP: %s');
    }

    /**
     * cleanUTF8() keeps ordinary text, tab/newline/carriage return and a
     * valid four-byte sequence, and strips the other inputs listed here.
     */
    public function test_cleanUTF8() {
        $this->assertCleanUTF8('Normal string.');
        $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
        $this->assertCleanUTF8("null byte: \0", 'null byte: ');
        $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
        $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
        $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
        $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
        $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
        // invalid codepoints
        $this->assertCleanUTF8("\xED\xB0\x80", '');
    }

    /**
     * With no Core.Encoding set by the test, convertToUTF8() returns its
     * input untouched, even though the byte is not valid UTF-8.
     */
    public function test_convertToUTF8_noConvert() {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xF6", // this is invalid
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * An unknown Core.Encoding value triggers the 'Invalid encoding utf99'
     * error.
     */
    public function test_convertToUTF8_spuriousEncoding() {
        // NOTE(review): the 'Invalid encoding' error is presumably raised only
        // on the iconv code path, so skip when iconv is unavailable, the same
        // way the other iconv-dependent tests in this class do -- confirm
        // against HTMLPurifier_Encoder::convertToUTF8().
        if (!function_exists('iconv')) return;
        $this->config->set('Core.Encoding', 'utf99');
        $this->expectError('Invalid encoding utf99');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            // NOTE(review): the expected-value argument was on a line missing
            // from the reviewed extract; an empty string is assumed -- confirm
            // against the original file.
            ''
        );
    }

    /**
     * ISO-8859-1 byte F6 converts to the two-byte UTF-8 sequence C3 B6.
     */
    public function test_convertToUTF8_iso8859_1() {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * Same conversion as test_convertToUTF8_iso8859_1(), with
     * Test.ForceNoIconv enabled.
     */
    public function test_convertToUTF8_withoutIconv() {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * Returns a fixture string: two three-byte UTF-8 characters followed by
     * the ASCII text " (Chinese)".
     *
     * @return string
     */
    public function getZhongWen() {
        return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
    }

    /**
     * With no Core.Encoding set by the test, convertFromUTF8() returns its
     * input untouched.
     */
    public function test_convertFromUTF8_utf8() {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * UTF-8 sequence C3 B6 converts to the single ISO-8859-1 byte F6.
     */
    public function test_convertFromUTF8_iso8859_1() {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * When iconv exists, characters that ISO-8859-1 cannot represent are
     * dropped from the output. Skipped when iconv is unavailable.
     */
    public function test_convertFromUTF8_iconvNoChars() {
        if (!function_exists('iconv')) return;
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            " (Chinese)"
        );
    }

    /**
     * With Test.ForceNoIconv enabled, a representable character still
     * converts to its ISO-8859-1 byte.
     */
    public function test_convertFromUTF8_phpNormal() {
        // Plain PHP implementation has slightly different behavior
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * With Test.ForceNoIconv enabled, each unrepresentable character becomes
     * a '?' instead of being dropped.
     */
    public function test_convertFromUTF8_phpNoChars() {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "?? (Chinese)"
        );
    }

    /**
     * With Core.EscapeNonASCIICharacters enabled and an ISO-8859-1 target,
     * non-ASCII characters come out as decimal numeric entities.
     */
    public function test_convertFromUTF8_withProtection() {
        // Preserve the characters!
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * Core.EscapeNonASCIICharacters also produces numeric entities when the
     * test leaves Core.Encoding unset.
     */
    public function test_convertFromUTF8_withProtectionButUtf8() {
        // Preserve the characters!
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * convertToASCIIDumbLossless() replaces multi-byte characters with
     * decimal numeric entities and leaves plain ASCII alone.
     */
    public function test_convertToASCIIDumbLossless() {

        // Uppercase thorn letter
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
            "&#222;orn"
        );

        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
            "an"
        );

        // test up to four bytes
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
            "&#917536;"
        );

    }

    /**
     * Asserts that HTMLPurifier_Encoder::testEncodingSupportsASCII() returns
     * $ret for $enc, both with one argument and with true as second argument.
     *
     * A preliminary call with the second argument set to true is made first;
     * if it returns false, nothing is asserted.
     *
     * @param string $enc Encoding name to probe.
     * @param array  $ret Expected return value.
     */
    public function assertASCIISupportCheck($enc, $ret) {
        $test = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
        if ($test === false) return;
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc),
            $ret
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true),
            $ret
        );
    }

    /**
     * Checks the expected return value of testEncodingSupportsASCII() for
     * Shift_JIS, JOHAB, ISO-8859-1 and a non-existent encoding name.
     */
    public function test_testEncodingSupportsASCII() {
        $this->assertASCIISupportCheck('Shift_JIS', array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'));
        $this->assertASCIISupportCheck('JOHAB', array("\xE2\x82\xA9" => '\\'));
        $this->assertASCIISupportCheck('ISO-8859-1', array());
        $this->assertASCIISupportCheck('dontexist', array()); // canary
    }

    /**
     * Backslash and tilde survive conversion in both directions when
     * Core.Encoding is Shift_JIS. Skipped when iconv is unavailable.
     */
    public function testShiftJIS() {
        if (!function_exists('iconv')) return;
        $this->config->set('Core.Encoding', 'Shift_JIS');
        // This actually looks like a Yen, but we're going to treat it differently
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
    }

    /**
     * A 10000-byte ASCII run following an unrepresentable character is
     * converted in full. Runs only when iconv exists and
     * HTMLPurifier_Encoder::testIconvTruncateBug() reports ICONV_TRUNCATES.
     */
    public function testIconvTruncateBug() {
        if (!function_exists('iconv')) return;
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) return;
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xE4\xB8\xAD" . str_repeat('a', 10000), $this->config, $this->context),
            str_repeat('a', 10000)
        );
    }

    /**
     * HTMLPurifier_Encoder::iconv() with 4 as fourth argument (presumably
     * the chunk size -- confirm against the Encoder) drops unrepresentable
     * two-, three- and four-byte characters placed at different offsets and
     * keeps the surrounding ASCII. Runs under the same conditions as
     * testIconvTruncateBug().
     */
    public function testIconvChunking() {
        if (!function_exists('iconv')) return;
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) return;
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "a\xF3\xA0\x80\xA0b", 4), 'ab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aa\xE4\xB8\xADb", 4), 'aab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaa\xCE\xB1b", 4), 'aaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xF3\xA0\x80\xA0b", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xE4\xB8\xADb", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xCE\xB1b", 4), 'aaaab');
    }

}
// vim: et sw=4 sts=4