/**
 * Test case exercising the static helpers of HTMLPurifier_Encoder
 * (cleanUTF8, convertToUTF8, convertFromUTF8, convertToASCIIDumbLossless,
 * testEncodingSupportsASCII, iconv), built on HTMLPurifier_Harness.
 */
3 class HTMLPurifier_EncoderTest
extends HTMLPurifier_Harness
// Assigned in setUp() from HTMLPurifier_EntityLookup::instance().
6 protected $_entity_lookup;
/**
 * Stores HTMLPurifier_EntityLookup::instance() in $this->_entity_lookup.
 *
 * NOTE(review): only this assignment is visible in this view. The tests
 * read $this->config and $this->context, which are not initialised here;
 * presumably the parent harness sets them up - confirm that its setUp()
 * is still invoked.
 */
8 public function setUp()
10 $this->_entity_lookup
= HTMLPurifier_EntityLookup
::instance();
/**
 * Asserts that HTMLPurifier_Encoder::cleanUTF8() produces the expected
 * string for both of its call forms.
 *
 * @param string      $string Input handed to cleanUTF8().
 * @param string|null $expect Expected result; when null the input itself
 *                            is expected to come back unchanged.
 */
public function assertCleanUTF8($string, $expect = null)
{
    $expected = ($expect === null) ? $string : $expect;

    // Default call form; failures are labelled "iconv".
    $defaultResult = HTMLPurifier_Encoder::cleanUTF8($string);
    $this->assertIdentical($defaultResult, $expected, 'iconv: %s');

    // Call form with the second argument set to true; failures are
    // labelled "PHP" (presumably the pure-PHP code path - confirm).
    $forcedResult = HTMLPurifier_Encoder::cleanUTF8($string, true);
    $this->assertIdentical($forcedResult, $expected, 'PHP: %s');
}
/**
 * Runs assertCleanUTF8() over a table of inputs: strings that must pass
 * through untouched and strings whose offending bytes must be removed.
 */
public function test_cleanUTF8()
{
    // Each row is array(input) or array(input, expected). A row without
    // an expected value means the input must be returned unchanged.
    $cases = array(
        array('Normal string.'),
        array("Test\tAllowed\nControl\rCharacters"),
        array("null byte: \0", 'null byte: '),
        array("\1\2\3\4\5\6\7", ''),
        array("\x7F", ''),            // one byte invalid SGML char
        array("\xC2\x80", ''),        // two byte invalid SGML
        array("\xF3\xBF\xBF\xBF"),    // valid four byte
        array("\xDF\xFF", ''),        // malformed UTF8
        array("\xED\xB0\x80", ''),
    );

    foreach ($cases as $case) {
        // isset() is true for '' so empty expectations are kept as-is;
        // only an absent second column falls back to null.
        $expected = isset($case[1]) ? $case[1] : null;
        $this->assertCleanUTF8($case[0], $expected);
    }
}
/**
 * convertToUTF8() without any Core.Encoding override: the byte "\xF6"
 * must be returned exactly as given.
 */
public function test_convertToUTF8_noConvert()
{
    // UTF-8 means that we don't touch it
    $converted = HTMLPurifier_Encoder::convertToUTF8(
        "\xF6",
        $this->config,
        $this->context
    );

    $untouched = "\xF6"; // this is invalid
    $this->assertIdentical($converted, $untouched, 'Expected identical [Binary: F6]');
}
/**
 * convertToUTF8() with Core.Encoding set to the unrecognised value 'utf99'.
 *
 * Returns early, asserting nothing, when
 * HTMLPurifier_Encoder::iconvAvailable() is false. Otherwise it registers
 * the expected error 'Invalid encoding utf99' before converting "\xF6".
 *
 * NOTE(review): the expected-value argument of assertIdentical() is not
 * visible in this view - confirm it against the full file.
 */
45 public function test_convertToUTF8_spuriousEncoding()
47 if (!HTMLPurifier_Encoder
::iconvAvailable()) return;
48 $this->config
->set('Core.Encoding', 'utf99');
49 $this->expectError('Invalid encoding utf99');
50 $this->assertIdentical(
51 HTMLPurifier_Encoder
::convertToUTF8("\xF6", $this->config
, $this->context
),
/**
 * convertToUTF8() with Core.Encoding set to 'ISO-8859-1', fed the single
 * byte "\xF6".
 *
 * NOTE(review): the expected-value argument of assertIdentical() is not
 * visible in this view - confirm it against the full file.
 */
56 public function test_convertToUTF8_iso8859_1()
58 $this->config
->set('Core.Encoding', 'ISO-8859-1');
59 $this->assertIdentical(
60 HTMLPurifier_Encoder
::convertToUTF8("\xF6", $this->config
, $this->context
),
/**
 * convertToUTF8() of "\xF6" with Core.Encoding 'ISO-8859-1' and
 * Test.ForceNoIconv set to true (presumably forcing the non-iconv code
 * path, going by the directive name - confirm).
 *
 * NOTE(review): the expected-value argument of assertIdentical() is not
 * visible in this view - confirm it against the full file.
 */
65 public function test_convertToUTF8_withoutIconv()
67 $this->config
->set('Core.Encoding', 'ISO-8859-1');
68 $this->config
->set('Test.ForceNoIconv', true);
69 $this->assertIdentical(
70 HTMLPurifier_Encoder
::convertToUTF8("\xF6", $this->config
, $this->context
),
/**
 * Fixture string shared by the conversion tests: the six bytes
 * E4 B8 AD E6 96 87 followed by the ASCII text " (Chinese)".
 *
 * @return string
 */
public function getZhongWen()
{
    $zhongWen = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";

    return $zhongWen;
}
/**
 * convertFromUTF8() of "\xC3\xB6" with no configuration overrides set in
 * this test.
 *
 * NOTE(review): the expected-value argument of assertIdentical() is not
 * visible in this view - confirm it against the full file.
 */
81 public function test_convertFromUTF8_utf8()
83 // UTF-8 means that we don't touch it
84 $this->assertIdentical(
85 HTMLPurifier_Encoder
::convertFromUTF8("\xC3\xB6", $this->config
, $this->context
),
/**
 * convertFromUTF8() of "\xC3\xB6" with Core.Encoding set to 'ISO-8859-1'.
 *
 * NOTE(review): the expected-value argument of assertIdentical() is not
 * visible in this view; the failure message suggests it is the single
 * byte F6 - confirm against the full file.
 */
90 public function test_convertFromUTF8_iso8859_1()
92 $this->config
->set('Core.Encoding', 'ISO-8859-1');
93 $this->assertIdentical(
94 HTMLPurifier_Encoder
::convertFromUTF8("\xC3\xB6", $this->config
, $this->context
),
96 'Expected identical [Binary: F6]'
/**
 * convertFromUTF8() of the getZhongWen() fixture with Core.Encoding set
 * to 'ISO-8859-1'.
 *
 * Returns early, asserting nothing, when
 * HTMLPurifier_Encoder::iconvAvailable() is false.
 *
 * NOTE(review): the expected-value argument of assertIdentical() is not
 * visible in this view - confirm it against the full file.
 */
100 public function test_convertFromUTF8_iconvNoChars()
102 if (!HTMLPurifier_Encoder
::iconvAvailable()) return;
103 $this->config
->set('Core.Encoding', 'ISO-8859-1');
104 $this->assertIdentical(
105 HTMLPurifier_Encoder
::convertFromUTF8($this->getZhongWen(), $this->config
, $this->context
),
/**
 * convertFromUTF8() of "\xC3\xB6" with Core.Encoding 'ISO-8859-1' and
 * Test.ForceNoIconv set to true.
 *
 * NOTE(review): the expected-value argument of assertIdentical() is not
 * visible in this view; the failure message suggests it is the single
 * byte F6 - confirm against the full file.
 */
110 public function test_convertFromUTF8_phpNormal()
112 // Plain PHP implementation has slightly different behavior
113 $this->config
->set('Core.Encoding', 'ISO-8859-1');
114 $this->config
->set('Test.ForceNoIconv', true);
115 $this->assertIdentical(
116 HTMLPurifier_Encoder
::convertFromUTF8("\xC3\xB6", $this->config
, $this->context
),
118 'Expected identical [Binary: F6]'
/**
 * convertFromUTF8() of the getZhongWen() fixture with Core.Encoding
 * 'ISO-8859-1' and Test.ForceNoIconv set to true.
 *
 * NOTE(review): the expected-value argument of assertIdentical() is not
 * visible in this view - confirm it against the full file.
 */
122 public function test_convertFromUTF8_phpNoChars()
124 $this->config
->set('Core.Encoding', 'ISO-8859-1');
125 $this->config
->set('Test.ForceNoIconv', true);
126 $this->assertIdentical(
127 HTMLPurifier_Encoder
::convertFromUTF8($this->getZhongWen(), $this->config
, $this->context
),
/**
 * convertFromUTF8() with Core.Encoding 'ISO-8859-1' and
 * Core.EscapeNonASCIICharacters enabled.
 *
 * The two CJK characters of the getZhongWen() fixture cannot be
 * represented in ISO-8859-1, so with escaping enabled they are expected
 * to survive as decimal numeric character references (U+4E2D = 20013,
 * U+6587 = 25991) rather than as raw UTF-8 bytes.
 */
public function test_convertFromUTF8_withProtection()
{
    // Preserve the characters!
    $this->config->set('Core.Encoding', 'ISO-8859-1');
    $this->config->set('Core.EscapeNonASCIICharacters', true);

    $converted = HTMLPurifier_Encoder::convertFromUTF8(
        $this->getZhongWen(),
        $this->config,
        $this->context
    );

    // The expectation must be pure ASCII: a literal UTF-8 string here
    // could never match ISO-8859-1 output.
    $this->assertIdentical($converted, "&#20013;&#25991; (Chinese)");
}
/**
 * convertFromUTF8() with Core.EscapeNonASCIICharacters enabled and no
 * Core.Encoding override set in this test.
 *
 * Escaping is applied to every non-ASCII character regardless of the
 * target encoding, so the two CJK characters of the getZhongWen()
 * fixture are expected as decimal numeric character references
 * (U+4E2D = 20013, U+6587 = 25991), not as raw UTF-8.
 */
public function test_convertFromUTF8_withProtectionButUtf8()
{
    // Preserve the characters!
    $this->config->set('Core.EscapeNonASCIICharacters', true);

    $converted = HTMLPurifier_Encoder::convertFromUTF8(
        $this->getZhongWen(),
        $this->config,
        $this->context
    );

    $this->assertIdentical($converted, "&#20013;&#25991; (Chinese)");
}
/**
 * Feeds three inputs to HTMLPurifier_Encoder::convertToASCIIDumbLossless():
 * a two-byte sequence followed by ASCII ("\xC3\x9Eorn"), a plain ASCII
 * string ("an"), and a four-byte sequence ("\xF3\xA0\x80\xA0").
 *
 * NOTE(review): the expected-value argument of each assertIdentical()
 * call is not visible in this view - confirm against the full file.
 */
153 public function test_convertToASCIIDumbLossless()
155 // Uppercase thorn letter
156 $this->assertIdentical(
157 HTMLPurifier_Encoder
::convertToASCIIDumbLossless("\xC3\x9Eorn"),
161 $this->assertIdentical(
162 HTMLPurifier_Encoder
::convertToASCIIDumbLossless("an"),
166 // test up to four bytes
167 $this->assertIdentical(
168 HTMLPurifier_Encoder
::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
/**
 * Helper assertion around HTMLPurifier_Encoder::testEncodingSupportsASCII().
 *
 * First calls testEncodingSupportsASCII($enc, true); if that returns
 * exactly false the helper returns without asserting anything. Otherwise
 * it asserts on both call forms: testEncodingSupportsASCII($enc) and
 * testEncodingSupportsASCII($enc, true).
 *
 * NOTE(review): $ret is never referenced in the lines visible here;
 * presumably it is the expected value of both assertIdentical() calls,
 * whose second arguments are not visible in this view - confirm.
 *
 * @param string $enc Encoding name passed to testEncodingSupportsASCII().
 * @param mixed  $ret See the review note above.
 */
174 public function assertASCIISupportCheck($enc, $ret)
176 $test = HTMLPurifier_Encoder
::testEncodingSupportsASCII($enc, true);
177 if ($test === false) return;
178 $this->assertIdentical(
179 HTMLPurifier_Encoder
::testEncodingSupportsASCII($enc),
182 $this->assertIdentical(
183 HTMLPurifier_Encoder
::testEncodingSupportsASCII($enc, true),
/**
 * Runs assertASCIISupportCheck() for several encodings.
 *
 * 'Shift_JIS' and 'JOHAB' are only checked when
 * HTMLPurifier_Encoder::iconvAvailable() is true; 'ISO-8859-1' and the
 * made-up name 'dontexist' are always checked, both against an empty
 * array.
 */
public function test_testEncodingSupportsASCII()
{
    $hasIconv = HTMLPurifier_Encoder::iconvAvailable();

    if ($hasIconv) {
        $shiftJisMap = array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
        $this->assertASCIISupportCheck('Shift_JIS', $shiftJisMap);

        $johabMap = array("\xE2\x82\xA9" => '\\');
        $this->assertASCIISupportCheck('JOHAB', $johabMap);
    }

    $this->assertASCIISupportCheck('ISO-8859-1', array());

    // canary
    $this->assertASCIISupportCheck('dontexist', array());
}
/**
 * Converts the two-character string '\\~' (backslash, tilde) in both
 * directions, convertFromUTF8() and convertToUTF8(), with Core.Encoding
 * set to 'Shift_JIS'.
 *
 * Returns early, asserting nothing, when
 * HTMLPurifier_Encoder::iconvAvailable() is false.
 *
 * NOTE(review): the expected-value argument of both assertIdentical()
 * calls is not visible in this view - confirm against the full file.
 */
198 public function testShiftJIS()
200 if (!HTMLPurifier_Encoder
::iconvAvailable()) return;
201 $this->config
->set('Core.Encoding', 'Shift_JIS');
202 // This actually looks like a Yen, but we're going to treat it differently
203 $this->assertIdentical(
204 HTMLPurifier_Encoder
::convertFromUTF8('\\~', $this->config
, $this->context
),
207 $this->assertIdentical(
208 HTMLPurifier_Encoder
::convertToUTF8('\\~', $this->config
, $this->context
),
/**
 * Checks convertFromUTF8() to ISO-8859-1 on a string made of the three
 * bytes E4 B8 AD followed by 10000 'a' characters: all 10000 'a's must
 * be returned.
 *
 * Runs only when iconv is available and
 * HTMLPurifier_Encoder::testIconvTruncateBug() reports ICONV_TRUNCATES;
 * otherwise returns without asserting.
 */
public function testIconvTruncateBug()
{
    // && short-circuits, so testIconvTruncateBug() is not called when
    // iconv is unavailable.
    $applies = HTMLPurifier_Encoder::iconvAvailable()
        && HTMLPurifier_Encoder::testIconvTruncateBug() === HTMLPurifier_Encoder::ICONV_TRUNCATES;
    if (!$applies) {
        return;
    }

    $this->config->set('Core.Encoding', 'ISO-8859-1');

    $asciiTail = str_repeat('a', 10000);
    $converted = HTMLPurifier_Encoder::convertFromUTF8(
        "\xE4\xB8\xAD" . $asciiTail,
        $this->config,
        $this->context
    );

    $this->assertIdentical($converted, $asciiTail);
}
/**
 * Checks HTMLPurifier_Encoder::iconv() from 'utf-8' to
 * 'iso-8859-1//IGNORE' with 4 as its fourth argument (presumably the
 * chunk size - confirm). The multi-byte sequence sits at a different
 * offset in each input and must be dropped with the surrounding ASCII
 * left intact.
 *
 * Runs only when iconv is available and
 * HTMLPurifier_Encoder::testIconvTruncateBug() reports ICONV_TRUNCATES;
 * otherwise returns without asserting.
 */
public function testIconvChunking()
{
    if (!HTMLPurifier_Encoder::iconvAvailable()) {
        return;
    }
    if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
        return;
    }

    // array(input, expected output)
    $cases = array(
        array("a\xF3\xA0\x80\xA0b", 'ab'),
        array("aa\xE4\xB8\xADb", 'aab'),
        array("aaa\xCE\xB1b", 'aaab'),
        array("aaaa\xF3\xA0\x80\xA0b", 'aaaab'),
        array("aaaa\xE4\xB8\xADb", 'aaaab'),
        array("aaaa\xCE\xB1b", 'aaaab'),
    );

    foreach ($cases as $case) {
        list($input, $expected) = $case;
        $this->assertIdentical(
            HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', $input, 4),
            $expected
        );
    }
}
238 // vim: et sw=4 sts=4