Do checks against iconvAvailable because PHP 5.4 has botched iconv support.
[htmlpurifier.git] / tests / HTMLPurifier / EncoderTest.php
blobe9593e2c47f2f7633a2318d8811df2888f0cc6a1
<?php
/**
 * Tests for HTMLPurifier_Encoder: UTF-8 cleaning, conversion to and from
 * UTF-8, lossless ASCII escaping, and the iconv-specific helpers.
 *
 * Tests that need a working iconv return early when
 * HTMLPurifier_Encoder::iconvAvailable() reports false.
 */
class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
{

    /**
     * Entity lookup obtained from HTMLPurifier_EntityLookup::instance()
     * in setUp().
     */
    protected $_entity_lookup;

    /**
     * Fetches the entity lookup, then runs the harness's own setUp().
     */
    public function setUp()
    {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        parent::setUp();
    }

    /**
     * Asserts that HTMLPurifier_Encoder::cleanUTF8() returns the expected
     * string, once with its default arguments and once with the second
     * argument set to true. The failure messages label the two calls
     * 'iconv' and 'PHP' respectively.
     *
     * @param string      $string Input handed to cleanUTF8().
     * @param string|null $expect Expected result; null means the input is
     *                            expected back unchanged.
     */
    public function assertCleanUTF8($string, $expect = null)
    {
        if ($expect === null) {
            $expect = $string;
        }
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string), $expect, 'iconv: %s');
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string, true), $expect, 'PHP: %s');
    }

    /**
     * cleanUTF8() keeps ordinary text and tab/newline/carriage return, and
     * strips the other inputs listed here down to the expected remainder.
     */
    public function test_cleanUTF8()
    {
        $this->assertCleanUTF8('Normal string.');
        $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
        $this->assertCleanUTF8("null byte: \0", 'null byte: ');
        $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
        $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
        $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
        $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
        $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
        // invalid codepoints
        $this->assertCleanUTF8("\xED\xB0\x80", '');
    }

    /**
     * With the configuration left as the harness provides it,
     * convertToUTF8() returns its input byte-for-byte.
     */
    public function test_convertToUTF8_noConvert()
    {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xF6", // this is invalid
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * An unknown Core.Encoding value raises the 'Invalid encoding utf99'
     * error. Skipped when iconv is unavailable.
     */
    public function test_convertToUTF8_spuriousEncoding()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'utf99');
        // The expectation must be registered before the call that triggers it.
        $this->expectError('Invalid encoding utf99');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            // NOTE(review): the expected value was lost from the scraped
            // source; an empty string is assumed here - confirm against the
            // repository copy of this test.
            ''
        );
    }

    /**
     * ISO-8859-1 byte F6 converts to the two-byte UTF-8 sequence C3 B6.
     */
    public function test_convertToUTF8_iso8859_1()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * Same conversion as the ISO-8859-1 test, with Test.ForceNoIconv set.
     */
    public function test_convertToUTF8_withoutIconv()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * Fixture: two three-byte UTF-8 characters followed by ' (Chinese)'.
     *
     * @return string
     */
    public function getZhongWen()
    {
        return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
    }

    /**
     * With the configuration left as the harness provides it,
     * convertFromUTF8() returns its input unchanged.
     */
    public function test_convertFromUTF8_utf8()
    {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * UTF-8 sequence C3 B6 converts to the single ISO-8859-1 byte F6.
     */
    public function test_convertFromUTF8_iso8859_1()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * With iconv, characters that have no ISO-8859-1 form are dropped.
     * Skipped when iconv is unavailable.
     */
    public function test_convertFromUTF8_iconvNoChars()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            " (Chinese)"
        );
    }

    /**
     * With Test.ForceNoIconv set, a representable character still converts
     * to its ISO-8859-1 byte.
     */
    public function test_convertFromUTF8_phpNormal()
    {
        // Plain PHP implementation has slightly different behavior
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * With Test.ForceNoIconv set, each unrepresentable character becomes
     * a '?' instead of being dropped.
     */
    public function test_convertFromUTF8_phpNoChars()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "?? (Chinese)"
        );
    }

    /**
     * With Core.EscapeNonASCIICharacters enabled, non-ASCII characters are
     * emitted as numeric character references when targeting ISO-8859-1.
     */
    public function test_convertFromUTF8_withProtection()
    {
        // Preserve the characters!
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * The same escaping applies when Core.Encoding is not overridden.
     */
    public function test_convertFromUTF8_withProtectionButUtf8()
    {
        // Preserve the characters!
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * convertToASCIIDumbLossless() leaves ASCII alone and turns multi-byte
     * sequences into decimal numeric character references.
     */
    public function test_convertToASCIIDumbLossless()
    {
        // Uppercase thorn letter
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
            "&#222;orn"
        );

        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
            "an"
        );

        // test up to four bytes
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
            "&#917536;"
        );
    }

    /**
     * Asserts that testEncodingSupportsASCII() returns $ret for $enc, both
     * with its default arguments and with the second argument set to true.
     * Does nothing when the probe call with the second argument set to true
     * returns false.
     *
     * @param string $enc Encoding name to check.
     * @param array  $ret Expected return value.
     */
    public function assertASCIISupportCheck($enc, $ret)
    {
        $test = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
        if ($test === false) {
            return;
        }
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc),
            $ret
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true),
            $ret
        );
    }

    /**
     * Checks the ASCII-support maps for several encodings. The Shift_JIS
     * and JOHAB checks only run when iconv is available.
     */
    public function test_testEncodingSupportsASCII()
    {
        if (HTMLPurifier_Encoder::iconvAvailable()) {
            $this->assertASCIISupportCheck('Shift_JIS', array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'));
            $this->assertASCIISupportCheck('JOHAB', array("\xE2\x82\xA9" => '\\'));
        }
        $this->assertASCIISupportCheck('ISO-8859-1', array());
        $this->assertASCIISupportCheck('dontexist', array()); // canary
    }

    /**
     * Backslash and tilde pass through unchanged in both directions when
     * Core.Encoding is Shift_JIS. Skipped when iconv is unavailable.
     */
    public function testShiftJIS()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'Shift_JIS');
        // This actually looks like a Yen, but we're going to treat it differently
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
    }

    /**
     * A long run of ASCII after an unrepresentable character is returned
     * in full. Runs only when iconv is available and testIconvTruncateBug()
     * reports ICONV_TRUNCATES.
     */
    public function testIconvTruncateBug()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
            return;
        }
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xE4\xB8\xAD" . str_repeat('a', 10000), $this->config, $this->context),
            str_repeat('a', 10000)
        );
    }

    /**
     * HTMLPurifier_Encoder::iconv() drops multi-byte characters cleanly
     * wherever they sit among the surrounding ASCII. Same preconditions as
     * testIconvTruncateBug().
     *
     * NOTE(review): the fourth argument (4) is presumably the chunk size
     * in bytes - confirm against HTMLPurifier_Encoder::iconv().
     */
    public function testIconvChunking()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
            return;
        }
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "a\xF3\xA0\x80\xA0b", 4), 'ab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aa\xE4\xB8\xADb", 4), 'aab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaa\xCE\xB1b", 4), 'aaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xF3\xA0\x80\xA0b", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xE4\xB8\xADb", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xCE\xB1b", 4), 'aaaab');
    }

}
// vim: et sw=4 sts=4