PSR-2 reformatting PHPDoc corrections
[htmlpurifier.git] / tests / HTMLPurifier / EncoderTest.php
blob819d4b117745024cece272a6c01666ba7f5aa0dc
<?php

/**
 * Tests for the static helpers on HTMLPurifier_Encoder: UTF-8 cleaning,
 * conversion to and from UTF-8, lossless ASCII escaping, ASCII-support
 * detection, and the iconv truncation/chunking workarounds.
 */
class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
{

    /**
     * Value returned by HTMLPurifier_EntityLookup::instance(); assigned in
     * setUp() before every test.
     */
    protected $_entity_lookup;

    /**
     * Fetches the entity lookup and then runs the parent harness set-up.
     */
    public function setUp()
    {
        $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        parent::setUp();
    }

    /**
     * Asserts that HTMLPurifier_Encoder::cleanUTF8() turns $string into
     * $expect, both with the default call and with the second argument set
     * to true.
     *
     * NOTE(review): going by the 'iconv: %s' / 'PHP: %s' failure labels, the
     * second argument presumably forces the pure-PHP code path -- confirm
     * against HTMLPurifier_Encoder::cleanUTF8().
     *
     * @param string      $string Input handed to cleanUTF8().
     * @param string|null $expect Expected result; null means the input is
     *                            expected to come back unchanged.
     */
    public function assertCleanUTF8($string, $expect = null)
    {
        if ($expect === null) {
            $expect = $string;
        }
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string), $expect, 'iconv: %s');
        $this->assertIdentical(HTMLPurifier_Encoder::cleanUTF8($string, true), $expect, 'PHP: %s');
    }

    /**
     * Checks which byte sequences cleanUTF8() keeps and which it strips.
     */
    public function test_cleanUTF8()
    {
        $this->assertCleanUTF8('Normal string.');
        $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
        $this->assertCleanUTF8("null byte: \0", 'null byte: ');
        $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
        $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
        $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
        $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
        $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
        // invalid codepoints
        $this->assertCleanUTF8("\xED\xB0\x80", '');
    }

    /**
     * With the encoding left at its default, convertToUTF8() must hand the
     * input back untouched, even when it is not valid UTF-8.
     */
    public function test_convertToUTF8_noConvert()
    {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xF6", // this is invalid
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * An unknown Core.Encoding value must raise 'Invalid encoding utf99'.
     * Skipped when iconv is unavailable.
     */
    public function test_convertToUTF8_spuriousEncoding()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'utf99');
        $this->expectError('Invalid encoding utf99');
        // NOTE(review): the expected value sat on a line missing from the
        // text this block was rebuilt from; '' is the reconstruction --
        // confirm against the canonical test file.
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            ''
        );
    }

    /**
     * ISO-8859-1 byte F6 must become the two-byte UTF-8 sequence C3 B6.
     */
    public function test_convertToUTF8_iso8859_1()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * Same ISO-8859-1 to UTF-8 conversion, with Test.ForceNoIconv enabled.
     */
    public function test_convertToUTF8_withoutIconv()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * Fixture: the UTF-8 bytes for U+4E2D U+6587 followed by " (Chinese)".
     * Neither code point exists in ISO-8859-1.
     *
     * @return string
     */
    public function getZhongWen()
    {
        return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
    }

    /**
     * With the encoding left at its default, convertFromUTF8() must return
     * the input unchanged.
     */
    public function test_convertFromUTF8_utf8()
    {
        // UTF-8 means that we don't touch it
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xC3\xB6"
        );
    }

    /**
     * UTF-8 sequence C3 B6 must become the single ISO-8859-1 byte F6.
     */
    public function test_convertFromUTF8_iso8859_1()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * Characters with no ISO-8859-1 equivalent are expected to vanish from
     * the output on the iconv path. Skipped when iconv is unavailable.
     */
    public function test_convertFromUTF8_iconvNoChars()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            " (Chinese)"
        );
    }

    /**
     * UTF-8 to ISO-8859-1 of a representable character with
     * Test.ForceNoIconv enabled.
     */
    public function test_convertFromUTF8_phpNormal()
    {
        // Plain PHP implementation has slightly different behavior
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
            "\xF6",
            'Expected identical [Binary: F6]'
        );
    }

    /**
     * With Test.ForceNoIconv enabled, each unrepresentable character is
     * expected to come out as '?' rather than being dropped.
     */
    public function test_convertFromUTF8_phpNoChars()
    {
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Test.ForceNoIconv', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "?? (Chinese)"
        );
    }

    /**
     * With Core.EscapeNonASCIICharacters on, unrepresentable characters are
     * expected as decimal numeric character references.
     */
    public function test_convertFromUTF8_withProtection()
    {
        // Preserve the characters!
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * Core.EscapeNonASCIICharacters must also escape when Core.Encoding is
     * left at its default.
     */
    public function test_convertFromUTF8_withProtectionButUtf8()
    {
        // Preserve the characters!
        $this->config->set('Core.EscapeNonASCIICharacters', true);
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
            "&#20013;&#25991; (Chinese)"
        );
    }

    /**
     * convertToASCIIDumbLossless() must replace every non-ASCII character
     * with a decimal numeric character reference and leave ASCII alone.
     */
    public function test_convertToASCIIDumbLossless()
    {
        // Uppercase thorn letter
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
            "&#222;orn"
        );

        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
            "an"
        );

        // test up to four bytes
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
            "&#917536;"
        );
    }

    /**
     * Asserts that testEncodingSupportsASCII($enc) yields $ret, both with
     * and without its second argument set to true.
     *
     * Returns without asserting anything when the call with the second
     * argument set to true yields false.
     *
     * @param string $enc Encoding name to probe.
     * @param array  $ret Expected result of the probe.
     */
    public function assertASCIISupportCheck($enc, $ret)
    {
        $test = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
        if ($test === false) {
            return;
        }
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc),
            $ret
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true),
            $ret
        );
    }

    /**
     * Probes several encodings; the Shift_JIS and JOHAB checks only run when
     * iconv is available.
     */
    public function test_testEncodingSupportsASCII()
    {
        if (HTMLPurifier_Encoder::iconvAvailable()) {
            $this->assertASCIISupportCheck('Shift_JIS', array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'));
            $this->assertASCIISupportCheck('JOHAB', array("\xE2\x82\xA9" => '\\'));
        }
        $this->assertASCIISupportCheck('ISO-8859-1', array());
        $this->assertASCIISupportCheck('dontexist', array()); // canary
    }

    /**
     * Backslash and tilde must pass through unchanged in both directions
     * when Core.Encoding is Shift_JIS. Skipped when iconv is unavailable.
     */
    public function testShiftJIS()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        $this->config->set('Core.Encoding', 'Shift_JIS');
        // This actually looks like a Yen, but we're going to treat it differently
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context),
            '\\~'
        );
    }

    /**
     * When testIconvTruncateBug() reports ICONV_TRUNCATES, text following an
     * unconvertible character must still be converted in full. Skipped when
     * iconv is unavailable or the bug is not reported.
     */
    public function testIconvTruncateBug()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
            return;
        }
        $this->config->set('Core.Encoding', 'ISO-8859-1');
        $this->assertIdentical(
            HTMLPurifier_Encoder::convertFromUTF8(
                "\xE4\xB8\xAD" . str_repeat('a', 10000),
                $this->config,
                $this->context
            ),
            str_repeat('a', 10000)
        );
    }

    /**
     * Exercises HTMLPurifier_Encoder::iconv() with a fourth argument of 4 on
     * inputs whose multi-byte characters sit at different offsets. Skipped
     * when iconv is unavailable or the truncation bug is not reported.
     *
     * NOTE(review): the fourth argument is presumably the maximum chunk
     * length in bytes -- confirm against HTMLPurifier_Encoder::iconv().
     */
    public function testIconvChunking()
    {
        if (!HTMLPurifier_Encoder::iconvAvailable()) {
            return;
        }
        if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
            return;
        }
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "a\xF3\xA0\x80\xA0b", 4), 'ab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aa\xE4\xB8\xADb", 4), 'aab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaa\xCE\xB1b", 4), 'aaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xF3\xA0\x80\xA0b", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xE4\xB8\xADb", 4), 'aaaab');
        $this->assertIdentical(HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', "aaaa\xCE\xB1b", 4), 'aaaab');
    }

}

// vim: et sw=4 sts=4