Improve auto-paragraph to preserve newlines and handle edge-cases better.
[htmlpurifier.git] / tests / HTMLPurifier / EncoderTest.php
blob03263e7c5493ba2ea252ee2290be348bdbaaf267
1 <?php
// Test case for the static helpers of HTMLPurifier_Encoder that are called
// in this file: cleanUTF8(), convertToUTF8(), convertFromUTF8(),
// convertToASCIIDumbLossless() and testEncodingSupportsASCII().
3 class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
// Assigned in setUp() from HTMLPurifier_EntityLookup::instance(); no test
// visible in this file reads it afterwards.
6 protected $_entity_lookup;
// Per-test fixture: stores the entity-lookup instance on the test case and
// then runs the parent harness's own setUp().
function setUp() {
    $lookup = HTMLPurifier_EntityLookup::instance();
    $this->_entity_lookup = $lookup;
    parent::setUp();
}
// Asserts that HTMLPurifier_Encoder::cleanUTF8() turns $string into $expect.
//
// $string  raw input handed to cleanUTF8()
// $expect  expected cleaned output; when null, the input is expected to come
//          back unchanged
//
// The cleaner is invoked twice: once with its default arguments (failure
// label 'iconv: %s') and once with true as the second argument (failure
// label 'PHP: %s').
// NOTE(review): the labels suggest the flag forces a pure-PHP code path;
// confirm against HTMLPurifier_Encoder::cleanUTF8().
function assertCleanUTF8($string, $expect = null) {
    $expected = ($expect === null) ? $string : $expect;

    $defaultResult = HTMLPurifier_Encoder::cleanUTF8($string);
    $this->assertIdentical($defaultResult, $expected, 'iconv: %s');

    $forcedResult = HTMLPurifier_Encoder::cleanUTF8($string, true);
    $this->assertIdentical($forcedResult, $expected, 'PHP: %s');
}
// Runs cleanUTF8() over a fixed list of inputs via assertCleanUTF8().
// A one-element entry means the input must survive unchanged; a two-element
// entry gives the expected cleaned output.
function test_cleanUTF8() {
    $cases = array(
        array('Normal string.'),
        array("Test\tAllowed\nControl\rCharacters"),
        array("null byte: \0", 'null byte: '),
        array("\1\2\3\4\5\6\7", ''),
        array("\x7F", ''),             // one byte invalid SGML char
        array("\xC2\x80", ''),         // two byte invalid SGML
        array("\xF3\xBF\xBF\xBF"),     // valid four byte
        array("\xDF\xFF", ''),         // malformed UTF8
        array("\xED\xB0\x80", ''),     // invalid codepoints
    );

    foreach ($cases as $case) {
        // A missing second element is passed as null, which is the helper's
        // own default and means "expect the input unchanged".
        $expected = isset($case[1]) ? $case[1] : null;
        $this->assertCleanUTF8($case[0], $expected);
    }
}
// With no encoding configured in this test, convertToUTF8() must hand the
// input back untouched, even though the byte used here is not valid UTF-8.
function test_convertToUTF8_noConvert() {
    // UTF-8 means that we don't touch it
    $input = "\xF6"; // this is invalid
    $actual = HTMLPurifier_Encoder::convertToUTF8($input, $this->config, $this->context);
    $this->assertIdentical($actual, "\xF6", 'Expected identical [Binary: F6]');
}
// Sets Core.Encoding to the spelling 'utf8' and expects convertToUTF8() to
// raise the error 'Invalid encoding utf8'.
41 function test_convertToUTF8_spuriousEncoding() {
42 // We don't support this as UTF-8, because UTF-8 is the default and
43 // shouldn't be set if not necessary.
44 $this->config->set('Core', 'Encoding', 'utf8');
// The expected error is registered before the conversion call that should
// trigger it.
45 $this->expectError('Invalid encoding utf8');
46 $this->assertIdentical(
47 HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
// NOTE(review): the second (expected-value) argument of assertIdentical() and
// the closing punctuation are missing from this copy of the file; restore
// them from the upstream source before running this test.
// With Core.Encoding set to ISO-8859-1, the single byte F6 must be converted
// to the two-byte sequence C3 B6.
function test_convertToUTF8_iso8859_1() {
    $this->config->set('Core', 'Encoding', 'ISO-8859-1');
    $converted = HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context);
    $this->assertIdentical($converted, "\xC3\xB6");
}
// Same ISO-8859-1 to UTF-8 expectation as the plain ISO-8859-1 test, but with
// Test.ForceNoIconv switched on.
// NOTE(review): the directive name suggests it disables the iconv code path;
// confirm against HTMLPurifier_Encoder.
function test_convertToUTF8_withoutIconv() {
    $this->config->set('Core', 'Encoding', 'ISO-8859-1');
    $this->config->set('Test', 'ForceNoIconv', true);
    $converted = HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context);
    $this->assertIdentical($converted, "\xC3\xB6");
}
// Returns the fixture string shared by the convertFromUTF8 tests: six
// non-ASCII bytes (two three-byte UTF-8 sequences) followed by the ASCII
// text " (Chinese)".
function getZhongWen() {
    $sample = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
    return $sample;
}
// With no encoding configured in this test, convertFromUTF8() must return
// the UTF-8 input byte-for-byte.
function test_convertFromUTF8_utf8() {
    // UTF-8 means that we don't touch it
    $utf8 = "\xC3\xB6";
    $result = HTMLPurifier_Encoder::convertFromUTF8($utf8, $this->config, $this->context);
    $this->assertIdentical($result, "\xC3\xB6");
}
// With Core.Encoding set to ISO-8859-1, the UTF-8 sequence C3 B6 must be
// converted to the single byte F6.
function test_convertFromUTF8_iso8859_1() {
    $this->config->set('Core', 'Encoding', 'ISO-8859-1');
    $result = HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context);
    $this->assertIdentical($result, "\xF6", 'Expected identical [Binary: F6]');
}
// When iconv is available and the target is ISO-8859-1, the non-ASCII bytes
// of the fixture are expected to vanish, leaving only " (Chinese)".
// The test is skipped silently when the iconv() function does not exist.
function test_convertFromUTF8_iconvNoChars() {
    if (!function_exists('iconv')) {
        return;
    }
    $this->config->set('Core', 'Encoding', 'ISO-8859-1');
    $source = $this->getZhongWen();
    $result = HTMLPurifier_Encoder::convertFromUTF8($source, $this->config, $this->context);
    $this->assertIdentical($result, " (Chinese)");
}
// UTF-8 to ISO-8859-1 conversion of a representable character with
// Test.ForceNoIconv enabled: C3 B6 must become the single byte F6.
function test_convertFromUTF8_phpNormal() {
    // Plain PHP implementation has slightly different behavior
    $this->config->set('Core', 'Encoding', 'ISO-8859-1');
    $this->config->set('Test', 'ForceNoIconv', true);
    $result = HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context);
    $this->assertIdentical($result, "\xF6", 'Expected identical [Binary: F6]');
}
// With Test.ForceNoIconv enabled and ISO-8859-1 as the target, the fixture
// is expected to come back as "?? (Chinese)", i.e. with question marks where
// the iconv variant of this test expects nothing.
function test_convertFromUTF8_phpNoChars() {
    $this->config->set('Core', 'Encoding', 'ISO-8859-1');
    $this->config->set('Test', 'ForceNoIconv', true);
    $source = $this->getZhongWen();
    $result = HTMLPurifier_Encoder::convertFromUTF8($source, $this->config, $this->context);
    $this->assertIdentical($result, "?? (Chinese)");
}
// With Core.EscapeNonASCIICharacters enabled, the fixture's non-ASCII
// characters must be emitted as decimal numeric entities instead of being
// lost in the ISO-8859-1 conversion.
function test_convertFromUTF8_withProtection() {
    // Preserve the characters!
    $this->config->set('Core', 'Encoding', 'ISO-8859-1');
    $this->config->set('Core', 'EscapeNonASCIICharacters', true);
    $source = $this->getZhongWen();
    $result = HTMLPurifier_Encoder::convertFromUTF8($source, $this->config, $this->context);
    $this->assertIdentical($result, "&#20013;&#25991; (Chinese)");
}
// Checks convertToASCIIDumbLossless() against a small input => expected
// table: a two-byte character, a pure-ASCII string and a four-byte character.
function test_convertToASCIIDumbLossless() {
    $expectations = array(
        // Uppercase thorn letter
        "\xC3\x9Eorn" => "&#222;orn",
        "an" => "an",
        // test up to four bytes
        "\xF3\xA0\x80\xA0" => "&#917536;",
    );

    foreach ($expectations as $input => $expected) {
        $actual = HTMLPurifier_Encoder::convertToASCIIDumbLossless($input);
        $this->assertIdentical($actual, $expected);
    }
}
// Asserts that testEncodingSupportsASCII($enc) reports $ret, both with its
// default arguments and with true as the second argument.
//
// $enc  encoding name to probe
// $ret  expected return value of the probe
//
// If the probe called with true returns false, the helper returns early and
// asserts nothing.
// NOTE(review): that early return presumably covers environments where the
// encoding cannot be tested; confirm against testEncodingSupportsASCII().
function assertASCIISupportCheck($enc, $ret) {
    $probe = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
    if ($probe === false) {
        return;
    }

    $defaultResult = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc);
    $this->assertIdentical($defaultResult, $ret);

    $flaggedResult = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
    $this->assertIdentical($flaggedResult, $ret);
}
// Feeds a table of encoding name => expected result through
// assertASCIISupportCheck(). The expected arrays map a byte sequence to a
// single ASCII character.
function test_testEncodingSupportsASCII() {
    $expectedByEncoding = array(
        'Shift_JIS'  => array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'),
        'JOHAB'      => array("\xE2\x82\xA9" => '\\'),
        'ISO-8859-1' => array(),
        'dontexist'  => array(), // canary
    );

    foreach ($expectedByEncoding as $encoding => $expected) {
        $this->assertASCIISupportCheck($encoding, $expected);
    }
}
// With Core.Encoding set to Shift_JIS, a backslash followed by a tilde must
// come through both conversion directions unchanged.
// The test is skipped silently when the iconv() function does not exist.
function testShiftJIS() {
    if (!function_exists('iconv')) {
        return;
    }
    $this->config->set('Core', 'Encoding', 'Shift_JIS');

    // This actually looks like a Yen, but we're going to treat it differently
    $fromUtf8 = HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context);
    $this->assertIdentical($fromUtf8, '\\~');

    $toUtf8 = HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context);
    $this->assertIdentical($toUtf8, '\\~');
}