6 * @copyright (c) 2005 phpBB Group
7 * @license http://opensource.org/licenses/gpl-license.php GNU Public License
if (php_sapi_name() != 'cli')
{
	die("This program must be run from the command line.\n");
}

// This script is potentially dangerous.
// Remove or comment the next line (die(".... ) to enable this script.
// Do NOT FORGET to either remove this script or disable it after you have used it.
die("Please read the first lines of this script for instructions on how to enable it");

error_reporting(E_ALL);

define('IN_PHPBB', true);
define('PHPBB_ROOT_PATH', './../');
define('PHP_EXT', substr(strrchr(__FILE__, '.'), 1));

/**
* Let's download some files we need
*/
download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt');
download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');

/**
* Those are the tests we run
*
* Layout: normalization form => (expected column => list of input columns).
* The form name doubles as the utf_normalizer method that gets called; PHP
* resolves method names case-insensitively.
*/
$test_suite = array(
	/**
	* NFC
	*   c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
	*   c4 ==  NFC(c4) ==  NFC(c5)
	*/
	'NFC'	=> array(
		'c2'	=> array('c1', 'c2', 'c3'),
		'c4'	=> array('c4', 'c5')
	),

	/**
	* NFD
	*   c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
	*   c5 ==  NFD(c4) ==  NFD(c5)
	*/
	'NFD'	=> array(
		'c3'	=> array('c1', 'c2', 'c3'),
		'c5'	=> array('c4', 'c5')
	),

	/**
	* NFKC
	*   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
	*/
	'NFKC'	=> array(
		'c4'	=> array('c1', 'c2', 'c3', 'c4', 'c5')
	),

	/**
	* NFKD
	*   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
	*/
	'NFKD'	=> array(
		'c5'	=> array('c1', 'c2', 'c3', 'c4', 'c5')
	)
);

require_once(PHPBB_ROOT_PATH . 'includes/utf/utf_normalizer.' . PHP_EXT);

$i = $n = 0;
$failed = false;
$tested_chars = array();

$fp = fopen(PHPBB_ROOT_PATH . 'develop/NormalizationTest.txt', 'rb');

if (!$fp)
{
	die("Can't open develop/NormalizationTest.txt for input\n");
}

// Loop on the result of fgets() rather than feof() so that the final failed
// read is never processed as if it were a line.
while (($line = fgets($fp)) !== false)
{
	++$n;

	// Lines starting with '@' introduce a new part of the test file
	if ($line[0] == '@')
	{
		if ($i)
		{
			echo "done\n";
		}

		$i = 0;
		echo "\n", substr($line, 1), "\n\n";
		continue;
	}

	// Only lines starting with an uppercase hex digit carry test data. The
	// leading space in the haystack keeps every valid match at offset >= 1,
	// so a falsy strpos() result always means "not a data line".
	if (!strpos(' 0123456789ABCDEF', $line[0]))
	{
		continue;
	}

	// Progress indicator
	if (++$i % 100 == 0)
	{
		echo $i, ' ';
	}

	list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);
	$columns = compact('c1', 'c2', 'c3', 'c4', 'c5');

	if (!strpos($c1, ' '))
	{
		/**
		* We are currently testing a single character, we add it to the list of
		* characters we have processed so that we can exclude it when testing
		* for invariants
		*/
		$tested_chars[$c1] = 1;
	}

	foreach ($test_suite as $form => $serie)
	{
		foreach ($serie as $expected => $tests)
		{
			$hex_expected = $columns[$expected];
			$utf_expected = hexseq_to_utf($hex_expected);

			foreach ($tests as $test)
			{
				// Feed the *input* column to the normalizer; starting from the
				// expected value would only ever check that it is stable.
				$utf_result = hexseq_to_utf($columns[$test]);

				// The result is read back from $utf_result, so the argument has
				// to reach the method by reference. call_user_func() passes by
				// value; call_user_func_array() keeps the reference.
				call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

				if (strcmp($utf_expected, $utf_result))
				{
					$failed = true;
					$hex_result = utf_to_hexseq($utf_result);

					echo "\nFAILED $expected == $form($test) ($hex_expected != $hex_result)";
				}
			}
		}

		if ($failed)
		{
			die("\n\nFailed at line $n\n");
		}
	}
}
fclose($fp);

/**
* Test for invariants
*
* Every codepoint listed in UnicodeData.txt that was not seen as a single
* character in the first column of NormalizationTest.txt is run through all
* four forms and is expected to come back unchanged.
*/
echo "\n\nTesting for invariants...\n\n";

$fp = fopen(PHPBB_ROOT_PATH . 'develop/UnicodeData.txt', 'rt');

if (!$fp)
{
	die("Can't open develop/UnicodeData.txt for input\n");
}

$n = 0;
while (($line = fgets($fp, 1024)) !== false)
{
	// Progress indicator
	if (++$n % 100 == 0)
	{
		echo $n, ' ';
	}

	// The codepoint is everything before the first ';'
	if (!$pos = strpos($line, ';'))
	{
		continue;
	}

	$hex_tested = $hex_expected = substr($line, 0, $pos);

	if (isset($tested_chars[$hex_tested]))
	{
		continue;
	}

	$utf_tested = $utf_expected = hex_to_utf($hex_tested);

	if ($utf_expected >= utf_normalizer::UTF8_SURROGATE_FIRST
	 && $utf_expected <= utf_normalizer::UTF8_SURROGATE_LAST)
	{
		/**
		* Surrogates are illegal on their own, we expect the normalizer
		* to return a replacement char
		*/
		$utf_expected = utf_normalizer::UTF8_REPLACEMENT;
		$hex_expected = utf_to_hexseq($utf_expected);
	}

	foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
	{
		// Always start from the tested character, so that a surrogate is what
		// actually gets normalized rather than its expected replacement.
		$utf_result = $utf_tested;
		call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));
		$hex_result = utf_to_hexseq($utf_result);
//		echo "$form($utf_expected) == $utf_result\n";

		if (strcmp($utf_expected, $utf_result))
		{
			$failed = true;

			echo "\nFAILED $hex_expected == $form($hex_tested) ($hex_expected != $hex_result)";
		}
	}

	if ($failed)
	{
		die("\n\nFailed at line $n\n");
	}
}
fclose($fp);

die("\n\nALL TESTS PASSED SUCCESSFULLY\n");
/**
* Download a file to the develop/ dir
*
* Does nothing when a file with the same base name already exists there.
* Progress is echoed as a running KiB counter, and the script dies on any
* failure. A partially written file is removed before dying so that the
* "already exists" shortcut cannot mistake it for a complete download on the
* next run.
*
* @param	string	$url		URL of the file to download
* @return	void
*/
function download($url)
{
	$target = PHPBB_ROOT_PATH . 'develop/' . basename($url);

	if (file_exists($target))
	{
		return;
	}

	echo 'Downloading from ', $url, ' ';

	if (!$fpr = fopen($url, 'rb'))
	{
		die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
	}

	if (!$fpw = fopen($target, 'wb'))
	{
		fclose($fpr);
		die("Can't open develop/" . basename($url) . " for output... please check your permissions or something");
	}

	$i = 0;
	$chunk = 32768;
	$done = '';

	while (!feof($fpr))
	{
		$data = fread($fpr, $chunk);
		$written = ($data === false) ? false : fwrite($fpw, $data);

		if ($written === false || $written < strlen($data))
		{
			fclose($fpr);
			fclose($fpw);
			unlink($target);

			die("\nDownload of $url failed, the incomplete file has been removed\n");
		}

		$i += $written;

		// Backspace over the previous counter before printing the new one
		echo str_repeat("\x08", strlen($done));

		$done = ($i >> 10) . ' KiB';
		echo $done;
	}
	fclose($fpr);
	fclose($fpw);

	echo "\n";
}
/**
* Convert a UTF string to a sequence of codepoints in hexadecimal
*
* Each codepoint is written in uppercase hex, left-padded with zeros to at
* least four digits, and codepoints are separated by a single space. An empty
* string yields an empty string.
*
* @param	string	$str		UTF string
* @return	string				Unicode codepoints in hex, separated with a space
*/
function utf_to_hexseq($str)
{
	$pos = 0;
	$len = strlen($str);
	$ret = array();

	while ($pos < $len)
	{
		// The lead byte tells how many bytes the character spans; anything
		// below 0xC0 is taken as a single byte.
		$lead = ord($str[$pos]);

		if ($lead >= 0xF0)
		{
			$size = 4;
		}
		else if ($lead >= 0xE0)
		{
			$size = 3;
		}
		else if ($lead >= 0xC0)
		{
			$size = 2;
		}
		else
		{
			$size = 1;
		}

		$utf_char = substr($str, $pos, $size);
		$pos += $size;

		// %04X: uppercase hex, zero-padded to a minimum of four digits
		$ret[] = sprintf('%04X', utf_to_cp($utf_char));
	}

	return implode(' ', $ret);
}
/**
* Convert a UTF-8 char to its codepoint
*
* The byte length of the argument selects the decoding formula. The bytes are
* not validated, only masked and shifted. Any length outside 1-4 terminates
* the script.
*
* @param	string	$utf_char	UTF-8 char
* @return	integer				Unicode codepoint
*/
function utf_to_cp($utf_char)
{
	switch (strlen($utf_char))
	{
		case 1:
			return ord($utf_char);

		case 2:
			// 5 payload bits from the lead byte, 6 from the trail byte
			return ((ord($utf_char[0]) & 0x1F) << 6)
				| (ord($utf_char[1]) & 0x3F);

		case 3:
			// 4 payload bits from the lead byte, 6 from each trail byte
			return ((ord($utf_char[0]) & 0x0F) << 12)
				| ((ord($utf_char[1]) & 0x3F) << 6)
				| (ord($utf_char[2]) & 0x3F);

		case 4:
			// 3 payload bits from the lead byte, 6 from each trail byte
			return ((ord($utf_char[0]) & 0x07) << 18)
				| ((ord($utf_char[1]) & 0x3F) << 12)
				| ((ord($utf_char[2]) & 0x3F) << 6)
				| (ord($utf_char[3]) & 0x3F);

		default:
			die('UTF-8 chars can only be 1-4 bytes long');
	}
}
/**
* Return a UTF string formed from a sequence of codepoints in hexadecimal
*
* Empty tokens (an empty sequence, or doubled/leading/trailing spaces) are
* skipped rather than being encoded as U+0000.
*
* @param	string	$seq		Sequence of codepoints, separated with a space
* @return	string				UTF-8 string
*/
function hexseq_to_utf($seq)
{
	$utf = '';

	foreach (explode(' ', $seq) as $hex)
	{
		// explode() returns '' for an empty input and between adjacent
		// separators; hexdec('') is 0, which would insert a NUL character.
		if ($hex === '')
		{
			continue;
		}

		$utf .= hex_to_utf($hex);
	}

	return $utf;
}
/**
* Convert a codepoint in hexadecimal to a UTF-8 char
*
* Parses the hex string with hexdec() and hands the resulting number to
* cp_to_utf().
*
* @param	string	$hex		Codepoint, in hexadecimal
* @return	string				UTF-8 char
*/
function hex_to_utf($hex)
{
	return cp_to_utf(hexdec($hex));
}
366 * Convert a codepoint to a UTF-8 char
368 * @param integer $cp Unicode codepoint
369 * @return string UTF-8 string
371 function cp_to_utf($cp)
375 return chr(0xF0 |
($cp >> 18)) . chr(0x80 |
(($cp >> 12) & 0x3F)) . chr(0x80 |
(($cp >> 6) & 0x3F)) . chr(0x80 |
($cp & 0x3F));
377 else if ($cp > 0x7FF)
379 return chr(0xE0 |
($cp >> 12)) . chr(0x80 |
(($cp >> 6) & 0x3F)) . chr(0x80 |
($cp & 0x3F));
383 return chr(0xC0 |
($cp >> 6)) . chr(0x80 |
($cp & 0x3F));