Update code_sniffer build.xml file to be executable on our system
[phpbb.git] / phpBB / develop / utf_normalizer_test.php
blob8a7c31a1b0d59de61808196ffe0148aed7f9c77e
<?php
/**
*
* @package phpBB3
* @version $Id$
* @copyright (c) 2005 phpBB Group
* @license http://opensource.org/licenses/gpl-license.php GNU Public License
*
*/

// This is a command line tool, refuse to run under any other SAPI
if (PHP_SAPI !== 'cli')
{
	die("This program must be run from the command line.\n");
}

//
// Security message:
//
// This script is potentially dangerous.
// It stays disabled until the die() call right below is removed or commented out.
// Do NOT FORGET to either remove this script or disable it again once you are done with it.
//
die("Please read the first lines of this script for instructions on how to enable it");

// Report everything and lift the execution time limit for the whole run
error_reporting(E_ALL);
set_time_limit(0);

define('IN_PHPBB', true);
define('PHPBB_ROOT_PATH', './../');

// Extension of the current file (whatever follows the last dot of its path)
define('PHP_EXT', substr(strrchr(__FILE__, '.'), 1));
/**
* Fetch the Unicode data files used by the tests
*/
$unicode_data_urls = array(
	'http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt',
	'http://www.unicode.org/Public/UNIDATA/UnicodeData.txt',
);

foreach ($unicode_data_urls as $unicode_data_url)
{
	download($unicode_data_url);
}
/**
* Tests to run, indexed by normalization form
*
* Within a form, each key names the column (c1 to c5) of a test line that
* holds the expected result, and its value lists the columns which have to
* produce that result once normalized.
*/
$test_suite = array();

// NFC
// c2 == NFC(c1) == NFC(c2) == NFC(c3)
// c4 == NFC(c4) == NFC(c5)
$test_suite['NFC'] = array(
	'c2'	=> array('c1', 'c2', 'c3'),
	'c4'	=> array('c4', 'c5'),
);

// NFD
// c3 == NFD(c1) == NFD(c2) == NFD(c3)
// c5 == NFD(c4) == NFD(c5)
$test_suite['NFD'] = array(
	'c3'	=> array('c1', 'c2', 'c3'),
	'c5'	=> array('c4', 'c5'),
);

// NFKC
// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
$test_suite['NFKC'] = array(
	'c4'	=> array('c1', 'c2', 'c3', 'c4', 'c5'),
);

// NFKD
// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
$test_suite['NFKD'] = array(
	'c5'	=> array('c1', 'c2', 'c3', 'c4', 'c5'),
);
require_once(PHPBB_ROOT_PATH . 'includes/utf/utf_normalizer.' . PHP_EXT);

/**
* Run the conformance tests of NormalizationTest.txt
*
* $i counts the test lines of the current part of the file, $n counts every
* line read. $tested_chars collects the single characters found in the first
* column, it is used later on to skip them when testing for invariants.
*/
$i = $n = 0;
$failed = false;
$tested_chars = array();

$fp = fopen(PHPBB_ROOT_PATH . 'develop/NormalizationTest.txt', 'rb');

if (!$fp)
{
	die("Can't open develop/NormalizationTest.txt for reading\n");
}

// fgets() returns false once the file is exhausted, every $line seen in the
// loop is therefore a non-empty string and $line[0] is always defined
while (($line = fgets($fp)) !== false)
{
	++$n;

	if ($line[0] == '@')
	{
		// A line starting with @ opens a new part of the file
		if ($i)
		{
			echo "done\n";
		}

		$i = 0;
		echo "\n", substr($line, 1), "\n\n";
		continue;
	}

	// Only lines starting with a hexadecimal digit are tests. The leading
	// space of the haystack keeps the position of a matching digit above 0
	if (!strpos(' 0123456789ABCDEF', $line[0]))
	{
		continue;
	}

	// Progress indicator
	if (++$i % 100 == 0)
	{
		echo $i, ' ';
	}

	list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);

	if (!strpos($c1, ' '))
	{
		/**
		* We are currently testing a single character, we add it to the list of
		* characters we have processed so that we can exclude it when testing
		* for invariants
		*/
		$tested_chars[$c1] = 1;
	}

	foreach ($test_suite as $form => $serie)
	{
		foreach ($serie as $expected => $tests)
		{
			// $expected and $test hold column names ('c1' to 'c5'), the
			// matching hex sequences are read through variable variables
			$hex_expected = ${$expected};
			$utf_expected = hexseq_to_utf($hex_expected);

			foreach ($tests as $test)
			{
				// The string to normalize is the tested column, not the expected one
				$utf_result = hexseq_to_utf(${$test});

				// The result is read back from $utf_result, so the normalizer is
				// assumed to work in place: the argument has to be handed over by
				// reference, which call_user_func() is not able to do
				call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

				if (strcmp($utf_expected, $utf_result))
				{
					$failed = true;
					$hex_result = utf_to_hexseq($utf_result);

					echo "\nFAILED $expected == $form($test) ($hex_expected != $hex_result)";
				}
			}
		}

		// Stop once all the failures of the current form have been reported
		if ($failed)
		{
			die("\n\nFailed at line $n\n");
		}
	}
}
fclose($fp);
/**
* Test for invariants
*
* Every character listed in UnicodeData.txt which has not already been tested
* as a single character must come out of the four normalization forms
* unchanged. The only exception are surrogates, see below.
*/
echo "\n\nTesting for invariants...\n\n";

$fp = fopen(PHPBB_ROOT_PATH . 'develop/UnicodeData.txt', 'rt');

if (!$fp)
{
	die("Can't open develop/UnicodeData.txt for reading\n");
}

$n = 0;

// fgets() returns false at the end of the file, which ends the loop without
// processing a bogus empty line
while (($line = fgets($fp, 1024)) !== false)
{
	// Progress indicator
	if (++$n % 100 == 0)
	{
		echo $n, ' ';
	}

	// The codepoint is the first field of the line
	if (!$pos = strpos($line, ';'))
	{
		continue;
	}

	$hex_tested = $hex_expected = substr($line, 0, $pos);

	if (isset($tested_chars[$hex_tested]))
	{
		continue;
	}

	$utf_tested = $utf_expected = hex_to_utf($hex_tested);

	if ($utf_tested >= utf_normalizer::UTF8_SURROGATE_FIRST
	 && $utf_tested <= utf_normalizer::UTF8_SURROGATE_LAST)
	{
		/**
		* Surrogates are illegal on their own, we expect the normalizer
		* to return a replacement char
		*/
		$utf_expected = utf_normalizer::UTF8_REPLACEMENT;
		$hex_expected = utf_to_hexseq($utf_expected);
	}

	foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
	{
		// Always normalize the tested character itself, even when the
		// expected result differs from it (surrogates)
		$utf_result = $utf_tested;

		// The result is read back from $utf_result, so the normalizer is
		// assumed to work in place: the argument has to be handed over by
		// reference, which call_user_func() is not able to do
		call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

		if (strcmp($utf_expected, $utf_result))
		{
			$failed = true;
			$hex_result = utf_to_hexseq($utf_result);

			echo "\nFAILED $hex_expected == $form($hex_tested) ($hex_expected != $hex_result)";
		}
	}

	// Stop once the four forms have been reported for the failing character
	if ($failed)
	{
		die("\n\nFailed at line $n\n");
	}
}
fclose($fp);

die("\n\nALL TESTS PASSED SUCCESSFULLY\n");
/**
* Download a file to the develop/ dir
*
* Nothing is done if a file of the same name is already there. The data is
* written to a temporary ".part" file which only gets its final name once the
* whole transfer went through, so that an interrupted download can not be
* mistaken for a complete file the next time the script is run.
*
* The script is terminated if the file can not be read or stored.
*
* @param	string	$url		URL of the file to download
* @return	void
*/
function download($url)
{
	$filename = basename($url);
	$target = PHPBB_ROOT_PATH . 'develop/' . $filename;

	if (file_exists($target))
	{
		return;
	}

	echo 'Downloading from ', $url, ' ';

	if (!$fpr = fopen($url, 'rb'))
	{
		die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
	}

	$partial = $target . '.part';

	if (!$fpw = fopen($partial, 'wb'))
	{
		fclose($fpr);
		die("Can't open develop/" . $filename . " for output... please check your permissions or something");
	}

	$i = 0;
	$chunk = 32768;
	$done = '';
	$complete = true;

	while (!feof($fpr))
	{
		$data = fread($fpr, $chunk);

		if ($data === false)
		{
			// Read error, give up instead of looping on a broken stream
			$complete = false;
			break;
		}

		$written = fwrite($fpw, $data);

		if ($written === false || $written < strlen($data))
		{
			$complete = false;
			break;
		}

		$i += $written;

		// Wipe the previous counter with backspaces before printing the new one
		echo str_repeat("\x08", strlen($done));

		$done = ($i >> 10) . ' KiB';
		echo $done;
	}
	fclose($fpr);
	fclose($fpw);

	if (!$complete)
	{
		unlink($partial);
		die("\nCan't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
	}

	if (!rename($partial, $target))
	{
		unlink($partial);
		die("\nCan't store develop/" . $filename . "... please check your permissions or something");
	}

	echo "\n";
}
/**
* Convert a UTF-8 string to a sequence of codepoints in hexadecimal
*
* Each codepoint is written in upper case and padded with zeros to at least
* four digits. The string is split according to its lead bytes only, no
* validation is performed.
*
* @param	string	$str		UTF-8 string
* @return	string				Codepoints in hexadecimal, separated with a space
*/
function utf_to_hexseq($str)
{
	// Length in bytes of a sequence, indexed by the high nibble of its lead
	// byte. Any other lead byte is handled as a single byte
	$widths = array(0xC => 2, 0xD => 2, 0xE => 3, 0xF => 4);

	$codepoints = array();
	$size = strlen($str);

	for ($offset = 0; $offset < $size; $offset += $width)
	{
		$nibble = ord($str[$offset]) >> 4;
		$width = isset($widths[$nibble]) ? $widths[$nibble] : 1;

		$codepoints[] = sprintf('%04X', utf_to_cp(substr($str, $offset, $width)));
	}

	return implode(' ', $codepoints);
}
/**
* Convert a UTF-8 char to its codepoint
*
* The bytes are not validated: the payload bits are taken from each byte
* according to the length of the string only. The script is terminated if
* the string is not 1 to 4 bytes long.
*
* @param	string	$utf_char	UTF-8 char
* @return	integer				Unicode codepoint
*/
function utf_to_cp($utf_char)
{
	// Payload bits of the lead byte, indexed by the length of the sequence
	$lead_masks = array(1 => 0xFF, 2 => 0x1F, 3 => 0x0F, 4 => 0x07);

	$length = strlen($utf_char);

	if (!isset($lead_masks[$length]))
	{
		die('UTF-8 chars can only be 1-4 bytes long');
	}

	$codepoint = ord($utf_char[0]) & $lead_masks[$length];

	// Each of the following bytes contributes its 6 lower bits
	for ($offset = 1; $offset < $length; ++$offset)
	{
		$codepoint = ($codepoint << 6) | (ord($utf_char[$offset]) & 0x3F);
	}

	return $codepoint;
}
/**
* Build a UTF-8 string out of a sequence of codepoints in hexadecimal
*
* @param	string	$seq		Sequence of codepoints, separated with a space
* @return	string				UTF-8 string
*/
function hexseq_to_utf($seq)
{
	$utf = '';

	foreach (explode(' ', $seq) as $hex)
	{
		$utf .= hex_to_utf($hex);
	}

	return $utf;
}
/**
* Convert a codepoint given in hexadecimal to a UTF-8 char
*
* @param	string	$hex		Codepoint, in hexadecimal
* @return	string				UTF-8 char
*/
function hex_to_utf($hex)
{
	$codepoint = hexdec($hex);

	return cp_to_utf($codepoint);
}
/**
* Convert a codepoint to a UTF-8 char
*
* The length of the sequence depends on the magnitude of the codepoint only:
* 1 byte up to 0x7F, 2 bytes up to 0x7FF, 3 bytes up to 0xFFFF and 4 bytes
* above.
*
* @param	integer	$cp			Unicode codepoint
* @return	string				UTF-8 string
*/
function cp_to_utf($cp)
{
	if ($cp <= 0x7F)
	{
		return chr($cp);
	}

	// Every multibyte sequence ends with the 6 lowest bits of the codepoint
	$tail = chr(0x80 | ($cp & 0x3F));

	if ($cp <= 0x7FF)
	{
		return chr(0xC0 | ($cp >> 6)) . $tail;
	}

	$tail = chr(0x80 | (($cp >> 6) & 0x3F)) . $tail;

	if ($cp <= 0xFFFF)
	{
		return chr(0xE0 | ($cp >> 12)) . $tail;
	}

	return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . $tail;
}