phpBB/develop/utf_normalizer_test.php

   1 <?php
   2 /**
   3 *
   4 * @package phpBB3
   5 * @version $Id$
   6 * @copyright (c) 2005 phpBB Group
   7 * @license http://opensource.org/licenses/gpl-license.php GNU Public License
   8 *
   9 */
  10
  11 if (php_sapi_name() != 'cli')
  12 {
  13         die("This program must be run from the command line.\n");
  14 }
  15
  16 //
  17 // Security message:
  18 //
  19 // This script is potentially dangerous.
  20 // Remove or comment the next line (die(".... ) to enable this script.
  21 // Do NOT FORGET to either remove this script or disable it after you have used it.
  22 //
  23 die("Please read the first lines of this script for instructions on how to enable it");
  24
  25 set_time_limit(0);
  26 error_reporting(E_ALL);
  27
  28 define('IN_PHPBB', true);
  29 define('PHPBB_ROOT_PATH', './../');
  30 define('PHP_EXT', substr(strrchr(__FILE__, '.'), 1));
  31
  32
  33 /**
  34 * Let's download some files we need
  35 */
  36 download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt');
  37 download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
  38
  39 /**
  40 * Those are the tests we run
  41 */
  42 $test_suite = array(
  43         /**
  44         * NFC
  45         *   c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
  46         *   c4 ==  NFC(c4) ==  NFC(c5)
  47         */
  48         'NFC'   =>      array(
  49                 'c2'    =>      array('c1', 'c2', 'c3'),
  50                 'c4'    =>      array('c4', 'c5')
  51         ),
  52
  53         /**
  54         * NFD
  55         *   c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
  56         *   c5 ==  NFD(c4) ==  NFD(c5)
  57         */
  58         'NFD'   =>      array(
  59                 'c3'    =>      array('c1', 'c2', 'c3'),
  60                 'c5'    =>      array('c4', 'c5')
  61         ),
  62
  63         /**
  64         * NFKC
  65         *   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
  66         */
  67         'NFKC'  =>      array(
  68                 'c4'    =>      array('c1', 'c2', 'c3', 'c4', 'c5')
  69         ),
  70
  71         /**
  72         * NFKD
  73         *   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
  74         */
  75         'NFKD'  =>      array(
  76                 'c5'    =>      array('c1', 'c2', 'c3', 'c4', 'c5')
  77         )
  78 );
  79
  80 require_once(PHPBB_ROOT_PATH . 'includes/utf/utf_normalizer.' . PHP_EXT);
  81
  82 $i = $n = 0;
  83 $failed = false;
  84 $tested_chars = array();
  85
  86 $fp = fopen(PHPBB_ROOT_PATH . 'develop/NormalizationTest.txt', 'rb');
  87 while (!feof($fp))
  88 {
  89         $line = fgets($fp);
  90         ++$n;
  91
  92         if ($line[0] == '@')
  93         {
  94                 if ($i)
  95                 {
  96                         echo "done\n";
  97                 }
  98
  99                 $i = 0;
 100                 echo "\n", substr($line, 1), "\n\n";
 101                 continue;
 102         }
 103
 104         if (!strpos(' 0123456789ABCDEF', $line[0]))
 105         {
 106                 continue;
 107         }
 108
 109         if (++$i % 100 == 0)
 110         {
 111                 echo $i, ' ';
 112         }
 113
 114         list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);
 115
 116         if (!strpos($c1, ' '))
 117         {
 118                 /**
 119                 * We are currently testing a single character, we add it to the list of
 120                 * characters we have processed so that we can exclude it when testing
 121                 * for invariants
 122                 */
 123                 $tested_chars[$c1] = 1;
 124         }
 125
 126         foreach ($test_suite as $form => $serie)
 127         {
 128                 foreach ($serie as $expected => $tests)
 129                 {
 130                         $hex_expected = ${$expected};
 131                         $utf_expected = hexseq_to_utf($hex_expected);
 132
 133                         foreach ($tests as $test)
 134                         {
 135                                 $utf_result = $utf_expected;
 136                                 call_user_func(array('utf_normalizer', $form), $utf_result);
 137
 138                                 if (strcmp($utf_expected, $utf_result))
 139                                 {
 140                                         $failed = true;
 141                                         $hex_result = utf_to_hexseq($utf_result);
 142
 143                                         echo "\nFAILED $expected == $form($test) ($hex_expected != $hex_result)";
 144                                 }
 145                         }
 146                 }
 147
 148                 if ($failed)
 149                 {
 150                         die("\n\nFailed at line $n\n");
 151                 }
 152         }
 153 }
 154 fclose($fp);
 155
 156 /**
 157 * Test for invariants
 158 */
 159 echo "\n\nTesting for invariants...\n\n";
 160
 161 $fp = fopen(PHPBB_ROOT_PATH . 'develop/UnicodeData.txt', 'rt');
 162
 163 $n = 0;
 164 while (!feof($fp))
 165 {
 166         if (++$n % 100 == 0)
 167         {
 168                 echo $n, ' ';
 169         }
 170
 171         $line = fgets($fp, 1024);
 172
 173         if (!$pos = strpos($line, ';'))
 174         {
 175                 continue;
 176         }
 177
 178         $hex_tested = $hex_expected = substr($line, 0, $pos);
 179
 180         if (isset($tested_chars[$hex_tested]))
 181         {
 182                 continue;
 183         }
 184
 185         $utf_expected = hex_to_utf($hex_expected);
 186
 187         if ($utf_expected >= utf_normalizer::UTF8_SURROGATE_FIRST
 188          && $utf_expected <= utf_normalizer::UTF8_SURROGATE_LAST)
 189         {
 190                 /**
 191                 * Surrogates are illegal on their own, we expect the normalizer
 192                 * to return a replacement char
 193                 */
 194                 $utf_expected = utf_normalizer::UTF8_REPLACEMENT;
 195                 $hex_expected = utf_to_hexseq($utf_expected);
 196         }
 197
 198         foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
 199         {
 200                 $utf_result = $utf_expected;
 201                 call_user_func(array('utf_normalizer', $form), $utf_result);
 202                 $hex_result = utf_to_hexseq($utf_result);
 203 //              echo "$form($utf_expected) == $utf_result\n";
 204
 205                 if (strcmp($utf_expected, $utf_result))
 206                 {
 207                         $failed = 1;
 208
 209                         echo "\nFAILED $hex_expected == $form($hex_tested) ($hex_expected != $hex_result)";
 210                 }
 211         }
 212
 213         if ($failed)
 214         {
 215                 die("\n\nFailed at line $n\n");
 216         }
 217 }
 218 fclose($fp);
 219
 220 die("\n\nALL TESTS PASSED SUCCESSFULLY\n");
 221
 222 /**
 223 * Download a file to the develop/ dir
 224 *
 225 * @param        string  $url            URL of the file to download
 226 * @return       void
 227 */
 228 function download($url)
 229 {
 230         if (file_exists(PHPBB_ROOT_PATH . 'develop/' . basename($url)))
 231         {
 232                 return;
 233         }
 234
 235         echo 'Downloading from ', $url, ' ';
 236
 237         if (!$fpr = fopen($url, 'rb'))
 238         {
 239                 die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
 240         }
 241
 242         if (!$fpw = fopen(PHPBB_ROOT_PATH . 'develop/' . basename($url), 'wb'))
 243         {
 244                 die("Can't open develop/" . basename($url) . " for output... please check your permissions or something");
 245         }
 246
 247         $i = 0;
 248         $chunk = 32768;
 249         $done = '';
 250
 251         while (!feof($fpr))
 252         {
 253                 $i += fwrite($fpw, fread($fpr, $chunk));
 254                 echo str_repeat("\x08", strlen($done));
 255
 256                 $done = ($i >> 10) . ' KiB';
 257                 echo $done;
 258         }
 259         fclose($fpr);
 260         fclose($fpw);
 261
 262         echo "\n";
 263 }
 264
 265 /**
 266 * Convert a UTF string to a sequence of codepoints in hexadecimal
 267 *
 268 * @param        string  $utf    UTF string
 269 * @return       integer                 Unicode codepoints in hex
 270 */
 271 function utf_to_hexseq($str)
 272 {
 273         $pos = 0;
 274         $len = strlen($str);
 275         $ret = array();
 276
 277         while ($pos < $len)
 278         {
 279                 $c = $str[$pos];
 280                 switch ($c & "\xF0")
 281                 {
 282                         case "\xC0":
 283                         case "\xD0":
 284                                 $utf_char = substr($str, $pos, 2);
 285                                 $pos += 2;
 286                                 break;
 287
 288                         case "\xE0":
 289                                 $utf_char = substr($str, $pos, 3);
 290                                 $pos += 3;
 291                                 break;
 292
 293                         case "\xF0":
 294                                 $utf_char = substr($str, $pos, 4);
 295                                 $pos += 4;
 296                                 break;
 297
 298                         default:
 299                                 $utf_char = $c;
 300                                 ++$pos;
 301                 }
 302
 303                 $hex = dechex(utf_to_cp($utf_char));
 304
 305                 if (!isset($hex[3]))
 306                 {
 307                         $hex = substr('000' . $hex, -4);
 308                 }
 309
 310                 $ret[] = $hex;
 311         }
 312
 313         return strtr(implode(' ', $ret), 'abcdef', 'ABCDEF');
 314 }
 315
 316 /**
 317 * Convert a UTF-8 char to its codepoint
 318 *
 319 * @param        string  $utf_char       UTF-8 char
 320 * @return       integer                         Unicode codepoint
 321 */
 322 function utf_to_cp($utf_char)
 323 {
 324         switch (strlen($utf_char))
 325         {
 326                 case 1:
 327                         return ord($utf_char);
 328
 329                 case 2:
 330                         return ((ord($utf_char[0]) & 0x1F) << 6) | (ord($utf_char[1]) & 0x3F);
 331
 332                 case 3:
 333                         return ((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F);
 334
 335                 case 4:
 336                         return ((ord($utf_char[0]) & 0x07) << 18) | ((ord($utf_char[1]) & 0x3F) << 12) | ((ord($utf_char[2]) & 0x3F) << 6) | (ord($utf_char[3]) & 0x3F);
 337
 338                 default:
 339                         die('UTF-8 chars can only be 1-4 bytes long');
 340         }
 341 }
 342
 343 /**
 344 * Return a UTF string formed from a sequence of codepoints in hexadecimal
 345 *
 346 * @param        string  $seq            Sequence of codepoints, separated with a space
 347 * @return       string                          UTF-8 string
 348 */
 349 function hexseq_to_utf($seq)
 350 {
 351         return implode('', array_map('hex_to_utf', explode(' ', $seq)));
 352 }
 353
 354 /**
 355 * Convert a codepoint in hexadecimal to a UTF-8 char
 356 *
 357 * @param        string  $hex            Codepoint, in hexadecimal
 358 * @return       string                          UTF-8 char
 359 */
 360 function hex_to_utf($hex)
 361 {
 362         return cp_to_utf(hexdec($hex));
 363 }
 364
 365 /**
 366 * Convert a codepoint to a UTF-8 char
 367 *
 368 * @param        integer $cp                     Unicode codepoint
 369 * @return       string                          UTF-8 string
 370 */
 371 function cp_to_utf($cp)
 372 {
 373         if ($cp > 0xFFFF)
 374         {
 375                 return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
 376         }
 377         else if ($cp > 0x7FF)
 378         {
 379                 return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
 380         }
 381         else if ($cp > 0x7F)
 382         {
 383                 return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
 384         }
 385         else
 386         {
 387                 return chr($cp);
 388         }
 389 }