hphp/test/test_ext_icu.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com)         |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15 */
  16
  17 #include "hphp/test/test_ext_icu.h"
  18 #include "hphp/runtime/ext/ext_icu.h"
  19 #include <iostream>
  20
  21 ///////////////////////////////////////////////////////////////////////////////
  22
  23 bool TestExtIcu::RunTests(const std::string &which) {
  24   bool ret = true;
  25
  26   RUN_TEST(test_icu_match);
  27   RUN_TEST(test_icu_transliterate);
  28   RUN_TEST(test_icu_tokenize);
  29
  30   return ret;
  31 }
  32
  33 ///////////////////////////////////////////////////////////////////////////////
  34
  35 bool TestExtIcu::test_icu_match() {
  36   // Test subject strings.
  37   String subject = String(
  38     "\u05d6\U00010905 PHP is a scripting language. \ufeb0\ufef3",
  39     CopyString);
  40   String subject_32 = String(
  41     "\U00010905\U00010905\U00010905\U00010905\U00010905\U00010905",
  42     CopyString);
  43   String subject_en = String("this is an english string", CopyString);
  44   // "this is a hebrew string"
  45   String subject_he = String(
  46     "\u05d6\u05d4 \u05d4\u05d5\u05d0 \u05de\u05d7\u05e8\u05d5\u05d6\u05ea "
  47     "\u05e2\u05d1\u05e8\u05d9\u05ea",
  48     CopyString);
  49   // "this is an arabic string"
  50   String subject_ar = String(
  51     "\ufee9\ufeab\ufe8d \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb "
  52     "\ufe8d\ufefa\ufee8\ufea0\ufee0\ufef3\ufeb0\ufef3",
  53     CopyString);
  54   // "this is a hebrew string"
  55   String subject_mixed = String(
  56     "this is a \u05e2\u05d1\u05e8\u05d9\u05ea string",
  57     CopyString);
  58
  59   // Test basic regex parsing functionality.
  60   VERIFY(f_icu_match("scripting", subject));
  61   VERIFY(!f_icu_match("php", subject));
  62   VERIFY(f_icu_match("(\\bPHP\\b)", subject));
  63   VERIFY(!f_icu_match("(\\bPHP\\b))", subject));
  64
  65   // Test returning matches functionality.
  66   Variant matches;
  67   VERIFY(f_icu_match("(PHP) is", subject, ref(matches)));
  68   VS(f_print_r(matches, true),
  69     "Array\n"
  70     "(\n"
  71     "    [0] => PHP is\n"
  72     "    [1] => PHP\n"
  73     ")\n");
  74   VERIFY(f_icu_match("is (a)", subject, ref(matches),
  75                      k_UREGEX_OFFSET_CAPTURE));
  76   VS(f_print_r(matches, true),
  77      "Array\n"
  78      "(\n"
  79      "    [0] => Array\n"
  80      "        (\n"
  81      "            [0] => is a\n"
  82      "            [1] => 7\n"
  83      "        )\n"
  84      "\n"
  85      "    [1] => Array\n"
  86      "        (\n"
  87      "            [0] => a\n"
  88      "            [1] => 10\n"
  89      "        )\n"
  90      "\n"
  91      ")\n");
  92   VERIFY(f_icu_match("\\. \ufeb0", subject, ref(matches),
  93                      k_UREGEX_OFFSET_CAPTURE));
  94   VS(f_print_r(matches, true),
  95     "Array\n"
  96     "(\n"
  97     "    [0] => Array\n"
  98     "        (\n"
  99     "            [0] => . \ufeb0\n"
 100     "            [1] => 30\n"
 101     "        )\n"
 102     "\n"
 103     ")\n");
 104   VERIFY(f_icu_match("\ufee9\ufeed (\ufe8e\ufee0\ufee8\ufebb)",
 105                      subject_ar, ref(matches), k_UREGEX_OFFSET_CAPTURE));
 106   VS(f_print_r(matches, true),
 107     "Array\n"
 108     "(\n"
 109     "    [0] => Array\n"
 110     "        (\n"
 111     "            [0] => \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb\n"
 112     "            [1] => 4\n"
 113     "        )\n"
 114     "\n"
 115     "    [1] => Array\n"
 116     "        (\n"
 117     "            [0] => \ufe8e\ufee0\ufee8\ufebb\n"
 118     "            [1] => 7\n"
 119     "        )\n"
 120     "\n"
 121     ")\n");
 122
 123   // Test match for 32-bit code points.
 124   VERIFY(f_icu_match(".*", subject_32, ref(matches)));
 125   VS(f_print_r(matches, true),
 126     "Array\n"
 127     "(\n"
 128     "    [0] => \U00010905\U00010905\U00010905\U00010905\U00010905\U00010905\n"
 129     ")\n");
 130
 131   // Test regex caching functionality.
 132   VERIFY(f_icu_match("(php)", subject, uninit_null(), k_UREGEX_CASE_INSENSITIVE));
 133   VERIFY(!f_icu_match("(php)", subject));
 134
 135   // Test ICU specific (ie bidi) functionality.
 136   String pattern_ltr = String("\\p{Bidi_Class=Left_To_Right}", CopyString);
 137   String pattern_rtl = String("\\p{Bidi_Class=Right_To_Left}", CopyString);
 138   String pattern_arl = String("\\p{Bidi_Class=Arabic_Letter}", CopyString);
 139
 140  VERIFY(f_icu_match(pattern_ltr, subject_en));
 141   VERIFY(!f_icu_match(pattern_rtl, subject_en));
 142
 143   VERIFY(!f_icu_match(pattern_ltr, subject_he));
 144   VERIFY(f_icu_match(pattern_rtl, subject_he));
 145   VERIFY(!f_icu_match(pattern_arl, subject_he));
 146
 147   VERIFY(!f_icu_match(pattern_ltr, subject_ar));
 148   VERIFY(!f_icu_match(pattern_rtl, subject_ar));
 149   VERIFY(f_icu_match(pattern_arl, subject_ar));
 150
 151   VERIFY(f_icu_match(pattern_ltr, subject_mixed));
 152   VERIFY(f_icu_match(pattern_rtl, subject_mixed));
 153
 154   return Count(true);
 155 }
 156
 157 // Test string lifted from tests/intl/utf8.h
 158 bool TestExtIcu::test_icu_transliterate() {
 159   String input_ru =
 160     String("\xd1\x84\xd0\xb5\xd0\xb9\xd1"
 161            "\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba",
 162            CopyString);
 163   String output_ru = f_icu_transliterate(input_ru, false);
 164   // Note: different than php test ('y' -> 'j')
 165   VERIFY(output_ru == "fejsbu\xc5\x93k");
 166
 167   // Verify that removing accents works.
 168   String input_de = String("Ich m\xc3\xb6"
 169                            "chte \xc3\xbc"
 170                            "berzeugend "
 171                             "oder \xc3\xa4hnliche sein",
 172                            CopyString);
 173   String output_de = f_icu_transliterate(input_de, true);
 174   VERIFY(output_de == "Ich mochte uberzeugend oder ahnliche sein");
 175
 176   // Verify that keeping accents works.
 177   VERIFY(f_icu_transliterate(input_de, false) == input_de.c_str());
 178
 179   // Check an non-Latin language.
 180   String input_zh = String("\xe5\x9b\x9b"
 181                            "\xe5\x8d\x81\xe5\x9b\x9b\xe7"
 182                            "\x9f\xb3\xe7\x8d\x85\xe5\xad\x90",
 183                            CopyString);
 184   String output_zh = f_icu_transliterate(input_zh, true);
 185   VERIFY(output_zh == "si shi si shi shi zi");
 186
 187   return Count(true);
 188 }
 189
 190
 191 bool TestExtIcu::test_icu_tokenize() {
 192
 193
 194   String input_eng = String("Hello World");
 195   Array output_eng = f_icu_tokenize(input_eng);
 196
 197   VS(f_print_r(output_eng, true),
 198      "Array\n"
 199      "(\n"
 200      "    [0] => _B_\n"
 201      "    [1] => hello\n"
 202      "    [2] => world\n"
 203      "    [3] => _E_\n"
 204      ")\n"
 205     );
 206   String input_long = String("Hello! You are visitor #1234 to "
 207                             "http://www.facebook.com! "
 208                             "<3 How are you today (6/14/2011),"
 209                             " hello@world.com?");
 210
 211   Array output_long = f_icu_tokenize(input_long);
 212
 213   VS(f_print_r(output_long, true),
 214      "Array\n"
 215      "(\n"
 216      "    [0] => _B_\n"
 217      "    [1] => hello\n"
 218      "    [2] => !\n"
 219      "    [3] => you\n"
 220      "    [4] => are\n"
 221      "    [5] => visitor\n"
 222      "    [6] => #\n"
 223      "    [7] => XXXX\n"
 224      "    [8] => to\n"
 225      "    [9] => TOKEN_URL\n"
 226      "    [10] => !\n"
 227      "    [11] => TOKEN_HEART\n"
 228      "    [12] => how\n"
 229      "    [13] => are\n"
 230      "    [14] => you\n"
 231      "    [15] => today\n"
 232      "    [16] => (\n"
 233      "    [17] => TOKEN_DATE\n"
 234      "    [18] => )\n"
 235      "    [19] => ,\n"
 236      "    [20] => TOKEN_EMAIL\n"
 237      "    [21] => ?\n"
 238      "    [22] => _E_\n"
 239      ")\n"
 240     );
 241
 242   String input_de = String("Ich mÃ¶chte Ã¼berzeugend oder Ã¤hnliche sein");
 243   Array output_de = f_icu_tokenize(input_de);
 244
 245   VS(f_print_r(output_de, true),
 246      "Array\n"
 247      "(\n"
 248      "    [0] => _B_\n"
 249      "    [1] => ich\n"
 250      "    [2] => mã\n"
 251      "    [3] => ¶\n"
 252      "    [4] => chte\n"
 253      "    [5] => ã\n"
 254      "    [6] => ¼\n"
 255      "    [7] => berzeugend\n"
 256      "    [8] => oder\n"
 257      "    [9] => ã\n"
 258      "    [10] => ¤\n"
 259      "    [11] => hnliche\n"
 260      "    [12] => sein\n"
 261      "    [13] => _E_\n"
 262      ")\n");
 263
 264
 265   return Count(true);
 266 }