make #includes consistent
[hiphop-php.git] / hphp / test / test_ext_icu.cpp
blob a3b8dc6813bb8043c7b15be903c43b46eadacbcb
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
17 #include "hphp/test/test_ext_icu.h"
18 #include "hphp/runtime/ext/ext_icu.h"
19 #include <iostream>
21 ///////////////////////////////////////////////////////////////////////////////
// Entry point for this suite: hands each ICU extension test to RUN_TEST in
// turn (match, transliterate, tokenize) and returns `ret`, which starts out
// true.
// NOTE(review): RUN_TEST is defined outside this listing; presumably it
// filters on `which` and clears `ret` when a test fails -- confirm against
// the macro definition.
23 bool TestExtIcu::RunTests(const std::string &which) {
24 bool ret = true;
26 RUN_TEST(test_icu_match);
27 RUN_TEST(test_icu_transliterate);
28 RUN_TEST(test_icu_tokenize);
30 return ret;
33 ///////////////////////////////////////////////////////////////////////////////
// Exercises f_icu_match against a set of fixed subjects:
//   - plain pattern matching, including a case-sensitive miss and a pattern
//     with an unbalanced parenthesis;
//   - filling `matches`, both as plain strings and, with
//     k_UREGEX_OFFSET_CAPTURE, as (text, offset) pairs;
//   - a subject made only of code points written with \U escapes;
//   - the same pattern text used with and then without
//     k_UREGEX_CASE_INSENSITIVE;
//   - \p{Bidi_Class=...} property patterns on English, Hebrew, Arabic and
//     mixed subjects.
// Each expectation is checked through VERIFY / VS; the function returns
// Count(true).
35 bool TestExtIcu::test_icu_match() {
36 // Test subject strings.
// Starts with a Hebrew letter and a \U-escaped code point, and ends with two
// Arabic presentation forms, so ASCII text sits between non-ASCII characters.
37 String subject = String(
38 "\u05d6\U00010905 PHP is a scripting language. \ufeb0\ufef3",
39 CopyString);
// Six copies of one code point above U+FFFF.
40 String subject_32 = String(
41 "\U00010905\U00010905\U00010905\U00010905\U00010905\U00010905",
42 CopyString);
43 String subject_en = String("this is an english string", CopyString);
44 // "this is a hebrew string"
45 String subject_he = String(
46 "\u05d6\u05d4 \u05d4\u05d5\u05d0 \u05de\u05d7\u05e8\u05d5\u05d6\u05ea "
47 "\u05e2\u05d1\u05e8\u05d9\u05ea",
48 CopyString);
49 // "this is an arabic string"
50 String subject_ar = String(
51 "\ufee9\ufeab\ufe8d \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb "
52 "\ufe8d\ufefa\ufee8\ufea0\ufee0\ufef3\ufeb0\ufef3",
53 CopyString);
54 // "this is a hebrew string": English text with one embedded Hebrew word
//    (the same code points that end subject_he).
55 String subject_mixed = String(
56 "this is a \u05e2\u05d1\u05e8\u05d9\u05ea string",
57 CopyString);
59 // Test basic regex parsing functionality.
60 VERIFY(f_icu_match("scripting", subject));
// Lower-case "php" must not match the upper-case "PHP" in the subject.
61 VERIFY(!f_icu_match("php", subject));
62 VERIFY(f_icu_match("(\\bPHP\\b)", subject));
// Same pattern with one extra ')' -- the call is expected to report no match.
63 VERIFY(!f_icu_match("(\\bPHP\\b))", subject));
65 // Test returning matches functionality.
66 Variant matches;
// Without flags, `matches` is expected to hold the whole match followed by
// each capture group as plain strings.
67 VERIFY(f_icu_match("(PHP) is", subject, ref(matches)));
68 VS(f_print_r(matches, true),
69 "Array\n"
70 "(\n"
71 " [0] => PHP is\n"
72 " [1] => PHP\n"
73 ")\n");
// With k_UREGEX_OFFSET_CAPTURE each entry is expected to be a
// (matched text, offset) pair. The expected offsets are counted in code
// points: "is a" is preceded by seven code points in `subject`, the first
// two of which are non-ASCII.
74 VERIFY(f_icu_match("is (a)", subject, ref(matches),
75 k_UREGEX_OFFSET_CAPTURE));
76 VS(f_print_r(matches, true),
77 "Array\n"
78 "(\n"
79 " [0] => Array\n"
80 " (\n"
81 " [0] => is a\n"
82 " [1] => 7\n"
83 " )\n"
84 "\n"
85 " [1] => Array\n"
86 " (\n"
87 " [0] => a\n"
88 " [1] => 10\n"
89 " )\n"
90 "\n"
91 ")\n");
// Pattern that itself contains a non-ASCII character; no capture group, so
// only entry [0] is expected.
92 VERIFY(f_icu_match("\\. \ufeb0", subject, ref(matches),
93 k_UREGEX_OFFSET_CAPTURE));
94 VS(f_print_r(matches, true),
95 "Array\n"
96 "(\n"
97 " [0] => Array\n"
98 " (\n"
99 " [0] => . \ufeb0\n"
100 " [1] => 30\n"
101 " )\n"
102 "\n"
103 ")\n");
// Entirely non-ASCII pattern and subject: the second word of subject_ar
// starts at code point 4 and the captured third word at code point 7.
104 VERIFY(f_icu_match("\ufee9\ufeed (\ufe8e\ufee0\ufee8\ufebb)",
105 subject_ar, ref(matches), k_UREGEX_OFFSET_CAPTURE));
106 VS(f_print_r(matches, true),
107 "Array\n"
108 "(\n"
109 " [0] => Array\n"
110 " (\n"
111 " [0] => \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb\n"
112 " [1] => 4\n"
113 " )\n"
114 "\n"
115 " [1] => Array\n"
116 " (\n"
117 " [0] => \ufe8e\ufee0\ufee8\ufebb\n"
118 " [1] => 7\n"
119 " )\n"
120 "\n"
121 ")\n");
123 // Test match for 32-bit code points.
// ".*" is expected to return the whole six-character subject unchanged.
124 VERIFY(f_icu_match(".*", subject_32, ref(matches)));
125 VS(f_print_r(matches, true),
126 "Array\n"
127 "(\n"
128 " [0] => \U00010905\U00010905\U00010905\U00010905\U00010905\U00010905\n"
129 ")\n");
131 // Test regex caching functionality.
// The same pattern text is used twice: first case-insensitively (must match
// "PHP"), then with default flags (must not). Order matters here.
// NOTE(review): presumably this guards against a cached compiled pattern
// being reused without regard to its flags -- confirm against f_icu_match.
132 VERIFY(f_icu_match("(php)", subject, uninit_null(), k_UREGEX_CASE_INSENSITIVE));
133 VERIFY(!f_icu_match("(php)", subject));
135 // Test ICU specific (ie bidi) functionality.
136 String pattern_ltr = String("\\p{Bidi_Class=Left_To_Right}", CopyString);
137 String pattern_rtl = String("\\p{Bidi_Class=Right_To_Left}", CopyString);
138 String pattern_arl = String("\\p{Bidi_Class=Arabic_Letter}", CopyString);
// English: left-to-right characters only.
140 VERIFY(f_icu_match(pattern_ltr, subject_en));
141 VERIFY(!f_icu_match(pattern_rtl, subject_en));
// Hebrew: expected to match Right_To_Left but neither of the other classes.
143 VERIFY(!f_icu_match(pattern_ltr, subject_he));
144 VERIFY(f_icu_match(pattern_rtl, subject_he));
145 VERIFY(!f_icu_match(pattern_arl, subject_he));
// Arabic: expected to match Arabic_Letter only, not Right_To_Left.
147 VERIFY(!f_icu_match(pattern_ltr, subject_ar));
148 VERIFY(!f_icu_match(pattern_rtl, subject_ar));
149 VERIFY(f_icu_match(pattern_arl, subject_ar));
// Mixed English/Hebrew: both directional classes are expected to match.
151 VERIFY(f_icu_match(pattern_ltr, subject_mixed));
152 VERIFY(f_icu_match(pattern_rtl, subject_mixed));
154 return Count(true);
157 // Test string lifted from tests/intl/utf8.h
// Exercises f_icu_transliterate on byte-escaped UTF-8 input in three scripts.
// The boolean second argument is exercised as "remove accents": with true the
// German umlauts are expected to be stripped, with false the German input is
// expected to come back unchanged. Returns Count(true).
158 bool TestExtIcu::test_icu_transliterate() {
// Mostly Cyrillic input with one embedded "\xc5\x93" sequence.
159 String input_ru =
160 String("\xd1\x84\xd0\xb5\xd0\xb9\xd1"
161 "\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba",
162 CopyString);
163 String output_ru = f_icu_transliterate(input_ru, false);
164 // Note: different than php test ('y' -> 'j')
// With accents kept, the "\xc5\x93" sequence is expected to pass through
// untouched while the Cyrillic letters become Latin ones.
165 VERIFY(output_ru == "fejsbu\xc5\x93k");
167 // Verify that removing accents works.
// The literal is split around each escape so that the letter following a
// \x.. sequence is not read as part of the hex escape.
168 String input_de = String("Ich m\xc3\xb6"
169 "chte \xc3\xbc"
170 "berzeugend "
171 "oder \xc3\xa4hnliche sein",
172 CopyString);
173 String output_de = f_icu_transliterate(input_de, true);
174 VERIFY(output_de == "Ich mochte uberzeugend oder ahnliche sein");
176 // Verify that keeping accents works.
177 VERIFY(f_icu_transliterate(input_de, false) == input_de.c_str());
179 // Check a non-Latin language.
// With accent removal on, the expected result is plain ASCII with the
// syllables separated by single spaces.
180 String input_zh = String("\xe5\x9b\x9b"
181 "\xe5\x8d\x81\xe5\x9b\x9b\xe7"
182 "\x9f\xb3\xe7\x8d\x85\xe5\xad\x90",
183 CopyString);
184 String output_zh = f_icu_transliterate(input_zh, true);
185 VERIFY(output_zh == "si shi si shi shi zi");
187 return Count(true);
// Exercises f_icu_tokenize on three inputs and compares the print_r dump of
// each result with a fixed expectation. In every expectation the token list
// starts with "_B_", ends with "_E_", and the words are lower-cased.
// Returns Count(true).
191 bool TestExtIcu::test_icu_tokenize() {
// Simplest case: two words become two lower-cased tokens between the markers.
194 String input_eng = String("Hello World");
195 Array output_eng = f_icu_tokenize(input_eng);
197 VS(f_print_r(output_eng, true),
198 "Array\n"
199 "(\n"
200 " [0] => _B_\n"
201 " [1] => hello\n"
202 " [2] => world\n"
203 " [3] => _E_\n"
204 ")\n"
// Sentence mixing punctuation, a number, a URL, "<3", a date and an e-mail
// address. The expectation below has punctuation as separate tokens, "1234"
// replaced by "XXXX", and the URL, "<3", date and address replaced by
// TOKEN_URL, TOKEN_HEART, TOKEN_DATE and TOKEN_EMAIL respectively.
206 String input_long = String("Hello! You are visitor #1234 to "
207 "http://www.facebook.com! "
208 "<3 How are you today (6/14/2011),"
209 " hello@world.com?");
211 Array output_long = f_icu_tokenize(input_long);
213 VS(f_print_r(output_long, true),
214 "Array\n"
215 "(\n"
216 " [0] => _B_\n"
217 " [1] => hello\n"
218 " [2] => !\n"
219 " [3] => you\n"
220 " [4] => are\n"
221 " [5] => visitor\n"
222 " [6] => #\n"
223 " [7] => XXXX\n"
224 " [8] => to\n"
225 " [9] => TOKEN_URL\n"
226 " [10] => !\n"
227 " [11] => TOKEN_HEART\n"
228 " [12] => how\n"
229 " [13] => are\n"
230 " [14] => you\n"
231 " [15] => today\n"
232 " [16] => (\n"
233 " [17] => TOKEN_DATE\n"
234 " [18] => )\n"
235 " [19] => ,\n"
236 " [20] => TOKEN_EMAIL\n"
237 " [21] => ?\n"
238 " [22] => _E_\n"
239 ")\n"
// German input with three accented letters.
// NOTE(review): the expectation below shows each accented letter as two
// separate tokens, with the word split around them. That looks like the
// letter's two UTF-8 bytes being handled individually -- confirm this is the
// intended tokenizer behaviour and not an encoding artifact of the fixture.
242 String input_de = String("Ich möchte überzeugend oder ähnliche sein");
243 Array output_de = f_icu_tokenize(input_de);
245 VS(f_print_r(output_de, true),
246 "Array\n"
247 "(\n"
248 " [0] => _B_\n"
249 " [1] => ich\n"
250 " [2] => mã\n"
251 " [3] => ¶\n"
252 " [4] => chte\n"
253 " [5] => ã\n"
254 " [6] => ¼\n"
255 " [7] => berzeugend\n"
256 " [8] => oder\n"
257 " [9] => ã\n"
258 " [10] => ¤\n"
259 " [11] => hnliche\n"
260 " [12] => sein\n"
261 " [13] => _E_\n"
262 ")\n");
265 return Count(true);