2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
17 #include "hphp/test/test_ext_icu.h"
18 #include "hphp/runtime/ext/ext_icu.h"
21 ///////////////////////////////////////////////////////////////////////////////
// Entry point for the ICU extension tests: hands each of the three test
// cases defined in this file (match, transliterate, tokenize) to RUN_TEST.
// NOTE(review): `which` is not referenced in the visible lines; presumably
// RUN_TEST consults it to select which tests run -- confirm against the macro.
23 bool TestExtIcu::RunTests(const std::string
&which
) {
26 RUN_TEST(test_icu_match
);
27 RUN_TEST(test_icu_transliterate
);
28 RUN_TEST(test_icu_tokenize
);
33 ///////////////////////////////////////////////////////////////////////////////
// Exercises f_icu_match: plain matching, capture output passed back through
// ref(matches) (with and without k_UREGEX_OFFSET_CAPTURE), a subject made of
// code points above U+FFFF, the same pattern text used with different flags,
// and Unicode Bidi_Class property patterns.
35 bool TestExtIcu::test_icu_match() {
36 // Test subject strings.
// General-purpose subject: non-ASCII code points (including U+10905, which
// lies above U+FFFF) on both sides of an ASCII sentence.
37 String subject
= String(
38 "\u05d6\U00010905 PHP is a scripting language. \ufeb0\ufef3",
// Six copies of U+10905, i.e. only code points that do not fit in 16 bits.
40 String subject_32
= String(
41 "\U00010905\U00010905\U00010905\U00010905\U00010905\U00010905",
43 String subject_en
= String("this is an english string", CopyString
);
44 // "this is a hebrew string"
45 String subject_he
= String(
46 "\u05d6\u05d4 \u05d4\u05d5\u05d0 \u05de\u05d7\u05e8\u05d5\u05d6\u05ea "
47 "\u05e2\u05d1\u05e8\u05d9\u05ea",
49 // "this is an arabic string"
50 String subject_ar
= String(
51 "\ufee9\ufeab\ufe8d \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb "
52 "\ufe8d\ufefa\ufee8\ufea0\ufee0\ufef3\ufeb0\ufef3",
54 // Mixed-script string: ASCII "this is a ... string" around the same code-point sequence that ends subject_he.
55 String subject_mixed
= String(
56 "this is a \u05e2\u05d1\u05e8\u05d9\u05ea string",
59 // Test basic regex parsing functionality.
60 VERIFY(f_icu_match("scripting", subject
));
// No flags are passed, and lowercase "php" is expected NOT to match the
// uppercase "PHP" in the subject.
61 VERIFY(!f_icu_match("php", subject
));
62 VERIFY(f_icu_match("(\\bPHP\\b)", subject
));
// Same pattern with one extra ')' (unbalanced group); the call is expected to
// yield a falsy result.
63 VERIFY(!f_icu_match("(\\bPHP\\b))", subject
));
65 // Test returning matches functionality.
// `matches` is handed over via ref() and its print_r rendering is then
// compared with VS.
67 VERIFY(f_icu_match("(PHP) is", subject
, ref(matches
)));
68 VS(f_print_r(matches
, true),
// Same kind of check, this time with the k_UREGEX_OFFSET_CAPTURE flag.
74 VERIFY(f_icu_match("is (a)", subject
, ref(matches
),
75 k_UREGEX_OFFSET_CAPTURE
));
76 VS(f_print_r(matches
, true),
// Offset capture for a match that runs from ASCII text into the trailing
// non-ASCII part of the subject.
92 VERIFY(f_icu_match("\\. \ufeb0", subject
, ref(matches
),
93 k_UREGEX_OFFSET_CAPTURE
));
94 VS(f_print_r(matches
, true),
// Offset capture where both the pattern and the subject are entirely
// non-ASCII (subject_ar); the capture group covers the second word matched.
104 VERIFY(f_icu_match("\ufee9\ufeed (\ufe8e\ufee0\ufee8\ufebb)",
105 subject_ar
, ref(matches
), k_UREGEX_OFFSET_CAPTURE
));
106 VS(f_print_r(matches
, true),
111 " [0] => \ufee9\ufeed \ufe8e\ufee0\ufee8\ufebb\n"
117 " [0] => \ufe8e\ufee0\ufee8\ufebb\n"
123 // Test match for 32-bit code points.
// ".*" is expected to return all six U+10905 code points as element [0].
124 VERIFY(f_icu_match(".*", subject_32
, ref(matches
)));
125 VS(f_print_r(matches
, true),
128 " [0] => \U00010905\U00010905\U00010905\U00010905\U00010905\U00010905\n"
131 // Test regex caching functionality.
// The same pattern text is used twice: with k_UREGEX_CASE_INSENSITIVE it must
// match, and without flags it must not.
// NOTE(review): presumably this guards against a cached compiled pattern
// being reused across different flags -- confirm against f_icu_match.
132 VERIFY(f_icu_match("(php)", subject
, uninit_null(), k_UREGEX_CASE_INSENSITIVE
));
133 VERIFY(!f_icu_match("(php)", subject
));
135 // Test ICU specific (i.e. bidi) functionality.
136 String pattern_ltr
= String("\\p{Bidi_Class=Left_To_Right}", CopyString
);
137 String pattern_rtl
= String("\\p{Bidi_Class=Right_To_Left}", CopyString
);
138 String pattern_arl
= String("\\p{Bidi_Class=Arabic_Letter}", CopyString
);
// English subject: Left_To_Right matches, Right_To_Left does not.
140 VERIFY(f_icu_match(pattern_ltr
, subject_en
));
141 VERIFY(!f_icu_match(pattern_rtl
, subject_en
));
// Hebrew subject: only Right_To_Left matches.
143 VERIFY(!f_icu_match(pattern_ltr
, subject_he
));
144 VERIFY(f_icu_match(pattern_rtl
, subject_he
));
145 VERIFY(!f_icu_match(pattern_arl
, subject_he
));
// Arabic subject: only Arabic_Letter matches; Right_To_Left is expected not to.
147 VERIFY(!f_icu_match(pattern_ltr
, subject_ar
));
148 VERIFY(!f_icu_match(pattern_rtl
, subject_ar
));
149 VERIFY(f_icu_match(pattern_arl
, subject_ar
));
// Mixed subject: both Left_To_Right and Right_To_Left match.
151 VERIFY(f_icu_match(pattern_ltr
, subject_mixed
));
152 VERIFY(f_icu_match(pattern_rtl
, subject_mixed
));
// Exercises f_icu_transliterate on UTF-8 input written as byte escapes: the
// input_ru sample, a German sentence with and without accent removal, and the
// input_zh sample.
// NOTE(review): judging by the comments and expected strings below, the
// second argument is true where accents are to be removed and false where
// they are kept -- confirm against the f_icu_transliterate declaration.
157 // Test string lifted from tests/intl/utf8.h
158 bool TestExtIcu::test_icu_transliterate() {
160 String("\xd1\x84\xd0\xb5\xd0\xb9\xd1"
161 "\x81\xd0\xb1\xd1\x83\xc5\x93\xd0\xba",
163 String output_ru
= f_icu_transliterate(input_ru
, false);
164 // Note: different from the php test ('y' -> 'j')
// The \xc5\x93 bytes of the input are expected to come through unchanged.
165 VERIFY(output_ru
== "fejsbu\xc5\x93k");
167 // Verify that removing accents works.
168 String input_de
= String("Ich m\xc3\xb6"
171 "oder \xc3\xa4hnliche sein",
173 String output_de
= f_icu_transliterate(input_de
, true);
174 VERIFY(output_de
== "Ich mochte uberzeugend oder ahnliche sein");
176 // Verify that keeping accents works.
// With false, the result is expected to compare equal to the input itself.
177 VERIFY(f_icu_transliterate(input_de
, false) == input_de
.c_str());
179 // Check a non-Latin language.
180 String input_zh
= String("\xe5\x9b\x9b"
181 "\xe5\x8d\x81\xe5\x9b\x9b\xe7"
182 "\x9f\xb3\xe7\x8d\x85\xe5\xad\x90",
184 String output_zh
= f_icu_transliterate(input_zh
, true);
// Expected result is plain ASCII: space-separated syllables.
185 VERIFY(output_zh
== "si shi si shi shi zi");
// Exercises f_icu_tokenize: each input String is tokenized into an Array
// whose print_r rendering is compared with VS against an expected dump.
191 bool TestExtIcu::test_icu_tokenize() {
// Simple two-word English input.
194 String input_eng
= String("Hello World");
195 Array output_eng
= f_icu_tokenize(input_eng
);
197 VS(f_print_r(output_eng
, true),
// Longer input containing a number, a URL, "<3", a date and an e-mail
// address. The expected dump contains the placeholders TOKEN_URL,
// TOKEN_HEART, TOKEN_DATE and TOKEN_EMAIL.
// NOTE(review): presumably each placeholder replaces the corresponding piece
// of the input -- confirm against the full expected output.
206 String input_long
= String("Hello! You are visitor #1234 to "
207 "http://www.facebook.com! "
208 "<3 How are you today (6/14/2011),"
209 " hello@world.com?");
211 Array output_long
= f_icu_tokenize(input_long
);
213 VS(f_print_r(output_long
, true),
225 " [9] => TOKEN_URL\n"
227 " [11] => TOKEN_HEART\n"
233 " [17] => TOKEN_DATE\n"
236 " [20] => TOKEN_EMAIL\n"
// German input with non-ASCII letters written directly in the literal.
242 String input_de
= String("Ich möchte überzeugend oder ähnliche sein");
243 Array output_de
= f_icu_tokenize(input_de
);
245 VS(f_print_r(output_de
, true),
// NOTE(review): the visible expected entry reads "berzeugend", without the
// leading non-ASCII letter of the input word -- check it against the complete
// expected string before relying on it.
255 " [7] => berzeugend\n"