Rename runtime/base/zend_* to zend-
[hiphop-php.git] / hphp / runtime / ext / ext_mb.cpp
blobce394f4b250f8324803e8330873538f9e25c94c6
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1997-2010 The PHP Group |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 3.01 of the PHP license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.php.net/license/3_01.txt |
12 | If you did not receive a copy of the PHP license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@php.net so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #include "hphp/runtime/ext/ext_mb.h"
19 #include "hphp/runtime/base/string_buffer.h"
20 #include "hphp/runtime/base/request_local.h"
21 #include "hphp/runtime/ext/php_unicode.h"
22 #include "hphp/runtime/ext/unicode_data.h"
23 #include "hphp/runtime/ext/ext_process.h"
24 #include "hphp/runtime/base/zend-url.h"
25 #include "hphp/runtime/base/zend-string.h"
26 #include "hphp/runtime/base/ini_setting.h"
28 extern "C" {
29 #include "mbfl/mbfl_convert.h"
30 #include "mbfl/mbfilter.h"
31 #include <oniguruma.h>
34 #define php_mb_re_pattern_buffer re_pattern_buffer
35 #define php_mb_regex_t regex_t
36 #define php_mb_re_registers re_registers
38 extern void mbfl_memory_device_unput(mbfl_memory_device *device);
40 #define PARSE_POST 0
41 #define PARSE_GET 1
42 #define PARSE_COOKIE 2
43 #define PARSE_STRING 3
44 #define PARSE_ENV 4
45 #define PARSE_SERVER 5
46 #define PARSE_SESSION 6
48 namespace HPHP {
50 static class mbstringExtension : public Extension {
51 public:
52 mbstringExtension() : Extension("mbstring") {}
54 virtual void moduleInit() {
55 IniSetting::SetGlobalDefault("mbstring.http_input", "pass");
56 IniSetting::SetGlobalDefault("mbstring.http_output", "pass");
59 } s_mbstring_extension;
61 ///////////////////////////////////////////////////////////////////////////////
62 // statics
64 #define PHP_MBSTR_STACK_BLOCK_SIZE 32
66 typedef struct _php_mb_nls_ident_list {
67 mbfl_no_language lang;
68 mbfl_no_encoding* list;
69 int list_size;
70 } php_mb_nls_ident_list;
72 static mbfl_no_encoding php_mb_default_identify_list_ja[] = {
73 mbfl_no_encoding_ascii,
74 mbfl_no_encoding_jis,
75 mbfl_no_encoding_utf8,
76 mbfl_no_encoding_euc_jp,
77 mbfl_no_encoding_sjis
80 static mbfl_no_encoding php_mb_default_identify_list_cn[] = {
81 mbfl_no_encoding_ascii,
82 mbfl_no_encoding_utf8,
83 mbfl_no_encoding_euc_cn,
84 mbfl_no_encoding_cp936
87 static mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
88 mbfl_no_encoding_ascii,
89 mbfl_no_encoding_utf8,
90 mbfl_no_encoding_euc_tw,
91 mbfl_no_encoding_big5
94 static mbfl_no_encoding php_mb_default_identify_list_kr[] = {
95 mbfl_no_encoding_ascii,
96 mbfl_no_encoding_utf8,
97 mbfl_no_encoding_euc_kr,
98 mbfl_no_encoding_uhc
101 static mbfl_no_encoding php_mb_default_identify_list_ru[] = {
102 mbfl_no_encoding_ascii,
103 mbfl_no_encoding_utf8,
104 mbfl_no_encoding_koi8r,
105 mbfl_no_encoding_cp1251,
106 mbfl_no_encoding_cp866
109 static mbfl_no_encoding php_mb_default_identify_list_hy[] = {
110 mbfl_no_encoding_ascii,
111 mbfl_no_encoding_utf8,
112 mbfl_no_encoding_armscii8
115 static mbfl_no_encoding php_mb_default_identify_list_tr[] = {
116 mbfl_no_encoding_ascii,
117 mbfl_no_encoding_utf8,
118 mbfl_no_encoding_8859_9
121 static mbfl_no_encoding php_mb_default_identify_list_neut[] = {
122 mbfl_no_encoding_ascii,
123 mbfl_no_encoding_utf8
126 static php_mb_nls_ident_list php_mb_default_identify_list[] = {
127 { mbfl_no_language_japanese, php_mb_default_identify_list_ja,
128 sizeof(php_mb_default_identify_list_ja) /
129 sizeof(php_mb_default_identify_list_ja[0]) },
130 { mbfl_no_language_korean, php_mb_default_identify_list_kr,
131 sizeof(php_mb_default_identify_list_kr) /
132 sizeof(php_mb_default_identify_list_kr[0]) },
133 { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk,
134 sizeof(php_mb_default_identify_list_tw_hk) /
135 sizeof(php_mb_default_identify_list_tw_hk[0]) },
136 { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn,
137 sizeof(php_mb_default_identify_list_cn) /
138 sizeof(php_mb_default_identify_list_cn[0]) },
139 { mbfl_no_language_russian, php_mb_default_identify_list_ru,
140 sizeof(php_mb_default_identify_list_ru) /
141 sizeof(php_mb_default_identify_list_ru[0]) },
142 { mbfl_no_language_armenian, php_mb_default_identify_list_hy,
143 sizeof(php_mb_default_identify_list_hy) /
144 sizeof(php_mb_default_identify_list_hy[0]) },
145 { mbfl_no_language_turkish, php_mb_default_identify_list_tr,
146 sizeof(php_mb_default_identify_list_tr) /
147 sizeof(php_mb_default_identify_list_tr[0]) },
148 { mbfl_no_language_neutral, php_mb_default_identify_list_neut,
149 sizeof(php_mb_default_identify_list_neut) /
150 sizeof(php_mb_default_identify_list_neut[0]) }
153 ///////////////////////////////////////////////////////////////////////////////
154 // globals
155 typedef std::map<std::string, php_mb_regex_t *> RegexCache;
157 class MBGlobals : public RequestEventHandler {
158 public:
159 mbfl_no_language language;
160 mbfl_no_language current_language;
161 mbfl_no_encoding internal_encoding;
162 mbfl_no_encoding current_internal_encoding;
163 mbfl_no_encoding http_output_encoding;
164 mbfl_no_encoding current_http_output_encoding;
165 mbfl_no_encoding http_input_identify;
166 mbfl_no_encoding http_input_identify_get;
167 mbfl_no_encoding http_input_identify_post;
168 mbfl_no_encoding http_input_identify_cookie;
169 mbfl_no_encoding http_input_identify_string;
170 mbfl_no_encoding *http_input_list;
171 int http_input_list_size;
172 mbfl_no_encoding *detect_order_list;
173 int detect_order_list_size;
174 mbfl_no_encoding *current_detect_order_list;
175 int current_detect_order_list_size;
176 mbfl_no_encoding *default_detect_order_list;
177 int default_detect_order_list_size;
178 int filter_illegal_mode;
179 int filter_illegal_substchar;
180 int current_filter_illegal_mode;
181 int current_filter_illegal_substchar;
182 bool encoding_translation;
183 long strict_detection;
184 long illegalchars;
185 mbfl_buffer_converter *outconv;
187 OnigEncoding default_mbctype;
188 OnigEncoding current_mbctype;
189 RegexCache ht_rc;
190 std::string search_str;
191 unsigned int search_pos;
192 php_mb_regex_t *search_re;
193 OnigRegion *search_regs;
194 OnigOptionType regex_default_options;
195 OnigSyntaxType *regex_default_syntax;
197 MBGlobals() :
198 language(mbfl_no_language_uni),
199 current_language(mbfl_no_language_uni),
200 internal_encoding(mbfl_no_encoding_utf8),
201 current_internal_encoding(mbfl_no_encoding_utf8),
202 http_output_encoding(mbfl_no_encoding_pass),
203 current_http_output_encoding(mbfl_no_encoding_pass),
204 http_input_identify(mbfl_no_encoding_invalid),
205 http_input_identify_get(mbfl_no_encoding_invalid),
206 http_input_identify_post(mbfl_no_encoding_invalid),
207 http_input_identify_cookie(mbfl_no_encoding_invalid),
208 http_input_identify_string(mbfl_no_encoding_invalid),
209 http_input_list(NULL),
210 http_input_list_size(0),
211 detect_order_list(NULL),
212 detect_order_list_size(0),
213 current_detect_order_list(NULL),
214 current_detect_order_list_size(0),
215 default_detect_order_list
216 ((mbfl_no_encoding *)php_mb_default_identify_list_neut),
217 default_detect_order_list_size
218 (sizeof(php_mb_default_identify_list_neut) /
219 sizeof(php_mb_default_identify_list_neut[0])),
220 filter_illegal_mode(MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR),
221 filter_illegal_substchar(0x3f), /* '?' */
222 current_filter_illegal_mode(MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR),
223 current_filter_illegal_substchar(0x3f), /* '?' */
224 encoding_translation(0),
225 strict_detection(0),
226 illegalchars(0),
227 outconv(NULL),
228 default_mbctype(ONIG_ENCODING_EUC_JP),
229 current_mbctype(ONIG_ENCODING_EUC_JP),
230 search_pos(0),
231 search_re((php_mb_regex_t*)NULL),
232 search_regs((OnigRegion*)NULL),
233 regex_default_options(ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE),
234 regex_default_syntax(ONIG_SYNTAX_RUBY) {
237 virtual void requestInit() {
238 current_language = language;
239 current_internal_encoding = internal_encoding;
240 current_http_output_encoding = http_output_encoding;
241 current_filter_illegal_mode = filter_illegal_mode;
242 current_filter_illegal_substchar = filter_illegal_substchar;
243 if (!encoding_translation) {
244 illegalchars = 0;
247 mbfl_no_encoding *list=NULL, *entry;
248 int n = 0;
249 if (detect_order_list) {
250 list = detect_order_list;
251 n = detect_order_list_size;
253 if (n <= 0) {
254 list = default_detect_order_list;
255 n = default_detect_order_list_size;
257 entry = (mbfl_no_encoding *)malloc(n * sizeof(int));
258 current_detect_order_list = entry;
259 current_detect_order_list_size = n;
260 while (n > 0) {
261 *entry++ = *list++;
262 n--;
266 virtual void requestShutdown() {
267 if (current_detect_order_list != NULL) {
268 free(current_detect_order_list);
269 current_detect_order_list = NULL;
270 current_detect_order_list_size = 0;
272 if (outconv != NULL) {
273 illegalchars += mbfl_buffer_illegalchars(outconv);
274 mbfl_buffer_converter_delete(outconv);
275 outconv = NULL;
278 /* clear http input identification. */
279 http_input_identify = mbfl_no_encoding_invalid;
280 http_input_identify_post = mbfl_no_encoding_invalid;
281 http_input_identify_get = mbfl_no_encoding_invalid;
282 http_input_identify_cookie = mbfl_no_encoding_invalid;
283 http_input_identify_string = mbfl_no_encoding_invalid;
285 current_mbctype = default_mbctype;
287 search_str.clear();
288 search_pos = 0;
290 if (search_regs != NULL) {
291 onig_region_free(search_regs, 1);
292 search_regs = (OnigRegion *)NULL;
294 for (RegexCache::const_iterator it = ht_rc.begin(); it != ht_rc.end();
295 ++it) {
296 onig_free(it->second);
298 ht_rc.clear();
301 IMPLEMENT_STATIC_REQUEST_LOCAL(MBGlobals, s_mb_globals);
302 #define MBSTRG(name) s_mb_globals->name
304 ///////////////////////////////////////////////////////////////////////////////
305 // unicode functions
/*
 * Lookup table of single-bit masks: masks32[i] has only bit i set (i = 0..31).
 */
static unsigned long masks32[32] = {
  1UL <<  0, 1UL <<  1, 1UL <<  2, 1UL <<  3,
  1UL <<  4, 1UL <<  5, 1UL <<  6, 1UL <<  7,
  1UL <<  8, 1UL <<  9, 1UL << 10, 1UL << 11,
  1UL << 12, 1UL << 13, 1UL << 14, 1UL << 15,
  1UL << 16, 1UL << 17, 1UL << 18, 1UL << 19,
  1UL << 20, 1UL << 21, 1UL << 22, 1UL << 23,
  1UL << 24, 1UL << 25, 1UL << 26, 1UL << 27,
  1UL << 28, 1UL << 29, 1UL << 30, 1UL << 31
};
319 static int prop_lookup(unsigned long code, unsigned long n) {
320 long l, r, m;
323 * There is an extra node on the end of the offsets to allow this routine
324 * to work right. If the index is 0xffff, then there are no nodes for the
325 * property.
327 if ((l = _ucprop_offsets[n]) == 0xffff)
328 return 0;
331 * Locate the next offset that is not 0xffff. The sentinel at the end of
332 * the array is the max index value.
334 for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
337 r = _ucprop_offsets[n + m] - 1;
339 while (l <= r) {
341 * Determine a "mid" point and adjust to make sure the mid point is at
342 * the beginning of a range pair.
344 m = (l + r) >> 1;
345 m -= (m & 1);
346 if (code > _ucprop_ranges[m + 1])
347 l = m + 2;
348 else if (code < _ucprop_ranges[m])
349 r = m - 2;
350 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
351 return 1;
353 return 0;
357 static int php_unicode_is_prop(unsigned long code, unsigned long mask1,
358 unsigned long mask2) {
359 unsigned long i;
361 if (mask1 == 0 && mask2 == 0)
362 return 0;
364 for (i = 0; mask1 && i < 32; i++) {
365 if ((mask1 & masks32[i]) && prop_lookup(code, i))
366 return 1;
369 for (i = 32; mask2 && i < _ucprop_size; i++) {
370 if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
371 return 1;
374 return 0;
377 static unsigned long case_lookup(unsigned long code, long l, long r,
378 int field) {
379 long m;
382 * Do the binary search.
384 while (l <= r) {
386 * Determine a "mid" point and adjust to make sure the mid point is at
387 * the beginning of a case mapping triple.
389 m = (l + r) >> 1;
390 m -= (m % 3);
391 if (code > _uccase_map[m])
392 l = m + 3;
393 else if (code < _uccase_map[m])
394 r = m - 3;
395 else if (code == _uccase_map[m])
396 return _uccase_map[m + field];
399 return code;
402 static unsigned long php_turkish_toupper(unsigned long code, long l, long r,
403 int field) {
404 if (code == 0x0069L) {
405 return 0x0130L;
407 return case_lookup(code, l, r, field);
410 static unsigned long php_turkish_tolower(unsigned long code, long l, long r,
411 int field) {
412 if (code == 0x0049L) {
413 return 0x0131L;
415 return case_lookup(code, l, r, field);
418 static unsigned long php_unicode_toupper(unsigned long code,
419 enum mbfl_no_encoding enc) {
420 int field;
421 long l, r;
423 if (php_unicode_is_upper(code))
424 return code;
426 if (php_unicode_is_lower(code)) {
428 * The character is lower case.
430 field = 2;
431 l = _uccase_len[0];
432 r = (l + _uccase_len[1]) - 3;
434 if (enc == mbfl_no_encoding_8859_9) {
435 return php_turkish_toupper(code, l, r, field);
438 } else {
440 * The character is title case.
442 field = 1;
443 l = _uccase_len[0] + _uccase_len[1];
444 r = _uccase_size - 3;
446 return case_lookup(code, l, r, field);
449 static unsigned long php_unicode_tolower(unsigned long code,
450 enum mbfl_no_encoding enc) {
451 int field;
452 long l, r;
454 if (php_unicode_is_lower(code))
455 return code;
457 if (php_unicode_is_upper(code)) {
459 * The character is upper case.
461 field = 1;
462 l = 0;
463 r = _uccase_len[0] - 3;
465 if (enc == mbfl_no_encoding_8859_9) {
466 return php_turkish_tolower(code, l, r, field);
469 } else {
471 * The character is title case.
473 field = 2;
474 l = _uccase_len[0] + _uccase_len[1];
475 r = _uccase_size - 3;
477 return case_lookup(code, l, r, field);
480 static unsigned long php_unicode_totitle(unsigned long code,
481 enum mbfl_no_encoding enc) {
482 int field;
483 long l, r;
485 if (php_unicode_is_title(code))
486 return code;
489 * The offset will always be the same for converting to title case.
491 field = 2;
493 if (php_unicode_is_upper(code)) {
495 * The character is upper case.
497 l = 0;
498 r = _uccase_len[0] - 3;
499 } else {
501 * The character is lower case.
503 l = _uccase_len[0];
504 r = (l + _uccase_len[1]) - 3;
506 return case_lookup(code, l, r, field);
/*
 * Reads the four bytes at ptr as one big-endian 32-bit value.
 *
 * Each byte is widened to unsigned int before shifting. Without that the byte
 * is promoted to (signed) int, so a leading byte >= 0x80 overflows on <<24 and
 * the result sign-extends when it is later widened to unsigned long.
 */
#define BE_ARY_TO_UINT32(ptr) (\
  ((unsigned int)((unsigned char*)(ptr))[0])<<24 |\
  ((unsigned int)((unsigned char*)(ptr))[1])<<16 |\
  ((unsigned int)((unsigned char*)(ptr))[2])<< 8 |\
  ((unsigned int)((unsigned char*)(ptr))[3]) )

/*
 * Stores the low 32 bits of val at ptr as four big-endian bytes. val is
 * evaluated exactly once.
 */
#define UINT32_TO_BE_ARY(ptr,val) { \
  unsigned int v = (val); \
  ((unsigned char*)(ptr))[0] = (v>>24) & 0xff,\
  ((unsigned char*)(ptr))[1] = (v>>16) & 0xff,\
  ((unsigned char*)(ptr))[2] = (v>> 8) & 0xff,\
  ((unsigned char*)(ptr))[3] = (v    ) & 0xff;\
}
525 * Return 0 if input contains any illegal encoding, otherwise 1.
526 * Even if any illegal encoding is detected the result may contain a list
527 * of parsed encodings.
529 static int php_mb_parse_encoding_list(const char *value, int value_length,
530 mbfl_no_encoding **return_list,
531 int *return_size, int persistent) {
532 int n, l, size, bauto, ret = 1;
533 char *p, *p1, *p2, *endp, *tmpstr;
534 mbfl_no_encoding no_encoding;
535 mbfl_no_encoding *src, *entry, *list;
537 list = NULL;
538 if (value == NULL || value_length <= 0) {
539 if (return_list) {
540 *return_list = NULL;
542 if (return_size) {
543 *return_size = 0;
545 return 0;
546 } else {
547 mbfl_no_encoding *identify_list;
548 int identify_list_size;
550 identify_list = MBSTRG(default_detect_order_list);
551 identify_list_size = MBSTRG(default_detect_order_list_size);
553 /* copy the value string for work */
554 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
555 tmpstr = (char *)strndup(value+1, value_length-2);
556 value_length -= 2;
558 else
559 tmpstr = (char *)strndup(value, value_length);
560 if (tmpstr == NULL) {
561 return 0;
563 /* count the number of listed encoding names */
564 endp = tmpstr + value_length;
565 n = 1;
566 p1 = tmpstr;
567 while ((p2 = (char*)string_memnstr(p1, ",", 1, endp)) != NULL) {
568 p1 = p2 + 1;
569 n++;
571 size = n + identify_list_size;
572 /* make list */
573 list = (mbfl_no_encoding *)calloc(size, sizeof(int));
574 if (list != NULL) {
575 entry = list;
576 n = 0;
577 bauto = 0;
578 p1 = tmpstr;
579 do {
580 p2 = p = (char*)string_memnstr(p1, ",", 1, endp);
581 if (p == NULL) {
582 p = endp;
584 *p = '\0';
585 /* trim spaces */
586 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
587 p1++;
589 p--;
590 while (p > p1 && (*p == ' ' || *p == '\t')) {
591 *p = '\0';
592 p--;
594 /* convert to the encoding number and check encoding */
595 if (strcasecmp(p1, "auto") == 0) {
596 if (!bauto) {
597 bauto = 1;
598 l = identify_list_size;
599 src = identify_list;
600 while (l > 0) {
601 *entry++ = *src++;
602 l--;
603 n++;
606 } else {
607 no_encoding = mbfl_name2no_encoding(p1);
608 if (no_encoding != mbfl_no_encoding_invalid) {
609 *entry++ = no_encoding;
610 n++;
611 } else {
612 ret = 0;
615 p1 = p2 + 1;
616 } while (n < size && p2 != NULL);
617 if (n > 0) {
618 if (return_list) {
619 *return_list = list;
620 } else {
621 free(list);
623 } else {
624 free(list);
625 if (return_list) {
626 *return_list = NULL;
628 ret = 0;
630 if (return_size) {
631 *return_size = n;
633 } else {
634 if (return_list) {
635 *return_list = NULL;
637 if (return_size) {
638 *return_size = 0;
640 ret = 0;
642 free(tmpstr);
645 return ret;
648 static char *php_mb_convert_encoding(const char *input, size_t length,
649 const char *_to_encoding,
650 const char *_from_encodings,
651 unsigned int *output_len) {
652 mbfl_string string, result, *ret;
653 mbfl_no_encoding from_encoding, to_encoding;
654 mbfl_buffer_converter *convd;
655 int size;
656 mbfl_no_encoding *list;
657 char *output = NULL;
659 if (output_len) {
660 *output_len = 0;
662 if (!input) {
663 return NULL;
665 /* new encoding */
666 if (_to_encoding && strlen(_to_encoding)) {
667 to_encoding = mbfl_name2no_encoding(_to_encoding);
668 if (to_encoding == mbfl_no_encoding_invalid) {
669 raise_warning("Unknown encoding \"%s\"", _to_encoding);
670 return NULL;
672 } else {
673 to_encoding = MBSTRG(current_internal_encoding);
676 /* initialize string */
677 mbfl_string_init(&string);
678 mbfl_string_init(&result);
679 from_encoding = MBSTRG(current_internal_encoding);
680 string.no_encoding = from_encoding;
681 string.no_language = MBSTRG(current_language);
682 string.val = (unsigned char *)input;
683 string.len = length;
685 /* pre-conversion encoding */
686 if (_from_encodings) {
687 list = NULL;
688 size = 0;
689 php_mb_parse_encoding_list(_from_encodings, strlen(_from_encodings),
690 &list, &size, 0);
691 if (size == 1) {
692 from_encoding = *list;
693 string.no_encoding = from_encoding;
694 } else if (size > 1) {
695 /* auto detect */
696 from_encoding = mbfl_identify_encoding_no(&string, list, size,
697 MBSTRG(strict_detection));
698 if (from_encoding != mbfl_no_encoding_invalid) {
699 string.no_encoding = from_encoding;
700 } else {
701 raise_warning("Unable to detect character encoding");
702 from_encoding = mbfl_no_encoding_pass;
703 to_encoding = from_encoding;
704 string.no_encoding = from_encoding;
706 } else {
707 raise_warning("Illegal character encoding specified");
709 if (list != NULL) {
710 free((void *)list);
714 /* initialize converter */
715 convd = mbfl_buffer_converter_new(from_encoding, to_encoding, string.len);
716 if (convd == NULL) {
717 raise_warning("Unable to create character encoding converter");
718 return NULL;
720 mbfl_buffer_converter_illegal_mode
721 (convd, MBSTRG(current_filter_illegal_mode));
722 mbfl_buffer_converter_illegal_substchar
723 (convd, MBSTRG(current_filter_illegal_substchar));
725 /* do it */
726 ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
727 if (ret) {
728 if (output_len) {
729 *output_len = ret->len;
731 output = (char *)ret->val;
734 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
735 mbfl_buffer_converter_delete(convd);
736 return output;
739 static char *php_unicode_convert_case(int case_mode, const char *srcstr,
740 size_t srclen, unsigned int *ret_len,
741 const char *src_encoding) {
742 char *unicode, *newstr;
743 unsigned int unicode_len;
744 unsigned char *unicode_ptr;
745 size_t i;
746 enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
748 unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding,
749 &unicode_len);
750 if (unicode == NULL)
751 return NULL;
753 unicode_ptr = (unsigned char *)unicode;
755 switch(case_mode) {
756 case PHP_UNICODE_CASE_UPPER:
757 for (i = 0; i < unicode_len; i+=4) {
758 UINT32_TO_BE_ARY(&unicode_ptr[i],
759 php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]),
760 _src_encoding));
762 break;
764 case PHP_UNICODE_CASE_LOWER:
765 for (i = 0; i < unicode_len; i+=4) {
766 UINT32_TO_BE_ARY(&unicode_ptr[i],
767 php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]),
768 _src_encoding));
770 break;
772 case PHP_UNICODE_CASE_TITLE:
774 int mode = 0;
776 for (i = 0; i < unicode_len; i+=4) {
777 int res = php_unicode_is_prop
778 (BE_ARY_TO_UINT32(&unicode_ptr[i]),
779 UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT, 0);
780 if (mode) {
781 if (res) {
782 UINT32_TO_BE_ARY
783 (&unicode_ptr[i],
784 php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]),
785 _src_encoding));
786 } else {
787 mode = 0;
789 } else {
790 if (res) {
791 mode = 1;
792 UINT32_TO_BE_ARY
793 (&unicode_ptr[i],
794 php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]),
795 _src_encoding));
800 break;
803 newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding,
804 "UCS-4BE", ret_len);
805 free(unicode);
806 return newstr;
809 ///////////////////////////////////////////////////////////////////////////////
810 // helpers
813 * Return 0 if input contains any illegal encoding, otherwise 1.
814 * Even if any illegal encoding is detected the result may contain a list
815 * of parsed encodings.
817 static int php_mb_parse_encoding_array(CArrRef array,
818 mbfl_no_encoding **return_list,
819 int *return_size, int persistent) {
820 int n, l, size, bauto,ret = 1;
821 mbfl_no_encoding no_encoding;
822 mbfl_no_encoding *src, *list, *entry;
824 list = NULL;
825 mbfl_no_encoding *identify_list = MBSTRG(default_detect_order_list);
826 int identify_list_size = MBSTRG(default_detect_order_list_size);
828 size = array.size() + identify_list_size;
829 list = (mbfl_no_encoding *)calloc(size, sizeof(int));
830 if (list != NULL) {
831 entry = list;
832 bauto = 0;
833 n = 0;
834 for (ArrayIter iter(array); iter; ++iter) {
835 String hash_entry = iter.second();
836 if (strcasecmp(hash_entry.data(), "auto") == 0) {
837 if (!bauto) {
838 bauto = 1;
839 l = identify_list_size;
840 src = identify_list;
841 while (l > 0) {
842 *entry++ = *src++;
843 l--;
844 n++;
847 } else {
848 no_encoding = mbfl_name2no_encoding(hash_entry.data());
849 if (no_encoding != mbfl_no_encoding_invalid) {
850 *entry++ = no_encoding;
851 n++;
852 } else {
853 ret = 0;
857 if (n > 0) {
858 if (return_list) {
859 *return_list = list;
860 } else {
861 free(list);
863 } else {
864 free(list);
865 if (return_list) {
866 *return_list = NULL;
868 ret = 0;
870 if (return_size) {
871 *return_size = n;
873 } else {
874 if (return_list) {
875 *return_list = NULL;
877 if (return_size) {
878 *return_size = 0;
880 ret = 0;
882 return ret;
885 static bool php_mb_parse_encoding(CVarRef encoding,
886 mbfl_no_encoding **return_list,
887 int *return_size, bool persistent) {
888 bool ret;
889 if (encoding.is(KindOfArray)) {
890 ret = php_mb_parse_encoding_array(encoding.toArray(),
891 return_list, return_size,
892 persistent ? 1 : 0);
893 } else {
894 String enc = encoding.toString();
895 ret = php_mb_parse_encoding_list(enc.data(), enc.size(),
896 return_list, return_size,
897 persistent ? 1 : 0);
899 if (!ret) {
900 if (return_list && *return_list) {
901 free(*return_list);
902 *return_list = NULL;
904 return_size = 0;
906 return ret;
909 static int php_mb_nls_get_default_detect_order_list(mbfl_no_language lang,
910 mbfl_no_encoding **plist,
911 int* plist_size) {
912 size_t i;
913 *plist = (mbfl_no_encoding *) php_mb_default_identify_list_neut;
914 *plist_size = sizeof(php_mb_default_identify_list_neut) /
915 sizeof(php_mb_default_identify_list_neut[0]);
917 for (i = 0; i < sizeof(php_mb_default_identify_list) /
918 sizeof(php_mb_default_identify_list[0]); i++) {
919 if (php_mb_default_identify_list[i].lang == lang) {
920 *plist = php_mb_default_identify_list[i].list;
921 *plist_size = php_mb_default_identify_list[i].list_size;
922 return 1;
925 return 0;
928 static size_t php_mb_mbchar_bytes_ex(const char *s, const mbfl_encoding *enc) {
929 if (enc != NULL) {
930 if (enc->flag & MBFL_ENCTYPE_MBCS) {
931 if (enc->mblen_table != NULL) {
932 if (s != NULL) return enc->mblen_table[*(unsigned char *)s];
934 } else if (enc->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) {
935 return 2;
936 } else if (enc->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) {
937 return 4;
940 return 1;
943 static int php_mb_stripos(int mode,
944 const char *old_haystack, int old_haystack_len,
945 const char *old_needle, int old_needle_len,
946 long offset, const char *from_encoding) {
947 int n;
948 mbfl_string haystack, needle;
949 n = -1;
951 mbfl_string_init(&haystack);
952 mbfl_string_init(&needle);
953 haystack.no_language = MBSTRG(current_language);
954 haystack.no_encoding = MBSTRG(current_internal_encoding);
955 needle.no_language = MBSTRG(current_language);
956 needle.no_encoding = MBSTRG(current_internal_encoding);
958 do {
959 haystack.val = (unsigned char *)php_unicode_convert_case
960 (PHP_UNICODE_CASE_UPPER, old_haystack, (size_t)old_haystack_len,
961 &haystack.len, from_encoding);
962 if (!haystack.val) {
963 break;
965 if (haystack.len <= 0) {
966 break;
969 needle.val = (unsigned char *)php_unicode_convert_case
970 (PHP_UNICODE_CASE_UPPER, old_needle, (size_t)old_needle_len,
971 &needle.len, from_encoding);
972 if (!needle.val) {
973 break;
975 if (needle.len <= 0) {
976 break;
979 haystack.no_encoding = needle.no_encoding =
980 mbfl_name2no_encoding(from_encoding);
981 if (haystack.no_encoding == mbfl_no_encoding_invalid) {
982 raise_warning("Unknown encoding \"%s\"", from_encoding);
983 break;
986 int haystack_char_len = mbfl_strlen(&haystack);
987 if (mode) {
988 if ((offset > 0 && offset > haystack_char_len) ||
989 (offset < 0 && -offset > haystack_char_len)) {
990 raise_warning("Offset is greater than the length of haystack string");
991 break;
993 } else {
994 if (offset < 0 || offset > haystack_char_len) {
995 raise_warning("Offset not contained in string.");
996 break;
1000 n = mbfl_strpos(&haystack, &needle, offset, mode);
1001 } while(0);
1003 if (haystack.val) {
1004 free(haystack.val);
1006 if (needle.val) {
1007 free(needle.val);
1009 return n;
1012 ///////////////////////////////////////////////////////////////////////////////
1014 Array f_mb_list_encodings() {
1015 Array ret;
1016 int i = 0;
1017 const mbfl_encoding **encodings = mbfl_get_supported_encodings();
1018 const mbfl_encoding *encoding;
1019 while ((encoding = encodings[i++]) != NULL) {
1020 ret.append(String(encoding->name, CopyString));
1022 return ret;
1025 Variant f_mb_list_encodings_alias_names(CStrRef name /* = null_string */) {
1026 const mbfl_encoding **encodings;
1027 const mbfl_encoding *encoding;
1028 mbfl_no_encoding no_encoding;
1029 int i, j;
1031 Array ret;
1032 if (name.isNull()) {
1033 i = 0;
1034 encodings = mbfl_get_supported_encodings();
1035 while ((encoding = encodings[i++]) != NULL) {
1036 Array row;
1037 if (encoding->aliases != NULL) {
1038 j = 0;
1039 while ((*encoding->aliases)[j] != NULL) {
1040 row.append(String((*encoding->aliases)[j], CopyString));
1041 j++;
1044 ret.set(String(encoding->name, CopyString), row);
1046 } else {
1047 no_encoding = mbfl_name2no_encoding(name.data());
1048 if (no_encoding == mbfl_no_encoding_invalid) {
1049 raise_warning("Unknown encoding \"%s\"", name.data());
1050 return false;
1053 char *name = (char *)mbfl_no_encoding2name(no_encoding);
1054 if (name != NULL) {
1055 i = 0;
1056 encodings = mbfl_get_supported_encodings();
1057 while ((encoding = encodings[i++]) != NULL) {
1058 if (strcmp(encoding->name, name) != 0) continue;
1060 if (encoding->aliases != NULL) {
1061 j = 0;
1062 while ((*encoding->aliases)[j] != NULL) {
1063 ret.append(String((*encoding->aliases)[j], CopyString));
1064 j++;
1068 break;
1070 } else {
1071 return false;
1074 return ret;
1077 Variant f_mb_list_mime_names(CStrRef name /* = null_string */) {
1078 const mbfl_encoding **encodings;
1079 const mbfl_encoding *encoding;
1080 mbfl_no_encoding no_encoding;
1081 int i;
1083 Array ret;
1084 if (name.isNull()) {
1085 i = 0;
1086 encodings = mbfl_get_supported_encodings();
1087 while ((encoding = encodings[i++]) != NULL) {
1088 if (encoding->mime_name != NULL) {
1089 ret.set(String(encoding->name, CopyString),
1090 String(encoding->mime_name, CopyString));
1091 } else{
1092 ret.set(String(encoding->name, CopyString), "");
1095 } else {
1096 no_encoding = mbfl_name2no_encoding(name.data());
1097 if (no_encoding == mbfl_no_encoding_invalid) {
1098 raise_warning("Unknown encoding \"%s\"", name.data());
1099 return false;
1102 char *name = (char *)mbfl_no_encoding2name(no_encoding);
1103 if (name != NULL) {
1104 i = 0;
1105 encodings = mbfl_get_supported_encodings();
1106 while ((encoding = encodings[i++]) != NULL) {
1107 if (strcmp(encoding->name, name) != 0) continue;
1108 if (encoding->mime_name != NULL) {
1109 return String(encoding->mime_name, CopyString);
1111 break;
1113 return "";
1114 } else {
1115 return false;
1118 return ret;
1121 bool f_mb_check_encoding(CStrRef var /* = null_string */,
1122 CStrRef encoding /* = null_string */) {
1123 mbfl_buffer_converter *convd;
1124 mbfl_no_encoding no_encoding = MBSTRG(current_internal_encoding);
1125 mbfl_string string, result, *ret = NULL;
1126 long illegalchars = 0;
1128 if (var.isNull()) {
1129 return MBSTRG(illegalchars) == 0;
1132 if (!encoding.isNull()) {
1133 no_encoding = mbfl_name2no_encoding(encoding.data());
1134 if (no_encoding == mbfl_no_encoding_invalid ||
1135 no_encoding == mbfl_no_encoding_pass) {
1136 raise_warning("Invalid encoding \"%s\"", encoding.data());
1137 return false;
1141 convd = mbfl_buffer_converter_new(no_encoding, no_encoding, 0);
1142 if (convd == NULL) {
1143 raise_warning("Unable to create converter");
1144 return false;
1146 mbfl_buffer_converter_illegal_mode
1147 (convd, MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE);
1148 mbfl_buffer_converter_illegal_substchar
1149 (convd, 0);
1151 /* initialize string */
1152 mbfl_string_init_set(&string, mbfl_no_language_neutral, no_encoding);
1153 mbfl_string_init(&result);
1155 string.val = (unsigned char *)var.data();
1156 string.len = var.size();
1157 ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
1158 illegalchars = mbfl_buffer_illegalchars(convd);
1159 mbfl_buffer_converter_delete(convd);
1161 if (ret != NULL) {
1162 MBSTRG(illegalchars) += illegalchars;
1163 if (illegalchars == 0 && string.len == ret->len &&
1164 memcmp((const char *)string.val, (const char *)ret->val,
1165 string.len) == 0) {
1166 mbfl_string_clear(&result);
1167 return true;
1168 } else {
1169 mbfl_string_clear(&result);
1170 return false;
1172 } else {
1173 return false;
1177 Variant f_mb_convert_case(CStrRef str, int mode,
1178 CStrRef encoding /* = null_string */) {
1179 const char *enc = NULL;
1180 if (encoding.empty()) {
1181 enc = mbfl_no2preferred_mime_name(MBSTRG(current_internal_encoding));
1184 unsigned int ret_len;
1185 char *newstr = php_unicode_convert_case(mode, str.data(), str.size(),
1186 &ret_len, enc);
1187 if (newstr) {
1188 return String(newstr, ret_len, AttachString);
1190 return false;
1193 Variant f_mb_convert_encoding(CStrRef str, CStrRef to_encoding,
1194 CVarRef from_encoding /* = null_variant */) {
1195 String encoding = from_encoding.toString();
1196 if (from_encoding.is(KindOfArray)) {
1197 StringBuffer _from_encodings;
1198 Array encs = from_encoding.toArray();
1199 for (ArrayIter iter(encs); iter; ++iter) {
1200 if (!_from_encodings.empty()) {
1201 _from_encodings.append(",");
1203 _from_encodings.append(iter.second().toString());
1205 encoding = _from_encodings.detach();
1208 unsigned int size;
1209 char *ret = php_mb_convert_encoding(str.data(), str.size(),
1210 to_encoding.data(),
1211 (!encoding.empty() ?
1212 encoding.data() : NULL),
1213 &size);
1214 if (ret != NULL) {
1215 return String(ret, size, AttachString);
1217 return false;
1220 Variant f_mb_convert_kana(CStrRef str, CStrRef option /* = null_string */,
1221 CStrRef encoding /* = null_string */) {
1222 mbfl_string string, result, *ret;
1223 mbfl_string_init(&string);
1224 string.no_language = MBSTRG(current_language);
1225 string.no_encoding = MBSTRG(current_internal_encoding);
1226 string.val = (unsigned char *)str.data();
1227 string.len = str.size();
1229 int opt = 0x900;
1230 if (!option.empty()) {
1231 const char *p = option.data();
1232 int n = option.size();
1233 int i = 0;
1234 opt = 0;
1235 while (i < n) {
1236 i++;
1237 switch (*p++) {
1238 case 'A': opt |= 0x1; break;
1239 case 'a': opt |= 0x10; break;
1240 case 'R': opt |= 0x2; break;
1241 case 'r': opt |= 0x20; break;
1242 case 'N': opt |= 0x4; break;
1243 case 'n': opt |= 0x40; break;
1244 case 'S': opt |= 0x8; break;
1245 case 's': opt |= 0x80; break;
1246 case 'K': opt |= 0x100; break;
1247 case 'k': opt |= 0x1000; break;
1248 case 'H': opt |= 0x200; break;
1249 case 'h': opt |= 0x2000; break;
1250 case 'V': opt |= 0x800; break;
1251 case 'C': opt |= 0x10000; break;
1252 case 'c': opt |= 0x20000; break;
1253 case 'M': opt |= 0x100000; break;
1254 case 'm': opt |= 0x200000; break;
1259 /* encoding */
1260 if (!encoding.empty()) {
1261 string.no_encoding = mbfl_name2no_encoding(encoding.data());
1262 if (string.no_encoding == mbfl_no_encoding_invalid) {
1263 raise_warning("Unknown encoding \"%s\"", encoding.data());
1264 return false;
1268 ret = mbfl_ja_jp_hantozen(&string, &result, opt);
1269 if (ret != NULL) {
1270 return String((const char*)ret->val, ret->len, AttachString);
1272 return false;
1275 static bool php_mbfl_encoding_detect(CVarRef var,
1276 mbfl_encoding_detector *identd,
1277 mbfl_string *string) {
1278 if (var.is(KindOfArray) || var.is(KindOfObject)) {
1279 Array items = var.toArray();
1280 for (ArrayIter iter(items); iter; ++iter) {
1281 if (php_mbfl_encoding_detect(iter.second(), identd, string)) {
1282 return true;
1285 } else if (var.isString()) {
1286 String svar = var.toString();
1287 string->val = (unsigned char *)svar.data();
1288 string->len = svar.size();
1289 if (mbfl_encoding_detector_feed(identd, string)) {
1290 return true;
1293 return false;
1296 static Variant php_mbfl_convert(CVarRef var,
1297 mbfl_buffer_converter *convd,
1298 mbfl_string *string,
1299 mbfl_string *result) {
1300 if (var.is(KindOfArray)) {
1301 Array ret;
1302 Array items = var.toArray();
1303 for (ArrayIter iter(items); iter; ++iter) {
1304 ret.set(iter.first(),
1305 php_mbfl_convert(iter.second(), convd, string, result));
1307 return ret;
1310 if (var.is(KindOfObject)) {
1311 Object obj = var.toObject();
1312 Array items = var.toArray();
1313 for (ArrayIter iter(items); iter; ++iter) {
1314 obj->o_set(iter.first().toString(),
1315 php_mbfl_convert(iter.second().toString().data(), convd,
1316 string, result));
1318 return var; // which still has obj
1321 if (var.isString()) {
1322 String svar = var.toString();
1323 string->val = (unsigned char *)svar.data();
1324 string->len = svar.size();
1325 mbfl_string *ret =
1326 mbfl_buffer_converter_feed_result(convd, string, result);
1327 return String((const char*)ret->val, ret->len, AttachString);
1330 return var;
1333 Variant f_mb_convert_variables(int _argc, CStrRef to_encoding,
1334 CVarRef from_encoding, VRefParam vars,
1335 CArrRef _argv /* = null_array */) {
1336 mbfl_string string, result;
1337 mbfl_no_encoding _from_encoding, _to_encoding;
1338 mbfl_encoding_detector *identd;
1339 mbfl_buffer_converter *convd;
1340 int elistsz;
1341 mbfl_no_encoding *elist;
1342 char *name;
1344 /* new encoding */
1345 _to_encoding = mbfl_name2no_encoding(to_encoding.data());
1346 if (_to_encoding == mbfl_no_encoding_invalid) {
1347 raise_warning("Unknown encoding \"%s\"", to_encoding.data());
1348 return false;
1351 /* initialize string */
1352 mbfl_string_init(&string);
1353 mbfl_string_init(&result);
1354 _from_encoding = MBSTRG(current_internal_encoding);
1355 string.no_encoding = _from_encoding;
1356 string.no_language = MBSTRG(current_language);
1358 /* pre-conversion encoding */
1359 elist = NULL;
1360 elistsz = 0;
1361 php_mb_parse_encoding(from_encoding, &elist, &elistsz, false);
1362 if (elistsz <= 0) {
1363 _from_encoding = mbfl_no_encoding_pass;
1364 } else if (elistsz == 1) {
1365 _from_encoding = *elist;
1366 } else {
1367 /* auto detect */
1368 _from_encoding = mbfl_no_encoding_invalid;
1369 identd = mbfl_encoding_detector_new(elist, elistsz,
1370 MBSTRG(strict_detection));
1371 if (identd != NULL) {
1372 for (int n = -1; n < _argv.size(); n++) {
1373 if (php_mbfl_encoding_detect(n < 0 ? (Variant&)vars : _argv[n],
1374 identd, &string)) {
1375 break;
1378 _from_encoding = mbfl_encoding_detector_judge(identd);
1379 mbfl_encoding_detector_delete(identd);
1382 if (_from_encoding == mbfl_no_encoding_invalid) {
1383 raise_warning("Unable to detect encoding");
1384 _from_encoding = mbfl_no_encoding_pass;
1387 if (elist != NULL) {
1388 free((void *)elist);
1391 /* create converter */
1392 convd = NULL;
1393 if (_from_encoding != mbfl_no_encoding_pass) {
1394 convd = mbfl_buffer_converter_new(_from_encoding, _to_encoding, 0);
1395 if (convd == NULL) {
1396 raise_warning("Unable to create converter");
1397 return false;
1399 mbfl_buffer_converter_illegal_mode
1400 (convd, MBSTRG(current_filter_illegal_mode));
1401 mbfl_buffer_converter_illegal_substchar
1402 (convd, MBSTRG(current_filter_illegal_substchar));
1405 /* convert */
1406 if (convd != NULL) {
1407 vars = php_mbfl_convert(vars, convd, &string, &result);
1408 for (int n = 0; n < _argv.size(); n++) {
1409 const_cast<Array&>(_argv).lval(n) =
1410 php_mbfl_convert(_argv[n], convd, &string, &result);
1412 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
1413 mbfl_buffer_converter_delete(convd);
1416 name = (char *)mbfl_no_encoding2name(_from_encoding);
1417 if (name != NULL) {
1418 return String(name, CopyString);
1420 return false;
1423 Variant f_mb_decode_mimeheader(CStrRef str) {
1424 mbfl_string string, result, *ret;
1426 mbfl_string_init(&string);
1427 string.no_language = MBSTRG(current_language);
1428 string.no_encoding = MBSTRG(current_internal_encoding);
1429 string.val = (unsigned char *)str.data();
1430 string.len = str.size();
1432 mbfl_string_init(&result);
1433 ret = mbfl_mime_header_decode(&string, &result,
1434 MBSTRG(current_internal_encoding));
1435 if (ret != NULL) {
1436 return String((const char*)ret->val, ret->len, AttachString);
1438 return false;
1441 static Variant php_mb_numericentity_exec(CStrRef str, CVarRef convmap,
1442 CStrRef encoding, int type) {
1443 int mapsize=0;
1444 mbfl_string string, result, *ret;
1445 mbfl_no_encoding no_encoding;
1447 mbfl_string_init(&string);
1448 string.no_language = MBSTRG(current_language);
1449 string.no_encoding = MBSTRG(current_internal_encoding);
1450 string.val = (unsigned char *)str.data();
1451 string.len = str.size();
1453 /* encoding */
1454 if (!encoding.empty()) {
1455 no_encoding = mbfl_name2no_encoding(encoding.data());
1456 if (no_encoding == mbfl_no_encoding_invalid) {
1457 raise_warning("Unknown encoding \"%s\"", encoding.data());
1458 return false;
1459 } else {
1460 string.no_encoding = no_encoding;
1464 /* conversion map */
1465 int *iconvmap = NULL;
1466 if (convmap.is(KindOfArray)) {
1467 Array convs = convmap.toArray();
1468 mapsize = convs.size();
1469 if (mapsize > 0) {
1470 iconvmap = (int*)malloc(mapsize * sizeof(int));
1471 int *mapelm = iconvmap;
1472 for (ArrayIter iter(convs); iter; ++iter) {
1473 *mapelm++ = iter.second().toInt32();
1477 if (iconvmap == NULL) {
1478 return false;
1480 mapsize /= 4;
1482 ret = mbfl_html_numeric_entity(&string, &result, iconvmap, mapsize, type);
1483 free(iconvmap);
1484 if (ret != NULL) {
1485 return String((const char*)ret->val, ret->len, AttachString);
1487 return false;
1490 Variant f_mb_decode_numericentity(CStrRef str, CVarRef convmap,
1491 CStrRef encoding /* = null_string */) {
1492 return php_mb_numericentity_exec(str, convmap, encoding, 1);
1495 Variant f_mb_detect_encoding(CStrRef str, CVarRef encoding_list /* = null_variant */,
1496 CVarRef strict /* = null_variant */) {
1497 mbfl_string string;
1498 const char *ret;
1499 mbfl_no_encoding *elist;
1500 int size;
1501 mbfl_no_encoding *list = 0;
1503 /* make encoding list */
1504 list = NULL;
1505 size = 0;
1506 php_mb_parse_encoding(encoding_list, &list, &size, false);
1507 if (size > 0 && list != NULL) {
1508 elist = list;
1509 } else {
1510 elist = MBSTRG(current_detect_order_list);
1511 size = MBSTRG(current_detect_order_list_size);
1514 long nstrict = 0;
1515 if (!strict.isNull()) {
1516 nstrict = strict.toInt64();
1517 } else {
1518 nstrict = MBSTRG(strict_detection);
1521 mbfl_string_init(&string);
1522 string.no_language = MBSTRG(current_language);
1523 string.val = (unsigned char *)str.data();
1524 string.len = str.size();
1525 ret = mbfl_identify_encoding_name(&string, elist, size, nstrict);
1526 if (list != NULL) {
1527 free(list);
1529 if (ret != NULL) {
1530 return String(ret, CopyString);
1532 return false;
1535 Variant f_mb_detect_order(CVarRef encoding_list /* = null_variant */) {
1536 int n, size;
1537 mbfl_no_encoding *list, *entry;
1539 if (encoding_list.isNull()) {
1540 Array ret;
1541 entry = MBSTRG(current_detect_order_list);
1542 n = MBSTRG(current_detect_order_list_size);
1543 while (n > 0) {
1544 char *name = (char *)mbfl_no_encoding2name(*entry);
1545 if (name) {
1546 ret.append(String(name, CopyString));
1548 entry++;
1549 n--;
1551 return ret;
1554 list = NULL;
1555 size = 0;
1556 if (!php_mb_parse_encoding(encoding_list, &list, &size, false) ||
1557 list == NULL) {
1558 return false;
1560 if (MBSTRG(current_detect_order_list)) {
1561 free(MBSTRG(current_detect_order_list));
1563 MBSTRG(current_detect_order_list) = list;
1564 MBSTRG(current_detect_order_list_size) = size;
1565 return true;
1568 Variant f_mb_encode_mimeheader(CStrRef str, CStrRef charset /* = null_string */,
1569 CStrRef transfer_encoding /* = null_string */,
1570 CStrRef linefeed /* = "\r\n" */,
1571 int indent /* = 0 */) {
1572 mbfl_no_encoding charsetenc, transenc;
1573 mbfl_string string, result, *ret;
1575 mbfl_string_init(&string);
1576 string.no_language = MBSTRG(current_language);
1577 string.no_encoding = MBSTRG(current_internal_encoding);
1578 string.val = (unsigned char *)str.data();
1579 string.len = str.size();
1581 charsetenc = mbfl_no_encoding_pass;
1582 transenc = mbfl_no_encoding_base64;
1584 if (!charset.empty()) {
1585 charsetenc = mbfl_name2no_encoding(charset.data());
1586 if (charsetenc == mbfl_no_encoding_invalid) {
1587 raise_warning("Unknown encoding \"%s\"", charset.data());
1588 return false;
1590 } else {
1591 const mbfl_language *lang = mbfl_no2language(MBSTRG(current_language));
1592 if (lang != NULL) {
1593 charsetenc = lang->mail_charset;
1594 transenc = lang->mail_header_encoding;
1598 if (!transfer_encoding.empty()) {
1599 char ch = *transfer_encoding.data();
1600 if (ch == 'B' || ch == 'b') {
1601 transenc = mbfl_no_encoding_base64;
1602 } else if (ch == 'Q' || ch == 'q') {
1603 transenc = mbfl_no_encoding_qprint;
1607 mbfl_string_init(&result);
1608 ret = mbfl_mime_header_encode(&string, &result, charsetenc, transenc,
1609 linefeed.data(), indent);
1610 if (ret != NULL) {
1611 return String((const char*)ret->val, ret->len, AttachString);
1613 return false;
1616 Variant f_mb_encode_numericentity(CStrRef str, CVarRef convmap,
1617 CStrRef encoding /* = null_string */) {
1618 return php_mb_numericentity_exec(str, convmap, encoding, 0);
1621 const StaticString
1622 s_internal_encoding("internal_encoding"),
1623 s_http_input("http_input"),
1624 s_http_output("http_output"),
1625 s_mail_charset("mail_charset"),
1626 s_mail_header_encoding("mail_header_encoding"),
1627 s_mail_body_encoding("mail_body_encoding"),
1628 s_illegal_chars("illegal_chars"),
1629 s_encoding_translation("encoding_translation"),
1630 s_On("On"),
1631 s_Off("Off"),
1632 s_language("language"),
1633 s_detect_order("detect_order"),
1634 s_substitute_character("substitute_character"),
1635 s_strict_detection("strict_detection"),
1636 s_none("none"),
1637 s_long("long"),
1638 s_entity("entity");
1640 Variant f_mb_get_info(CStrRef type /* = null_string */) {
1641 const mbfl_language *lang = mbfl_no2language(MBSTRG(current_language));
1642 mbfl_no_encoding *entry;
1643 int n;
1645 char *name;
1646 if (type.empty() || strcasecmp(type.data(), "all") == 0) {
1647 Array ret;
1648 if ((name = (char *)mbfl_no_encoding2name
1649 (MBSTRG(current_internal_encoding))) != NULL) {
1650 ret.set(s_internal_encoding, String(name, CopyString));
1652 if ((name = (char *)mbfl_no_encoding2name
1653 (MBSTRG(http_input_identify))) != NULL) {
1654 ret.set(s_http_input, String(name, CopyString));
1656 if ((name = (char *)mbfl_no_encoding2name
1657 (MBSTRG(current_http_output_encoding))) != NULL) {
1658 ret.set(s_http_output, String(name, CopyString));
1660 if (lang != NULL) {
1661 if ((name = (char *)mbfl_no_encoding2name
1662 (lang->mail_charset)) != NULL) {
1663 ret.set(s_mail_charset, String(name, CopyString));
1665 if ((name = (char *)mbfl_no_encoding2name
1666 (lang->mail_header_encoding)) != NULL) {
1667 ret.set(s_mail_header_encoding, String(name, CopyString));
1669 if ((name = (char *)mbfl_no_encoding2name
1670 (lang->mail_body_encoding)) != NULL) {
1671 ret.set(s_mail_body_encoding, String(name, CopyString));
1674 ret.set(s_illegal_chars, MBSTRG(illegalchars));
1675 ret.set(s_encoding_translation,
1676 MBSTRG(encoding_translation) ? s_On : s_Off);
1677 if ((name = (char *)mbfl_no_language2name
1678 (MBSTRG(current_language))) != NULL) {
1679 ret.set(s_language, String(name, CopyString));
1681 n = MBSTRG(current_detect_order_list_size);
1682 entry = MBSTRG(current_detect_order_list);
1683 if (n > 0) {
1684 Array row;
1685 while (n > 0) {
1686 if ((name = (char *)mbfl_no_encoding2name(*entry)) != NULL) {
1687 row.append(String(name, CopyString));
1689 entry++;
1690 n--;
1692 ret.set(s_detect_order, row);
1694 switch (MBSTRG(current_filter_illegal_mode)) {
1695 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE:
1696 ret.set(s_substitute_character, s_none);
1697 break;
1698 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
1699 ret.set(s_substitute_character, s_long);
1700 break;
1701 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
1702 ret.set(s_substitute_character, s_entity);
1703 break;
1704 default:
1705 ret.set(s_substitute_character,
1706 MBSTRG(current_filter_illegal_substchar));
1708 ret.set(s_strict_detection, MBSTRG(strict_detection) ? s_On : s_Off);
1709 return ret;
1710 } else if (strcasecmp(type.data(), "internal_encoding") == 0) {
1711 if ((name = (char *)mbfl_no_encoding2name
1712 (MBSTRG(current_internal_encoding))) != NULL) {
1713 return String(name, CopyString);
1715 } else if (strcasecmp(type.data(), "http_input") == 0) {
1716 if ((name = (char *)mbfl_no_encoding2name
1717 (MBSTRG(http_input_identify))) != NULL) {
1718 return String(name, CopyString);
1720 } else if (strcasecmp(type.data(), "http_output") == 0) {
1721 if ((name = (char *)mbfl_no_encoding2name
1722 (MBSTRG(current_http_output_encoding))) != NULL) {
1723 return String(name, CopyString);
1725 } else if (strcasecmp(type.data(), "mail_charset") == 0) {
1726 if (lang != NULL &&
1727 (name = (char *)mbfl_no_encoding2name
1728 (lang->mail_charset)) != NULL) {
1729 return String(name, CopyString);
1731 } else if (strcasecmp(type.data(), "mail_header_encoding") == 0) {
1732 if (lang != NULL &&
1733 (name = (char *)mbfl_no_encoding2name
1734 (lang->mail_header_encoding)) != NULL) {
1735 return String(name, CopyString);
1737 } else if (strcasecmp(type.data(), "mail_body_encoding") == 0) {
1738 if (lang != NULL &&
1739 (name = (char *)mbfl_no_encoding2name
1740 (lang->mail_body_encoding)) != NULL) {
1741 return String(name, CopyString);
1743 } else if (strcasecmp(type.data(), "illegal_chars") == 0) {
1744 return MBSTRG(illegalchars);
1745 } else if (strcasecmp(type.data(), "encoding_translation") == 0) {
1746 return MBSTRG(encoding_translation) ? "On" : "Off";
1747 } else if (strcasecmp(type.data(), "language") == 0) {
1748 if ((name = (char *)mbfl_no_language2name
1749 (MBSTRG(current_language))) != NULL) {
1750 return String(name, CopyString);
1752 } else if (strcasecmp(type.data(), "detect_order") == 0) {
1753 n = MBSTRG(current_detect_order_list_size);
1754 entry = MBSTRG(current_detect_order_list);
1755 if (n > 0) {
1756 Array ret;
1757 while (n > 0) {
1758 name = (char *)mbfl_no_encoding2name(*entry);
1759 if (name) {
1760 ret.append(String(name, CopyString));
1762 entry++;
1763 n--;
1766 } else if (strcasecmp(type.data(), "substitute_character") == 0) {
1767 if (MBSTRG(current_filter_illegal_mode) ==
1768 MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
1769 return s_none;
1770 } else if (MBSTRG(current_filter_illegal_mode) ==
1771 MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
1772 return s_long;
1773 } else if (MBSTRG(current_filter_illegal_mode) ==
1774 MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
1775 return s_entity;
1776 } else {
1777 return MBSTRG(current_filter_illegal_substchar);
1779 } else if (strcasecmp(type.data(), "strict_detection") == 0) {
1780 return MBSTRG(strict_detection) ? s_On : s_Off;
1782 return false;
1785 Variant f_mb_http_input(CStrRef type /* = null_string */) {
1786 int n;
1787 char *name;
1788 mbfl_no_encoding *entry;
1789 mbfl_no_encoding result = mbfl_no_encoding_invalid;
1791 if (type.empty()) {
1792 result = MBSTRG(http_input_identify);
1793 } else {
1794 switch (*type.data()) {
1795 case 'G': case 'g': result = MBSTRG(http_input_identify_get); break;
1796 case 'P': case 'p': result = MBSTRG(http_input_identify_post); break;
1797 case 'C': case 'c': result = MBSTRG(http_input_identify_cookie); break;
1798 case 'S': case 's': result = MBSTRG(http_input_identify_string); break;
1799 case 'I': case 'i':
1801 Array ret;
1802 entry = MBSTRG(http_input_list);
1803 n = MBSTRG(http_input_list_size);
1804 while (n > 0) {
1805 name = (char *)mbfl_no_encoding2name(*entry);
1806 if (name) {
1807 ret.append(String(name, CopyString));
1809 entry++;
1810 n--;
1812 return ret;
1814 case 'L': case 'l':
1816 entry = MBSTRG(http_input_list);
1817 n = MBSTRG(http_input_list_size);
1818 StringBuffer list;
1819 while (n > 0) {
1820 name = (char *)mbfl_no_encoding2name(*entry);
1821 if (name) {
1822 if (list.empty()) {
1823 list.append(name);
1824 } else {
1825 list.append(',');
1826 list.append(name);
1829 entry++;
1830 n--;
1832 if (list.empty()) {
1833 return false;
1835 return list.detach();
1837 default:
1838 result = MBSTRG(http_input_identify);
1839 break;
1843 if (result != mbfl_no_encoding_invalid &&
1844 (name = (char *)mbfl_no_encoding2name(result)) != NULL) {
1845 return String(name, CopyString);
1847 return false;
1850 Variant f_mb_http_output(CStrRef encoding /* = null_string */) {
1851 if (encoding.empty()) {
1852 char *name = (char *)mbfl_no_encoding2name
1853 (MBSTRG(current_http_output_encoding));
1854 if (name != NULL) {
1855 return String(name, CopyString);
1857 return false;
1860 mbfl_no_encoding no_encoding = mbfl_name2no_encoding(encoding.data());
1861 if (no_encoding == mbfl_no_encoding_invalid) {
1862 raise_warning("Unknown encoding \"%s\"", encoding.data());
1863 return false;
1865 MBSTRG(current_http_output_encoding) = no_encoding;
1866 return true;
1869 Variant f_mb_internal_encoding(CStrRef encoding /* = null_string */) {
1870 if (encoding.empty()) {
1871 char *name = (char *)mbfl_no_encoding2name
1872 (MBSTRG(current_internal_encoding));
1873 if (name != NULL) {
1874 return String(name, CopyString);
1876 return false;
1879 mbfl_no_encoding no_encoding = mbfl_name2no_encoding(encoding.data());
1880 if (no_encoding == mbfl_no_encoding_invalid) {
1881 raise_warning("Unknown encoding \"%s\"", encoding.data());
1882 return false;
1885 MBSTRG(current_internal_encoding) = no_encoding;
1886 return true;
1889 Variant f_mb_language(CStrRef language /* = null_string */) {
1890 if (language.empty()) {
1891 return String(mbfl_no_language2name(MBSTRG(current_language)), CopyString);
1894 mbfl_no_language no_language = mbfl_name2no_language(language.data());
1895 if (no_language == mbfl_no_language_invalid) {
1896 raise_warning("Unknown language \"%s\"", language.data());
1897 return false;
1900 php_mb_nls_get_default_detect_order_list
1901 (no_language, &MBSTRG(default_detect_order_list),
1902 &MBSTRG(default_detect_order_list_size));
1903 MBSTRG(current_language) = no_language;
1904 return true;
1907 String f_mb_output_handler(CStrRef contents, int status) {
1908 mbfl_string string, result;
1909 int last_feed;
1911 mbfl_no_encoding encoding = MBSTRG(current_http_output_encoding);
1913 /* start phase only */
1914 if (status & PHP_OUTPUT_HANDLER_START) {
1915 /* delete the converter just in case. */
1916 if (MBSTRG(outconv)) {
1917 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1918 mbfl_buffer_converter_delete(MBSTRG(outconv));
1919 MBSTRG(outconv) = NULL;
1921 if (encoding == mbfl_no_encoding_pass) {
1922 return contents;
1925 /* analyze mime type */
1926 String mimetype = g_context->getMimeType();
1927 if (!mimetype.empty()) {
1928 const char *charset = mbfl_no2preferred_mime_name(encoding);
1929 if (charset) {
1930 g_context->setContentType(mimetype, charset);
1932 /* activate the converter */
1933 MBSTRG(outconv) = mbfl_buffer_converter_new
1934 (MBSTRG(current_internal_encoding), encoding, 0);
1938 /* just return if the converter is not activated. */
1939 if (MBSTRG(outconv) == NULL) {
1940 return contents;
1943 /* flag */
1944 last_feed = ((status & PHP_OUTPUT_HANDLER_END) != 0);
1945 /* mode */
1946 mbfl_buffer_converter_illegal_mode
1947 (MBSTRG(outconv), MBSTRG(current_filter_illegal_mode));
1948 mbfl_buffer_converter_illegal_substchar
1949 (MBSTRG(outconv), MBSTRG(current_filter_illegal_substchar));
1951 /* feed the string */
1952 mbfl_string_init(&string);
1953 string.no_language = MBSTRG(current_language);
1954 string.no_encoding = MBSTRG(current_internal_encoding);
1955 string.val = (unsigned char *)contents.data();
1956 string.len = contents.size();
1957 mbfl_buffer_converter_feed(MBSTRG(outconv), &string);
1958 if (last_feed) {
1959 mbfl_buffer_converter_flush(MBSTRG(outconv));
1961 /* get the converter output, and return it */
1962 mbfl_buffer_converter_result(MBSTRG(outconv), &result);
1964 /* delete the converter if it is the last feed. */
1965 if (last_feed) {
1966 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(MBSTRG(outconv));
1967 mbfl_buffer_converter_delete(MBSTRG(outconv));
1968 MBSTRG(outconv) = NULL;
1971 return String((const char *)result.val, result.len, AttachString);
1974 typedef struct _php_mb_encoding_handler_info_t {
1975 int data_type;
1976 const char *separator;
1977 unsigned int force_register_globals: 1;
1978 unsigned int report_errors: 1;
1979 enum mbfl_no_language to_language;
1980 enum mbfl_no_encoding to_encoding;
1981 enum mbfl_no_language from_language;
1982 int num_from_encodings;
1983 const enum mbfl_no_encoding *from_encodings;
1984 } php_mb_encoding_handler_info_t;
1986 static mbfl_no_encoding _php_mb_encoding_handler_ex
1987 (const php_mb_encoding_handler_info_t *info, Variant &arg, char *res) {
1988 char *var, *val;
1989 const char *s1, *s2;
1990 char *strtok_buf = NULL, **val_list = NULL;
1991 int n, num, *len_list = NULL;
1992 unsigned int val_len;
1993 mbfl_string string, resvar, resval;
1994 enum mbfl_no_encoding from_encoding = mbfl_no_encoding_invalid;
1995 mbfl_encoding_detector *identd = NULL;
1996 mbfl_buffer_converter *convd = NULL;
1998 mbfl_string_init_set(&string, info->to_language, info->to_encoding);
1999 mbfl_string_init_set(&resvar, info->to_language, info->to_encoding);
2000 mbfl_string_init_set(&resval, info->to_language, info->to_encoding);
2002 if (!res || *res == '\0') {
2003 goto out;
2006 /* count the variables(separators) contained in the "res".
2007 * separator may contain multiple separator chars.
2009 num = 1;
2010 for (s1=res; *s1 != '\0'; s1++) {
2011 for (s2=info->separator; *s2 != '\0'; s2++) {
2012 if (*s1 == *s2) {
2013 num++;
2017 num *= 2; /* need space for variable name and value */
2019 val_list = (char **)calloc(num, sizeof(char *));
2020 len_list = (int *)calloc(num, sizeof(int));
2022 /* split and decode the query */
2023 n = 0;
2024 strtok_buf = NULL;
2025 var = strtok_r(res, info->separator, &strtok_buf);
2026 while (var) {
2027 val = strchr(var, '=');
2028 if (val) { /* have a value */
2029 len_list[n] = url_decode_ex(var, val-var);
2030 val_list[n] = var;
2031 n++;
2033 *val++ = '\0';
2034 val_list[n] = val;
2035 len_list[n] = url_decode_ex(val, strlen(val));
2036 } else {
2037 len_list[n] = url_decode_ex(var, strlen(var));
2038 val_list[n] = var;
2039 n++;
2041 val_list[n] = const_cast<char*>("");
2042 len_list[n] = 0;
2044 n++;
2045 var = strtok_r(NULL, info->separator, &strtok_buf);
2047 num = n; /* make sure to process initilized vars only */
2049 /* initialize converter */
2050 if (info->num_from_encodings <= 0) {
2051 from_encoding = mbfl_no_encoding_pass;
2052 } else if (info->num_from_encodings == 1) {
2053 from_encoding = info->from_encodings[0];
2054 } else {
2055 /* auto detect */
2056 from_encoding = mbfl_no_encoding_invalid;
2057 identd = mbfl_encoding_detector_new
2058 ((enum mbfl_no_encoding *)info->from_encodings,
2059 info->num_from_encodings, MBSTRG(strict_detection));
2060 if (identd) {
2061 n = 0;
2062 while (n < num) {
2063 string.val = (unsigned char *)val_list[n];
2064 string.len = len_list[n];
2065 if (mbfl_encoding_detector_feed(identd, &string)) {
2066 break;
2068 n++;
2070 from_encoding = mbfl_encoding_detector_judge(identd);
2071 mbfl_encoding_detector_delete(identd);
2073 if (from_encoding == mbfl_no_encoding_invalid) {
2074 if (info->report_errors) {
2075 raise_warning("Unable to detect encoding");
2077 from_encoding = mbfl_no_encoding_pass;
2081 convd = NULL;
2082 if (from_encoding != mbfl_no_encoding_pass) {
2083 convd = mbfl_buffer_converter_new(from_encoding, info->to_encoding, 0);
2084 if (convd != NULL) {
2085 mbfl_buffer_converter_illegal_mode
2086 (convd, MBSTRG(current_filter_illegal_mode));
2087 mbfl_buffer_converter_illegal_substchar
2088 (convd, MBSTRG(current_filter_illegal_substchar));
2089 } else {
2090 if (info->report_errors) {
2091 raise_warning("Unable to create converter");
2093 goto out;
2097 /* convert encoding */
2098 string.no_encoding = from_encoding;
2100 n = 0;
2101 while (n < num) {
2102 string.val = (unsigned char *)val_list[n];
2103 string.len = len_list[n];
2104 if (convd != NULL &&
2105 mbfl_buffer_converter_feed_result(convd, &string, &resvar) != NULL) {
2106 var = (char *)resvar.val;
2107 } else {
2108 var = val_list[n];
2110 n++;
2111 string.val = (unsigned char *)val_list[n];
2112 string.len = len_list[n];
2113 if (convd != NULL &&
2114 mbfl_buffer_converter_feed_result(convd, &string, &resval) != NULL) {
2115 val = (char *)resval.val;
2116 val_len = resval.len;
2117 } else {
2118 val = val_list[n];
2119 val_len = len_list[n];
2121 n++;
2123 arg.set(String(var, CopyString), String(val, val_len, CopyString));
2125 if (convd != NULL){
2126 mbfl_string_clear(&resvar);
2127 mbfl_string_clear(&resval);
2131 out:
2132 if (convd != NULL) {
2133 MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
2134 mbfl_buffer_converter_delete(convd);
2136 if (val_list != NULL) {
2137 free((void *)val_list);
2139 if (len_list != NULL) {
2140 free((void *)len_list);
2143 return from_encoding;
2146 bool f_mb_parse_str(CStrRef encoded_string, VRefParam result /* = null */) {
2147 php_mb_encoding_handler_info_t info;
2148 info.data_type = PARSE_STRING;
2149 info.separator = ";&";
2150 info.force_register_globals = false;
2151 info.report_errors = 1;
2152 info.to_encoding = MBSTRG(current_internal_encoding);
2153 info.to_language = MBSTRG(current_language);
2154 info.from_encodings = MBSTRG(http_input_list);
2155 info.num_from_encodings = MBSTRG(http_input_list_size);
2156 info.from_language = MBSTRG(current_language);
2158 char *encstr = strndup(encoded_string.data(), encoded_string.size());
2159 mbfl_no_encoding detected =
2160 _php_mb_encoding_handler_ex(&info, result, encstr);
2161 free(encstr);
2163 MBSTRG(http_input_identify) = detected;
2164 return detected != mbfl_no_encoding_invalid;
2167 Variant f_mb_preferred_mime_name(CStrRef encoding) {
2168 mbfl_no_encoding no_encoding = mbfl_name2no_encoding(encoding.data());
2169 if (no_encoding == mbfl_no_encoding_invalid) {
2170 raise_warning("Unknown encoding \"%s\"", encoding.data());
2171 return false;
2174 const char *preferred_name = mbfl_no2preferred_mime_name(no_encoding);
2175 if (preferred_name == NULL || *preferred_name == '\0') {
2176 raise_warning("No MIME preferred name corresponding to \"%s\"",
2177 encoding.data());
2178 return false;
2181 return String(preferred_name, CopyString);
2184 static Variant php_mb_substr(CStrRef str, int from, int len,
2185 CStrRef encoding, bool substr) {
2186 mbfl_string string;
2187 mbfl_string_init(&string);
2188 string.no_language = MBSTRG(current_language);
2189 string.no_encoding = MBSTRG(current_internal_encoding);
2190 string.val = (unsigned char *)str.data();
2191 string.len = str.size();
2193 if (!encoding.empty()) {
2194 string.no_encoding = mbfl_name2no_encoding(encoding.data());
2195 if (string.no_encoding == mbfl_no_encoding_invalid) {
2196 raise_warning("Unknown encoding \"%s\"", encoding.data());
2197 return false;
2201 int size;
2202 if (substr) {
2203 size = mbfl_strlen(&string);
2204 } else {
2205 size = str.size();
2207 if (len == 0x7FFFFFFF) {
2208 len = size;
2211 /* if "from" position is negative, count start position from the end
2212 * of the string
2214 if (from < 0) {
2215 from = size + from;
2216 if (from < 0) {
2217 from = 0;
2221 /* if "length" position is negative, set it to the length
2222 * needed to stop that many chars from the end of the string
2224 if (len < 0) {
2225 len = (size - from) + len;
2226 if (len < 0) {
2227 len = 0;
2231 if (from > size) {
2232 if (!substr) {
2233 return false;
2235 from = size;
2238 mbfl_string result;
2239 mbfl_string *ret;
2240 if (substr) {
2241 ret = mbfl_substr(&string, &result, from, len);
2242 } else {
2243 ret = mbfl_strcut(&string, &result, from, len);
2245 if (ret != NULL) {
2246 return String((const char*)ret->val, ret->len, AttachString);
2248 return false;
2251 Variant f_mb_substr(CStrRef str, int start, int length /* = 0x7FFFFFFF */,
2252 CStrRef encoding /* = null_string */) {
2253 return php_mb_substr(str, start, length, encoding, true);
2256 Variant f_mb_strcut(CStrRef str, int start, int length /* = 0x7FFFFFFF */,
2257 CStrRef encoding /* = null_string */) {
2258 return php_mb_substr(str, start, length, encoding, false);
2261 Variant f_mb_strimwidth(CStrRef str, int start, int width,
2262 CStrRef trimmarker /* = null_string */,
2263 CStrRef encoding /* = null_string */) {
2264 mbfl_string string, result, marker, *ret;
2266 mbfl_string_init(&string);
2267 mbfl_string_init(&marker);
2268 string.no_language = MBSTRG(current_language);
2269 string.no_encoding = MBSTRG(current_internal_encoding);
2270 marker.no_language = MBSTRG(current_language);
2271 marker.no_encoding = MBSTRG(current_internal_encoding);
2272 marker.val = NULL;
2273 marker.len = 0;
2275 if (!encoding.empty()) {
2276 string.no_encoding = marker.no_encoding =
2277 mbfl_name2no_encoding(encoding.data());
2278 if (string.no_encoding == mbfl_no_encoding_invalid) {
2279 raise_warning("Unknown encoding \"%s\"", encoding.data());
2280 return false;
2284 string.val = (unsigned char *)str.data();
2285 string.len = str.size();
2287 if (start < 0 || start > str.size()) {
2288 raise_warning("Start position is out of reange");
2289 return false;
2292 if (width < 0) {
2293 raise_warning("Width is negative value");
2294 return false;
2297 marker.val = (unsigned char *)trimmarker.data();
2298 marker.len = trimmarker.size();
2300 ret = mbfl_strimwidth(&string, &marker, &result, start, width);
2301 if (ret != NULL) {
2302 return String((const char *)ret->val, ret->len, AttachString);
2304 return false;
2307 Variant f_mb_stripos(CStrRef haystack, CStrRef needle, int offset /* = 0 */,
2308 CStrRef encoding /* = null_string */) {
2309 const char *from_encoding;
2310 if (encoding.empty()) {
2311 from_encoding =
2312 mbfl_no2preferred_mime_name(MBSTRG(current_internal_encoding));
2313 } else {
2314 from_encoding = encoding.data();
2317 if (needle.empty()) {
2318 raise_warning("Empty delimiter");
2319 return false;
2322 int n = php_mb_stripos(0, haystack.data(), haystack.size(),
2323 needle.data(), needle.size(), offset, from_encoding);
2324 if (n >= 0) {
2325 return n;
2327 return false;
2330 Variant f_mb_strripos(CStrRef haystack, CStrRef needle, int offset /* = 0 */,
2331 CStrRef encoding /* = null_string */) {
2332 const char *from_encoding;
2333 if (encoding.empty()) {
2334 from_encoding =
2335 mbfl_no2preferred_mime_name(MBSTRG(current_internal_encoding));
2336 } else {
2337 from_encoding = encoding.data();
2340 int n = php_mb_stripos(1, haystack.data(), haystack.size(),
2341 needle.data(), needle.size(), offset, from_encoding);
2342 if (n >= 0) {
2343 return n;
2345 return false;
2348 Variant f_mb_stristr(CStrRef haystack, CStrRef needle, bool part /* = false */,
2349 CStrRef encoding /* = null_string */) {
2350 mbfl_string mbs_haystack;
2351 mbfl_string_init(&mbs_haystack);
2352 mbs_haystack.no_language = MBSTRG(current_language);
2353 mbs_haystack.no_encoding = MBSTRG(current_internal_encoding);
2354 mbs_haystack.val = (unsigned char *)haystack.data();
2355 mbs_haystack.len = haystack.size();
2357 mbfl_string mbs_needle;
2358 mbfl_string_init(&mbs_needle);
2359 mbs_needle.no_language = MBSTRG(current_language);
2360 mbs_needle.no_encoding = MBSTRG(current_internal_encoding);
2361 mbs_needle.val = (unsigned char *)needle.data();
2362 mbs_needle.len = needle.size();
2363 if (!mbs_needle.len) {
2364 raise_warning("Empty delimiter.");
2365 return false;
2368 const char *from_encoding;
2369 if (encoding.empty()) {
2370 from_encoding =
2371 mbfl_no2preferred_mime_name(MBSTRG(current_internal_encoding));
2372 } else {
2373 from_encoding = encoding.data();
2375 mbs_haystack.no_encoding = mbs_needle.no_encoding =
2376 mbfl_name2no_encoding(from_encoding);
2377 if (mbs_haystack.no_encoding == mbfl_no_encoding_invalid) {
2378 raise_warning("Unknown encoding \"%s\"", from_encoding);
2379 return false;
2382 int n = php_mb_stripos(0, (const char*)mbs_haystack.val, mbs_haystack.len,
2383 (const char *)mbs_needle.val, mbs_needle.len,
2384 0, from_encoding);
2385 if (n < 0) {
2386 return false;
2389 int mblen = mbfl_strlen(&mbs_haystack);
2390 mbfl_string result, *ret = NULL;
2391 if (part) {
2392 ret = mbfl_substr(&mbs_haystack, &result, 0, n);
2393 } else {
2394 int len = (mblen - n);
2395 ret = mbfl_substr(&mbs_haystack, &result, n, len);
2398 if (ret != NULL) {
2399 return String((const char*)ret->val, ret->len, AttachString);
2401 return false;
2404 Variant f_mb_strlen(CStrRef str, CStrRef encoding /* = null_string */) {
2405 mbfl_string string;
2406 mbfl_string_init(&string);
2407 string.val = (unsigned char *)str.data();
2408 string.len = str.size();
2409 string.no_language = MBSTRG(current_language);
2411 if (encoding.empty()) {
2412 string.no_encoding = MBSTRG(current_internal_encoding);
2413 } else {
2414 string.no_encoding = mbfl_name2no_encoding(encoding.data());
2415 if (string.no_encoding == mbfl_no_encoding_invalid) {
2416 raise_warning("Unknown encoding \"%s\"", encoding.data());
2417 return false;
2421 int n = mbfl_strlen(&string);
2422 if (n >= 0) {
2423 return n;
2425 return false;
2428 Variant f_mb_strpos(CStrRef haystack, CStrRef needle, int offset /* = 0 */,
2429 CStrRef encoding /* = null_string */) {
2430 mbfl_string mbs_haystack;
2431 mbfl_string_init(&mbs_haystack);
2432 mbs_haystack.no_language = MBSTRG(current_language);
2433 mbs_haystack.no_encoding = MBSTRG(current_internal_encoding);
2434 mbs_haystack.val = (unsigned char *)haystack.data();
2435 mbs_haystack.len = haystack.size();
2437 mbfl_string mbs_needle;
2438 mbfl_string_init(&mbs_needle);
2439 mbs_needle.no_language = MBSTRG(current_language);
2440 mbs_needle.no_encoding = MBSTRG(current_internal_encoding);
2441 mbs_needle.val = (unsigned char *)needle.data();
2442 mbs_needle.len = needle.size();
2444 if (!encoding.empty()) {
2445 mbs_haystack.no_encoding = mbs_needle.no_encoding =
2446 mbfl_name2no_encoding(encoding.data());
2447 if (mbs_haystack.no_encoding == mbfl_no_encoding_invalid) {
2448 raise_warning("Unknown encoding \"%s\"", encoding.data());
2449 return false;
2453 if (offset < 0 || offset > mbfl_strlen(&mbs_haystack)) {
2454 raise_warning("Offset not contained in string.");
2455 return false;
2457 if (mbs_needle.len == 0) {
2458 raise_warning("Empty delimiter.");
2459 return false;
2462 int reverse = 0;
2463 int n = mbfl_strpos(&mbs_haystack, &mbs_needle, offset, reverse);
2464 if (n >= 0) {
2465 return n;
2468 switch (-n) {
2469 case 1:
2470 break;
2471 case 2:
2472 raise_warning("Needle has not positive length.");
2473 break;
2474 case 4:
2475 raise_warning("Unknown encoding or conversion error.");
2476 break;
2477 case 8:
2478 raise_warning("Argument is empty.");
2479 break;
2480 default:
2481 raise_warning("Unknown error in mb_strpos.");
2482 break;
2484 return false;
2487 Variant f_mb_strrpos(CStrRef haystack, CStrRef needle,
2488 CVarRef offset /* = 0LL */,
2489 CStrRef encoding /* = null_string */) {
2490 mbfl_string mbs_haystack;
2491 mbfl_string_init(&mbs_haystack);
2492 mbs_haystack.no_language = MBSTRG(current_language);
2493 mbs_haystack.no_encoding = MBSTRG(current_internal_encoding);
2494 mbs_haystack.val = (unsigned char *)haystack.data();
2495 mbs_haystack.len = haystack.size();
2497 mbfl_string mbs_needle;
2498 mbfl_string_init(&mbs_needle);
2499 mbs_needle.no_language = MBSTRG(current_language);
2500 mbs_needle.no_encoding = MBSTRG(current_internal_encoding);
2501 mbs_needle.val = (unsigned char *)needle.data();
2502 mbs_needle.len = needle.size();
2504 const char *enc_name = encoding.data();
2505 long noffset = 0;
2506 String soffset = offset.toString();
2507 if (offset.isString()) {
2508 enc_name = soffset.data();
2510 int str_flg = 1;
2511 if (enc_name != NULL) {
2512 switch (*enc_name) {
2513 case '0': case '1': case '2': case '3': case '4':
2514 case '5': case '6': case '7': case '8': case '9':
2515 case ' ': case '-': case '.':
2516 break;
2517 default :
2518 str_flg = 0;
2519 break;
2522 if (str_flg) {
2523 noffset = offset.toInt32();
2524 enc_name = encoding.data();
2526 } else {
2527 noffset = offset.toInt32();
2530 if (!enc_name && !*enc_name) {
2531 mbs_haystack.no_encoding = mbs_needle.no_encoding =
2532 mbfl_name2no_encoding(enc_name);
2533 if (mbs_haystack.no_encoding == mbfl_no_encoding_invalid) {
2534 raise_warning("Unknown encoding \"%s\"", enc_name);
2535 return false;
2539 if (mbs_haystack.len <= 0) {
2540 return false;
2542 if (mbs_needle.len <= 0) {
2543 return false;
2546 if ((noffset > 0 && noffset > mbfl_strlen(&mbs_haystack)) ||
2547 (noffset < 0 && -noffset > mbfl_strlen(&mbs_haystack))) {
2548 raise_notice("Offset is greater than the length of haystack string");
2549 return false;
2552 int n = mbfl_strpos(&mbs_haystack, &mbs_needle, noffset, 1);
2553 if (n >= 0) {
2554 return n;
2556 return false;
2559 Variant f_mb_strrchr(CStrRef haystack, CStrRef needle, bool part /* = false */,
2560 CStrRef encoding /* = null_string */) {
2561 mbfl_string mbs_haystack;
2562 mbfl_string_init(&mbs_haystack);
2563 mbs_haystack.no_language = MBSTRG(current_language);
2564 mbs_haystack.no_encoding = MBSTRG(current_internal_encoding);
2565 mbs_haystack.val = (unsigned char *)haystack.data();
2566 mbs_haystack.len = haystack.size();
2568 mbfl_string mbs_needle;
2569 mbfl_string_init(&mbs_needle);
2570 mbs_needle.no_language = MBSTRG(current_language);
2571 mbs_needle.no_encoding = MBSTRG(current_internal_encoding);
2572 mbs_needle.val = (unsigned char *)needle.data();
2573 mbs_needle.len = needle.size();
2575 if (!encoding.empty()) {
2576 mbs_haystack.no_encoding = mbs_needle.no_encoding =
2577 mbfl_name2no_encoding(encoding.data());
2578 if (mbs_haystack.no_encoding == mbfl_no_encoding_invalid) {
2579 raise_warning("Unknown encoding \"%s\"", encoding.data());
2580 return false;
2584 if (mbs_haystack.len <= 0) {
2585 return false;
2587 if (mbs_needle.len <= 0) {
2588 return false;
2591 mbfl_string result, *ret = NULL;
2592 int n = mbfl_strpos(&mbs_haystack, &mbs_needle, 0, 1);
2593 if (n >= 0) {
2594 int mblen = mbfl_strlen(&mbs_haystack);
2595 if (part) {
2596 ret = mbfl_substr(&mbs_haystack, &result, 0, n);
2597 } else {
2598 int len = (mblen - n);
2599 ret = mbfl_substr(&mbs_haystack, &result, n, len);
2603 if (ret != NULL) {
2604 return String((const char*)ret->val, ret->len, AttachString);
2606 return false;
2609 Variant f_mb_strrichr(CStrRef haystack, CStrRef needle, bool part /* = false */,
2610 CStrRef encoding /* = null_string */) {
2611 mbfl_string mbs_haystack;
2612 mbfl_string_init(&mbs_haystack);
2613 mbs_haystack.no_language = MBSTRG(current_language);
2614 mbs_haystack.no_encoding = MBSTRG(current_internal_encoding);
2615 mbs_haystack.val = (unsigned char *)haystack.data();
2616 mbs_haystack.len = haystack.size();
2618 mbfl_string mbs_needle;
2619 mbfl_string_init(&mbs_needle);
2620 mbs_needle.no_language = MBSTRG(current_language);
2621 mbs_needle.no_encoding = MBSTRG(current_internal_encoding);
2622 mbs_needle.val = (unsigned char *)needle.data();
2623 mbs_needle.len = needle.size();
2625 const char *from_encoding;
2626 if (encoding.empty()) {
2627 from_encoding =
2628 mbfl_no2preferred_mime_name(MBSTRG(current_internal_encoding));
2629 } else {
2630 from_encoding = encoding.data();
2632 mbs_haystack.no_encoding = mbs_needle.no_encoding =
2633 mbfl_name2no_encoding(from_encoding);
2634 if (mbs_haystack.no_encoding == mbfl_no_encoding_invalid) {
2635 raise_warning("Unknown encoding \"%s\"", from_encoding);
2636 return false;
2639 int n = php_mb_stripos(1, (const char*)mbs_haystack.val, mbs_haystack.len,
2640 (const char*)mbs_needle.val, mbs_needle.len,
2641 0, from_encoding);
2642 if (n < 0) {
2643 return false;
2646 mbfl_string result, *ret = NULL;
2647 int mblen = mbfl_strlen(&mbs_haystack);
2648 if (part) {
2649 ret = mbfl_substr(&mbs_haystack, &result, 0, n);
2650 } else {
2651 int len = (mblen - n);
2652 ret = mbfl_substr(&mbs_haystack, &result, n, len);
2655 if (ret != NULL) {
2656 return String((const char*)ret->val, ret->len, AttachString);
2658 return false;
2661 Variant f_mb_strstr(CStrRef haystack, CStrRef needle, bool part /* = false */,
2662 CStrRef encoding /* = null_string */) {
2663 mbfl_string mbs_haystack;
2664 mbfl_string_init(&mbs_haystack);
2665 mbs_haystack.no_language = MBSTRG(current_language);
2666 mbs_haystack.no_encoding = MBSTRG(current_internal_encoding);
2667 mbs_haystack.val = (unsigned char *)haystack.data();
2668 mbs_haystack.len = haystack.size();
2670 mbfl_string mbs_needle;
2671 mbfl_string_init(&mbs_needle);
2672 mbs_needle.no_language = MBSTRG(current_language);
2673 mbs_needle.no_encoding = MBSTRG(current_internal_encoding);
2674 mbs_needle.val = (unsigned char *)needle.data();
2675 mbs_needle.len = needle.size();
2677 if (!encoding.empty()) {
2678 mbs_haystack.no_encoding = mbs_needle.no_encoding =
2679 mbfl_name2no_encoding(encoding.data());
2680 if (mbs_haystack.no_encoding == mbfl_no_encoding_invalid) {
2681 raise_warning("Unknown encoding \"%s\"", encoding.data());
2682 return false;
2686 if (mbs_needle.len <= 0) {
2687 raise_warning("Empty delimiter.");
2688 return false;
2691 mbfl_string result, *ret = NULL;
2692 int n = mbfl_strpos(&mbs_haystack, &mbs_needle, 0, 0);
2693 if (n >= 0) {
2694 int mblen = mbfl_strlen(&mbs_haystack);
2695 if (part) {
2696 ret = mbfl_substr(&mbs_haystack, &result, 0, n);
2697 } else {
2698 int len = (mblen - n);
2699 ret = mbfl_substr(&mbs_haystack, &result, n, len);
2703 if (ret != NULL) {
2704 return String((const char*)ret->val, ret->len, AttachString);
2706 return false;
2709 Variant f_mb_strtolower(CStrRef str, CStrRef encoding /* = null_string */) {
2710 const char *from_encoding;
2711 if (encoding.empty()) {
2712 from_encoding =
2713 mbfl_no2preferred_mime_name(MBSTRG(current_internal_encoding));
2714 } else {
2715 from_encoding = encoding.data();
2718 unsigned int ret_len;
2719 char *newstr = php_unicode_convert_case(PHP_UNICODE_CASE_LOWER,
2720 str.data(), str.size(),
2721 &ret_len, from_encoding);
2722 if (newstr) {
2723 return String(newstr, ret_len, AttachString);
2725 return false;
2728 Variant f_mb_strtoupper(CStrRef str, CStrRef encoding /* = null_string */) {
2729 const char *from_encoding;
2730 if (encoding.empty()) {
2731 from_encoding =
2732 mbfl_no2preferred_mime_name(MBSTRG(current_internal_encoding));
2733 } else {
2734 from_encoding = encoding.data();
2737 unsigned int ret_len;
2738 char *newstr = php_unicode_convert_case(PHP_UNICODE_CASE_UPPER,
2739 str.data(), str.size(),
2740 &ret_len, from_encoding);
2741 if (newstr) {
2742 return String(newstr, ret_len, AttachString);
2744 return false;
2747 Variant f_mb_strwidth(CStrRef str, CStrRef encoding /* = null_string */) {
2748 mbfl_string string;
2749 mbfl_string_init(&string);
2750 string.no_language = MBSTRG(current_language);
2751 string.no_encoding = MBSTRG(current_internal_encoding);
2752 string.val = (unsigned char *)str.data();
2753 string.len = str.size();
2755 if (!encoding.empty()) {
2756 string.no_encoding = mbfl_name2no_encoding(encoding.data());
2757 if (string.no_encoding == mbfl_no_encoding_invalid) {
2758 raise_warning("Unknown encoding \"%s\"", encoding.data());
2759 return false;
2763 int n = mbfl_strwidth(&string);
2764 if (n >= 0) {
2765 return n;
2767 return false;
2770 Variant f_mb_substitute_character(CVarRef substrchar /* = null_variant */) {
2771 if (substrchar.isNull()) {
2772 switch (MBSTRG(current_filter_illegal_mode)) {
2773 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE:
2774 return "none";
2775 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
2776 return "long";
2777 case MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY:
2778 return "entity";
2779 default:
2780 return MBSTRG(current_filter_illegal_substchar);
2784 if (substrchar.isString()) {
2785 String s = substrchar.toString();
2786 if (strcasecmp("none", s.data()) == 0) {
2787 MBSTRG(current_filter_illegal_mode) =
2788 MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
2789 return true;
2791 if (strcasecmp("long", s.data()) == 0) {
2792 MBSTRG(current_filter_illegal_mode) =
2793 MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
2794 return true;
2796 if (strcasecmp("entity", s.data()) == 0) {
2797 MBSTRG(current_filter_illegal_mode) =
2798 MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
2799 return true;
2803 int64_t n = substrchar.toInt64();
2804 if (n < 0xffff && n > 0) {
2805 MBSTRG(current_filter_illegal_mode) =
2806 MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
2807 MBSTRG(current_filter_illegal_substchar) = n;
2808 } else {
2809 raise_warning("Unknown character.");
2810 return false;
2812 return true;
2815 Variant f_mb_substr_count(CStrRef haystack, CStrRef needle,
2816 CStrRef encoding /* = null_string */) {
2817 mbfl_string mbs_haystack;
2818 mbfl_string_init(&mbs_haystack);
2819 mbs_haystack.no_language = MBSTRG(current_language);
2820 mbs_haystack.no_encoding = MBSTRG(current_internal_encoding);
2821 mbs_haystack.val = (unsigned char *)haystack.data();
2822 mbs_haystack.len = haystack.size();
2824 mbfl_string mbs_needle;
2825 mbfl_string_init(&mbs_needle);
2826 mbs_needle.no_language = MBSTRG(current_language);
2827 mbs_needle.no_encoding = MBSTRG(current_internal_encoding);
2828 mbs_needle.val = (unsigned char *)needle.data();
2829 mbs_needle.len = needle.size();
2831 if (!encoding.empty()) {
2832 mbs_haystack.no_encoding = mbs_needle.no_encoding =
2833 mbfl_name2no_encoding(encoding.data());
2834 if (mbs_haystack.no_encoding == mbfl_no_encoding_invalid) {
2835 raise_warning("Unknown encoding \"%s\"", encoding.data());
2836 return false;
2840 if (mbs_needle.len <= 0) {
2841 raise_warning("Empty substring.");
2842 return false;
2845 int n = mbfl_substr_count(&mbs_haystack, &mbs_needle);
2846 if (n >= 0) {
2847 return n;
2849 return false;
2852 ///////////////////////////////////////////////////////////////////////////////
2853 // regex helpers
2855 typedef struct _php_mb_regex_enc_name_map_t {
2856 const char *names;
2857 OnigEncoding code;
2858 } php_mb_regex_enc_name_map_t;
2860 static php_mb_regex_enc_name_map_t enc_name_map[] ={
2862 "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0",
2863 ONIG_ENCODING_EUC_JP
2866 "UTF-8\0UTF8\0",
2867 ONIG_ENCODING_UTF8
2870 "UTF-16\0UTF-16BE\0",
2871 ONIG_ENCODING_UTF16_BE
2874 "UTF-16LE\0",
2875 ONIG_ENCODING_UTF16_LE
2878 "UCS-4\0UTF-32\0UTF-32BE\0",
2879 ONIG_ENCODING_UTF32_BE
2882 "UCS-4LE\0UTF-32LE\0",
2883 ONIG_ENCODING_UTF32_LE
2886 "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0",
2887 ONIG_ENCODING_SJIS
2890 "BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0",
2891 ONIG_ENCODING_BIG5
2894 "EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0",
2895 ONIG_ENCODING_EUC_CN
2898 "EUC-TW\0EUCTW\0EUC_TW\0",
2899 ONIG_ENCODING_EUC_TW
2902 "EUC-KR\0EUCKR\0EUC_KR\0",
2903 ONIG_ENCODING_EUC_KR
2906 "KOI8R\0KOI8-R\0KOI-8R\0",
2907 ONIG_ENCODING_KOI8_R
2910 "ISO-8859-1\0ISO8859-1\0ISO_8859_1\0ISO8859_1\0",
2911 ONIG_ENCODING_ISO_8859_1
2914 "ISO-8859-2\0ISO8859-2\0ISO_8859_2\0ISO8859_2\0",
2915 ONIG_ENCODING_ISO_8859_2
2918 "ISO-8859-3\0ISO8859-3\0ISO_8859_3\0ISO8859_3\0",
2919 ONIG_ENCODING_ISO_8859_3
2922 "ISO-8859-4\0ISO8859-4\0ISO_8859_4\0ISO8859_4\0",
2923 ONIG_ENCODING_ISO_8859_4
2926 "ISO-8859-5\0ISO8859-5\0ISO_8859_5\0ISO8859_5\0",
2927 ONIG_ENCODING_ISO_8859_5
2930 "ISO-8859-6\0ISO8859-6\0ISO_8859_6\0ISO8859_6\0",
2931 ONIG_ENCODING_ISO_8859_6
2934 "ISO-8859-7\0ISO8859-7\0ISO_8859_7\0ISO8859_7\0",
2935 ONIG_ENCODING_ISO_8859_7
2938 "ISO-8859-8\0ISO8859-8\0ISO_8859_8\0ISO8859_8\0",
2939 ONIG_ENCODING_ISO_8859_8
2942 "ISO-8859-9\0ISO8859-9\0ISO_8859_9\0ISO8859_9\0",
2943 ONIG_ENCODING_ISO_8859_9
2946 "ISO-8859-10\0ISO8859-10\0ISO_8859_10\0ISO8859_10\0",
2947 ONIG_ENCODING_ISO_8859_10
2950 "ISO-8859-11\0ISO8859-11\0ISO_8859_11\0ISO8859_11\0",
2951 ONIG_ENCODING_ISO_8859_11
2954 "ISO-8859-13\0ISO8859-13\0ISO_8859_13\0ISO8859_13\0",
2955 ONIG_ENCODING_ISO_8859_13
2958 "ISO-8859-14\0ISO8859-14\0ISO_8859_14\0ISO8859_14\0",
2959 ONIG_ENCODING_ISO_8859_14
2962 "ISO-8859-15\0ISO8859-15\0ISO_8859_15\0ISO8859_15\0",
2963 ONIG_ENCODING_ISO_8859_15
2966 "ISO-8859-16\0ISO8859-16\0ISO_8859_16\0ISO8859_16\0",
2967 ONIG_ENCODING_ISO_8859_16
2970 "ASCII\0US-ASCII\0US_ASCII\0ISO646\0",
2971 ONIG_ENCODING_ASCII
2973 { NULL, ONIG_ENCODING_UNDEF }
2976 static OnigEncoding php_mb_regex_name2mbctype(const char *pname) {
2977 const char *p;
2978 php_mb_regex_enc_name_map_t *mapping;
2980 if (pname == NULL) {
2981 return ONIG_ENCODING_UNDEF;
2984 for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
2985 for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) {
2986 if (strcasecmp(p, pname) == 0) {
2987 return mapping->code;
2992 return ONIG_ENCODING_UNDEF;
2995 static const char *php_mb_regex_mbctype2name(OnigEncoding mbctype) {
2996 php_mb_regex_enc_name_map_t *mapping;
2998 for (mapping = enc_name_map; mapping->names != NULL; mapping++) {
2999 if (mapping->code == mbctype) {
3000 return mapping->names;
3004 return NULL;
3008 * regex cache
3010 static php_mb_regex_t *php_mbregex_compile_pattern(CStrRef pattern,
3011 OnigOptionType options,
3012 OnigEncoding enc,
3013 OnigSyntaxType *syntax) {
3014 int err_code = 0;
3015 OnigErrorInfo err_info;
3016 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
3017 php_mb_regex_t *rc = NULL;
3019 std::string spattern = std::string(pattern.data(), pattern.size());
3020 RegexCache &cache = MBSTRG(ht_rc);
3021 RegexCache::const_iterator it =
3022 cache.find(spattern);
3023 if (it != cache.end()) {
3024 rc = it->second;
3027 if (!rc || rc->options != options || rc->enc != enc ||
3028 rc->syntax != syntax) {
3029 if (rc) {
3030 onig_free(rc);
3031 rc = NULL;
3033 if ((err_code = onig_new(&rc, (OnigUChar *)pattern.data(),
3034 (OnigUChar *)(pattern.data() + pattern.size()),
3035 options,enc, syntax, &err_info)) != ONIG_NORMAL) {
3036 onig_error_code_to_str(err_str, err_code, err_info);
3037 raise_warning("mbregex compile err: %s", err_str);
3038 return NULL;
3040 MBSTRG(ht_rc)[spattern] = rc;
3042 return rc;
3045 static size_t _php_mb_regex_get_option_string(char *str, size_t len,
3046 OnigOptionType option,
3047 OnigSyntaxType *syntax) {
3048 size_t len_left = len;
3049 size_t len_req = 0;
3050 char *p = str;
3051 char c;
3053 if ((option & ONIG_OPTION_IGNORECASE) != 0) {
3054 if (len_left > 0) {
3055 --len_left;
3056 *(p++) = 'i';
3058 ++len_req;
3061 if ((option & ONIG_OPTION_EXTEND) != 0) {
3062 if (len_left > 0) {
3063 --len_left;
3064 *(p++) = 'x';
3066 ++len_req;
3069 if ((option & (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) ==
3070 (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) {
3071 if (len_left > 0) {
3072 --len_left;
3073 *(p++) = 'p';
3075 ++len_req;
3076 } else {
3077 if ((option & ONIG_OPTION_MULTILINE) != 0) {
3078 if (len_left > 0) {
3079 --len_left;
3080 *(p++) = 'm';
3082 ++len_req;
3085 if ((option & ONIG_OPTION_SINGLELINE) != 0) {
3086 if (len_left > 0) {
3087 --len_left;
3088 *(p++) = 's';
3090 ++len_req;
3093 if ((option & ONIG_OPTION_FIND_LONGEST) != 0) {
3094 if (len_left > 0) {
3095 --len_left;
3096 *(p++) = 'l';
3098 ++len_req;
3100 if ((option & ONIG_OPTION_FIND_NOT_EMPTY) != 0) {
3101 if (len_left > 0) {
3102 --len_left;
3103 *(p++) = 'n';
3105 ++len_req;
3108 c = 0;
3110 if (syntax == ONIG_SYNTAX_JAVA) {
3111 c = 'j';
3112 } else if (syntax == ONIG_SYNTAX_GNU_REGEX) {
3113 c = 'u';
3114 } else if (syntax == ONIG_SYNTAX_GREP) {
3115 c = 'g';
3116 } else if (syntax == ONIG_SYNTAX_EMACS) {
3117 c = 'c';
3118 } else if (syntax == ONIG_SYNTAX_RUBY) {
3119 c = 'r';
3120 } else if (syntax == ONIG_SYNTAX_PERL) {
3121 c = 'z';
3122 } else if (syntax == ONIG_SYNTAX_POSIX_BASIC) {
3123 c = 'b';
3124 } else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) {
3125 c = 'd';
3128 if (c != 0) {
3129 if (len_left > 0) {
3130 --len_left;
3131 *(p++) = c;
3133 ++len_req;
3136 if (len_left > 0) {
3137 --len_left;
3138 *(p++) = '\0';
3140 ++len_req;
3141 if (len < len_req) {
3142 return len_req;
3145 return 0;
3148 static void _php_mb_regex_init_options(const char *parg, int narg,
3149 OnigOptionType *option,
3150 OnigSyntaxType **syntax, int *eval) {
3151 int n;
3152 char c;
3153 int optm = 0;
3155 *syntax = ONIG_SYNTAX_RUBY;
3156 if (parg != NULL) {
3157 n = 0;
3158 while (n < narg) {
3159 c = parg[n++];
3160 switch (c) {
3161 case 'i': optm |= ONIG_OPTION_IGNORECASE; break;
3162 case 'x': optm |= ONIG_OPTION_EXTEND; break;
3163 case 'm': optm |= ONIG_OPTION_MULTILINE; break;
3164 case 's': optm |= ONIG_OPTION_SINGLELINE; break;
3165 case 'p': optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE; break;
3166 case 'l': optm |= ONIG_OPTION_FIND_LONGEST; break;
3167 case 'n': optm |= ONIG_OPTION_FIND_NOT_EMPTY; break;
3168 case 'j': *syntax = ONIG_SYNTAX_JAVA; break;
3169 case 'u': *syntax = ONIG_SYNTAX_GNU_REGEX; break;
3170 case 'g': *syntax = ONIG_SYNTAX_GREP; break;
3171 case 'c': *syntax = ONIG_SYNTAX_EMACS; break;
3172 case 'r': *syntax = ONIG_SYNTAX_RUBY; break;
3173 case 'z': *syntax = ONIG_SYNTAX_PERL; break;
3174 case 'b': *syntax = ONIG_SYNTAX_POSIX_BASIC; break;
3175 case 'd': *syntax = ONIG_SYNTAX_POSIX_EXTENDED; break;
3176 case 'e':
3177 if (eval != NULL) *eval = 1;
3178 break;
3179 default:
3180 break;
3183 if (option != NULL) *option|=optm;
3187 ///////////////////////////////////////////////////////////////////////////////
3188 // regex functions
3190 bool f_mb_ereg_match(CStrRef pattern, CStrRef str,
3191 CStrRef option /* = null_string */) {
3192 OnigSyntaxType *syntax;
3193 OnigOptionType noption = 0;
3194 if (!option.empty()) {
3195 _php_mb_regex_init_options(option.data(), option.size(), &noption,
3196 &syntax, NULL);
3197 } else {
3198 noption |= MBSTRG(regex_default_options);
3199 syntax = MBSTRG(regex_default_syntax);
3202 php_mb_regex_t *re;
3203 if ((re = php_mbregex_compile_pattern
3204 (pattern, noption, MBSTRG(current_mbctype), syntax)) == NULL) {
3205 return false;
3208 /* match */
3209 int err = onig_match(re, (OnigUChar *)str.data(),
3210 (OnigUChar *)(str.data() + str.size()),
3211 (OnigUChar *)str.data(), NULL, 0);
3212 return err >= 0;
3215 static Variant _php_mb_regex_ereg_replace_exec(CVarRef pattern,
3216 CStrRef replacement,
3217 CStrRef str,
3218 CStrRef option,
3219 OnigOptionType options) {
3220 const char *p;
3221 php_mb_regex_t *re;
3222 OnigSyntaxType *syntax;
3223 OnigRegion *regs = NULL;
3224 StringBuffer out_buf;
3225 int i, err, eval, n;
3226 OnigUChar *pos;
3227 OnigUChar *string_lim;
3228 char pat_buf[2];
3230 const mbfl_encoding *enc;
3233 const char *current_enc_name;
3234 current_enc_name = php_mb_regex_mbctype2name(MBSTRG(current_mbctype));
3235 if (current_enc_name == NULL ||
3236 (enc = mbfl_name2encoding(current_enc_name)) == NULL) {
3237 raise_warning("Unknown error");
3238 return false;
3241 eval = 0;
3243 if (!option.empty()) {
3244 _php_mb_regex_init_options(option.data(), option.size(),
3245 &options, &syntax, &eval);
3246 } else {
3247 options |= MBSTRG(regex_default_options);
3248 syntax = MBSTRG(regex_default_syntax);
3252 String spattern;
3253 if (pattern.isString()) {
3254 spattern = pattern.toString();
3255 } else {
3256 /* FIXME: this code is not multibyte aware! */
3257 pat_buf[0] = pattern.toByte();
3258 pat_buf[1] = '\0';
3259 spattern = String(pat_buf, 1, CopyString);
3261 /* create regex pattern buffer */
3262 re = php_mbregex_compile_pattern(spattern, options,
3263 MBSTRG(current_mbctype), syntax);
3264 if (re == NULL) {
3265 return false;
3268 if (eval) {
3269 throw NotSupportedException("ereg_replace", "dynamic coding");
3272 /* do the actual work */
3273 err = 0;
3274 pos = (OnigUChar*)str.data();
3275 string_lim = (OnigUChar*)(str.data() + str.size());
3276 regs = onig_region_new();
3277 while (err >= 0) {
3278 err = onig_search(re, (OnigUChar *)str.data(), (OnigUChar *)string_lim,
3279 pos, (OnigUChar *)string_lim, regs, 0);
3280 if (err <= -2) {
3281 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
3282 onig_error_code_to_str(err_str, err);
3283 raise_warning("mbregex search failure: %s", err_str);
3284 break;
3286 if (err >= 0) {
3287 #if moriyoshi_0
3288 if (regs->beg[0] == regs->end[0]) {
3289 raise_warning("Empty regular expression");
3290 break;
3292 #endif
3293 /* copy the part of the string before the match */
3294 out_buf.append((const char *)pos,
3295 (OnigUChar *)(str.data() + regs->beg[0]) - pos);
3296 /* copy replacement and backrefs */
3297 i = 0;
3298 p = replacement.data();
3299 while (i < replacement.size()) {
3300 int fwd = (int)php_mb_mbchar_bytes_ex(p, enc);
3301 n = -1;
3302 if ((replacement.size() - i) >= 2 && fwd == 1 &&
3303 p[0] == '\\' && p[1] >= '0' && p[1] <= '9') {
3304 n = p[1] - '0';
3306 if (n >= 0 && n < regs->num_regs) {
3307 if (regs->beg[n] >= 0 && regs->beg[n] < regs->end[n] &&
3308 regs->end[n] <= str.size()) {
3309 out_buf.append(str.data() + regs->beg[n],
3310 regs->end[n] - regs->beg[n]);
3312 p += 2;
3313 i += 2;
3314 } else {
3315 out_buf.append(p, fwd);
3316 p += fwd;
3317 i += fwd;
3320 n = regs->end[0];
3321 if ((pos - (OnigUChar *)str.data()) < n) {
3322 pos = (OnigUChar *)(str.data() + n);
3323 } else {
3324 if (pos < string_lim) {
3325 out_buf.append((const char *)pos, 1);
3327 pos++;
3329 } else { /* nomatch */
3330 /* stick that last bit of string on our output */
3331 if (string_lim - pos > 0) {
3332 out_buf.append((const char *)pos, string_lim - pos);
3335 onig_region_free(regs, 0);
3338 if (regs != NULL) {
3339 onig_region_free(regs, 1);
3342 if (err <= -2) {
3343 return false;
3345 return out_buf.detach();
3348 Variant f_mb_ereg_replace(CVarRef pattern, CStrRef replacement, CStrRef str,
3349 CStrRef option /* = null_string */) {
3350 return _php_mb_regex_ereg_replace_exec(pattern, replacement,
3351 str, option, 0);
3354 Variant f_mb_eregi_replace(CVarRef pattern, CStrRef replacement, CStrRef str,
3355 CStrRef option /* = null_string */) {
3356 return _php_mb_regex_ereg_replace_exec(pattern, replacement,
3357 str, option, ONIG_OPTION_IGNORECASE);
3360 int64_t f_mb_ereg_search_getpos() {
3361 return MBSTRG(search_pos);
3364 bool f_mb_ereg_search_setpos(int position) {
3365 if (position < 0 || position >= (int)MBSTRG(search_str).size()) {
3366 raise_warning("Position is out of range");
3367 MBSTRG(search_pos) = 0;
3368 return false;
3370 MBSTRG(search_pos) = position;
3371 return true;
3374 Variant f_mb_ereg_search_getregs() {
3375 OnigRegion *search_regs = MBSTRG(search_regs);
3376 if (search_regs && !MBSTRG(search_str).empty()) {
3377 Array ret;
3378 OnigUChar *str = (OnigUChar *)MBSTRG(search_str).data();
3379 int len = MBSTRG(search_str).size();
3380 int n = search_regs->num_regs;
3381 for (int i = 0; i < n; i++) {
3382 int beg = search_regs->beg[i];
3383 int end = search_regs->end[i];
3384 if (beg >= 0 && beg <= end && end <= len) {
3385 ret.append(String((const char *)(str + beg), end - beg, CopyString));
3386 } else {
3387 ret.append(false);
3390 return ret;
3392 return false;
3395 bool f_mb_ereg_search_init(CStrRef str, CStrRef pattern /* = null_string */,
3396 CStrRef option /* = null_string */) {
3397 OnigOptionType noption = MBSTRG(regex_default_options);
3398 OnigSyntaxType *syntax = MBSTRG(regex_default_syntax);
3399 if (!option.empty()) {
3400 noption = 0;
3401 _php_mb_regex_init_options(option.data(), option.size(),
3402 &noption, &syntax, NULL);
3404 if (!pattern.empty()) {
3405 if ((MBSTRG(search_re) = php_mbregex_compile_pattern
3406 (pattern, noption, MBSTRG(current_mbctype), syntax)) == NULL) {
3407 return false;
3411 MBSTRG(search_str) = std::string(str.data(), str.size());
3412 MBSTRG(search_pos) = 0;
3414 if (MBSTRG(search_regs) != NULL) {
3415 onig_region_free(MBSTRG(search_regs), 1);
3416 MBSTRG(search_regs) = (OnigRegion *)NULL;
3418 return true;
3421 /* regex search */
3422 static Variant _php_mb_regex_ereg_search_exec(CStrRef pattern, CStrRef option,
3423 int mode) {
3424 int n, i, err, pos, len, beg, end;
3425 OnigUChar *str;
3426 OnigSyntaxType *syntax = NULL;
3427 OnigOptionType noption;
3429 noption = MBSTRG(regex_default_options);
3430 if (!option.empty()) {
3431 noption = 0;
3432 _php_mb_regex_init_options(option.data(), option.size(),
3433 &noption, &syntax, NULL);
3435 if (!pattern.empty()) {
3436 if ((MBSTRG(search_re) = php_mbregex_compile_pattern
3437 (pattern, noption, MBSTRG(current_mbctype), syntax)) == NULL) {
3438 return false;
3442 pos = MBSTRG(search_pos);
3443 str = NULL;
3444 len = 0;
3445 if (!MBSTRG(search_str).empty()) {
3446 str = (OnigUChar *)MBSTRG(search_str).data();
3447 len = MBSTRG(search_str).size();
3450 if (MBSTRG(search_re) == NULL) {
3451 raise_warning("No regex given");
3452 return false;
3455 if (str == NULL) {
3456 raise_warning("No string given");
3457 return false;
3460 if (MBSTRG(search_regs)) {
3461 onig_region_free(MBSTRG(search_regs), 1);
3463 MBSTRG(search_regs) = onig_region_new();
3465 err = onig_search(MBSTRG(search_re), str, str + len, str + pos, str + len,
3466 MBSTRG(search_regs), 0);
3467 Variant ret;
3468 if (err == ONIG_MISMATCH) {
3469 MBSTRG(search_pos) = len;
3470 ret = false;
3471 } else if (err <= -2) {
3472 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
3473 onig_error_code_to_str(err_str, err);
3474 raise_warning("mbregex search failure in mbregex_search(): %s", err_str);
3475 ret = false;
3476 } else {
3477 if (MBSTRG(search_regs)->beg[0] == MBSTRG(search_regs)->end[0]) {
3478 raise_warning("Empty regular expression");
3480 switch (mode) {
3481 case 1:
3483 beg = MBSTRG(search_regs)->beg[0];
3484 end = MBSTRG(search_regs)->end[0];
3485 ret.append(beg);
3486 ret.append(end - beg);
3488 break;
3489 case 2:
3490 n = MBSTRG(search_regs)->num_regs;
3491 for (i = 0; i < n; i++) {
3492 beg = MBSTRG(search_regs)->beg[i];
3493 end = MBSTRG(search_regs)->end[i];
3494 if (beg >= 0 && beg <= end && end <= len) {
3495 ret.append(String((const char *)(str + beg), end - beg, CopyString));
3496 } else {
3497 ret.append(false);
3500 break;
3501 default:
3502 ret = true;
3503 break;
3505 end = MBSTRG(search_regs)->end[0];
3506 if (pos < end) {
3507 MBSTRG(search_pos) = end;
3508 } else {
3509 MBSTRG(search_pos) = pos + 1;
3513 if (err < 0) {
3514 onig_region_free(MBSTRG(search_regs), 1);
3515 MBSTRG(search_regs) = (OnigRegion *)NULL;
3517 return ret;
3520 Variant f_mb_ereg_search(CStrRef pattern /* = null_string */,
3521 CStrRef option /* = null_string */) {
3522 return _php_mb_regex_ereg_search_exec(pattern, option, 0);
3525 Variant f_mb_ereg_search_pos(CStrRef pattern /* = null_string */,
3526 CStrRef option /* = null_string */) {
3527 return _php_mb_regex_ereg_search_exec(pattern, option, 1);
3530 Variant f_mb_ereg_search_regs(CStrRef pattern /* = null_string */,
3531 CStrRef option /* = null_string */) {
3532 return _php_mb_regex_ereg_search_exec(pattern, option, 2);
3535 static Variant _php_mb_regex_ereg_exec(CVarRef pattern, CStrRef str,
3536 Variant &regs, int icase) {
3537 php_mb_regex_t *re;
3538 OnigRegion *regions = NULL;
3539 int i, match_len, beg, end;
3540 OnigOptionType options;
3542 options = MBSTRG(regex_default_options);
3543 if (icase) {
3544 options |= ONIG_OPTION_IGNORECASE;
3547 /* compile the regular expression from the supplied regex */
3548 String spattern;
3549 if (!pattern.isString()) {
3550 /* we convert numbers to integers and treat them as a string */
3551 if (pattern.is(KindOfDouble)) {
3552 spattern = String(pattern.toInt64()); /* get rid of decimal places */
3553 } else {
3554 spattern = pattern.toString();
3556 } else {
3557 spattern = pattern.toString();
3559 re = php_mbregex_compile_pattern(spattern, options, MBSTRG(current_mbctype),
3560 MBSTRG(regex_default_syntax));
3561 if (re == NULL) {
3562 return false;
3565 regions = onig_region_new();
3567 /* actually execute the regular expression */
3568 if (onig_search(re, (OnigUChar *)str.data(),
3569 (OnigUChar *)(str.data() + str.size()),
3570 (OnigUChar *)str.data(),
3571 (OnigUChar *)(str.data() + str.size()),
3572 regions, 0) < 0) {
3573 onig_region_free(regions, 1);
3574 return false;
3577 const char *s = str.data();
3578 int string_len = str.size();
3579 match_len = regions->end[0] - regions->beg[0];
3580 regs = Array::Create();
3581 for (i = 0; i < regions->num_regs; i++) {
3582 beg = regions->beg[i];
3583 end = regions->end[i];
3584 if (beg >= 0 && beg < end && end <= string_len) {
3585 regs.append(String(s + beg, end - beg, CopyString));
3586 } else {
3587 regs.append(false);
3591 if (match_len == 0) {
3592 match_len = 1;
3594 if (regions != NULL) {
3595 onig_region_free(regions, 1);
3597 return match_len;
3600 Variant f_mb_ereg(CVarRef pattern, CStrRef str, VRefParam regs /* = null */) {
3601 return _php_mb_regex_ereg_exec(pattern, str, regs, 0);
3604 Variant f_mb_eregi(CVarRef pattern, CStrRef str, VRefParam regs /* = null */) {
3605 return _php_mb_regex_ereg_exec(pattern, str, regs, 1);
3608 Variant f_mb_regex_encoding(CStrRef encoding /* = null_string */) {
3609 if (encoding.empty()) {
3610 const char *retval = php_mb_regex_mbctype2name(MBSTRG(current_mbctype));
3611 if (retval != NULL) {
3612 return String(retval, CopyString);
3614 return false;
3617 OnigEncoding mbctype = php_mb_regex_name2mbctype(encoding.data());
3618 if (mbctype == ONIG_ENCODING_UNDEF) {
3619 raise_warning("Unknown encoding \"%s\"", encoding.data());
3620 return false;
3623 MBSTRG(current_mbctype) = mbctype;
3624 return true;
3627 static void php_mb_regex_set_options(OnigOptionType options,
3628 OnigSyntaxType *syntax,
3629 OnigOptionType *prev_options,
3630 OnigSyntaxType **prev_syntax) {
3631 if (prev_options != NULL) {
3632 *prev_options = MBSTRG(regex_default_options);
3634 if (prev_syntax != NULL) {
3635 *prev_syntax = MBSTRG(regex_default_syntax);
3637 MBSTRG(regex_default_options) = options;
3638 MBSTRG(regex_default_syntax) = syntax;
3641 String f_mb_regex_set_options(CStrRef options /* = null_string */) {
3642 OnigOptionType opt;
3643 OnigSyntaxType *syntax;
3644 char buf[16];
3646 if (!options.empty()) {
3647 opt = 0;
3648 syntax = NULL;
3649 _php_mb_regex_init_options(options.data(), options.size(),
3650 &opt, &syntax, NULL);
3651 php_mb_regex_set_options(opt, syntax, NULL, NULL);
3652 } else {
3653 opt = MBSTRG(regex_default_options);
3654 syntax = MBSTRG(regex_default_syntax);
3656 _php_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax);
3657 return String(buf, CopyString);
3660 Variant f_mb_split(CStrRef pattern, CStrRef str, int count /* = -1 */) {
3661 php_mb_regex_t *re;
3662 OnigRegion *regs = NULL;
3664 int n, err;
3665 if (count == 0) {
3666 count = 1;
3669 /* create regex pattern buffer */
3670 if ((re = php_mbregex_compile_pattern(pattern,
3671 MBSTRG(regex_default_options),
3672 MBSTRG(current_mbctype),
3673 MBSTRG(regex_default_syntax)))
3674 == NULL) {
3675 return false;
3678 Array ret;
3679 OnigUChar *pos0 = (OnigUChar *)str.data();
3680 OnigUChar *pos_end = (OnigUChar *)(str.data() + str.size());
3681 OnigUChar *pos = pos0;
3682 err = 0;
3683 regs = onig_region_new();
3684 /* churn through str, generating array entries as we go */
3685 while ((--count != 0) &&
3686 (err = onig_search(re, pos0, pos_end, pos, pos_end, regs, 0)) >= 0) {
3687 if (regs->beg[0] == regs->end[0]) {
3688 raise_warning("Empty regular expression");
3689 break;
3692 /* add it to the array */
3693 if (regs->beg[0] < str.size() && regs->beg[0] >= (pos - pos0)) {
3694 ret.append(String((const char *)pos,
3695 ((OnigUChar *)(str.data() + regs->beg[0]) - pos),
3696 CopyString));
3697 } else {
3698 err = -2;
3699 break;
3701 /* point at our new starting point */
3702 n = regs->end[0];
3703 if ((pos - pos0) < n) {
3704 pos = pos0 + n;
3706 if (count < 0) {
3707 count = 0;
3709 onig_region_free(regs, 0);
3712 onig_region_free(regs, 1);
3714 /* see if we encountered an error */
3715 if (err <= -2) {
3716 OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN];
3717 onig_error_code_to_str(err_str, err);
3718 raise_warning("mbregex search failure in mbsplit(): %s", err_str);
3719 return false;
3722 /* otherwise we just have one last element to add to the array */
3723 n = pos_end - pos;
3724 if (n > 0) {
3725 ret.append(String((const char *)pos, n, CopyString));
3726 } else {
3727 ret.append("");
3729 return ret;
3732 ///////////////////////////////////////////////////////////////////////////////
/*
 * Skips a folded-header separator: CRLF followed by at least one space or
 * tab. If `str[pos]` starts such a separator, `pos` is moved onto its last
 * whitespace character and the enclosing loop is continued, so the separator
 * is left untouched by whatever the loop body does next.
 *
 * `str` must be NUL-terminated and `pos` must be a modifiable lvalue.
 * The expansion contains a bare `continue`, so the macro may only be used
 * directly inside a loop body. Both arguments are evaluated several times.
 */
#define SKIP_LONG_HEADER_SEP_MBSTRING(str, pos)                         \
  if ((str)[(pos)] == '\r' && (str)[(pos) + 1] == '\n' &&               \
      ((str)[(pos) + 2] == ' ' || (str)[(pos) + 2] == '\t')) {          \
    (pos) += 2;                                                         \
    while ((str)[(pos) + 1] == ' ' || (str)[(pos) + 1] == '\t') {       \
      (pos)++;                                                          \
    }                                                                   \
    continue;                                                           \
  }
3744 static int _php_mbstr_parse_mail_headers(Array &ht, const char *str,
3745 size_t str_len) {
3746 const char *ps;
3747 size_t icnt;
3748 int state = 0;
3749 int crlf_state = -1;
3751 StringBuffer token;
3752 String fld_name, fld_val;
3754 ps = str;
3755 icnt = str_len;
3758 * C o n t e n t - T y p e : t e x t / h t m l \r\n
3759 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
3760 * state 0 1 2 3
3762 * C o n t e n t - T y p e : t e x t / h t m l \r\n
3763 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
3764 * crlf_state -1 0 1 -1
3768 while (icnt > 0) {
3769 switch (*ps) {
3770 case ':':
3771 if (crlf_state == 1) {
3772 token.append('\r');
3775 if (state == 0 || state == 1) {
3776 fld_name = token.detach();
3778 state = 2;
3779 } else {
3780 token.append(*ps);
3783 crlf_state = 0;
3784 break;
3786 case '\n':
3787 if (crlf_state == -1) {
3788 goto out;
3790 crlf_state = -1;
3791 break;
3793 case '\r':
3794 if (crlf_state == 1) {
3795 token.append('\r');
3796 } else {
3797 crlf_state = 1;
3799 break;
3801 case ' ': case '\t':
3802 if (crlf_state == -1) {
3803 if (state == 3) {
3804 /* continuing from the previous line */
3805 state = 4;
3806 } else {
3807 /* simply skipping this new line */
3808 state = 5;
3810 } else {
3811 if (crlf_state == 1) {
3812 token.append('\r');
3814 if (state == 1 || state == 3) {
3815 token.append(*ps);
3818 crlf_state = 0;
3819 break;
3821 default:
3822 switch (state) {
3823 case 0:
3824 token.clear();
3825 state = 1;
3826 break;
3828 case 2:
3829 if (crlf_state != -1) {
3830 token.clear();
3831 state = 3;
3832 break;
3834 /* break is missing intentionally */
3836 case 3:
3837 if (crlf_state == -1) {
3838 fld_val = token.detach();
3839 if (!fld_name.empty() && !fld_val.empty()) {
3840 /* FIXME: some locale free implementation is
3841 * really required here,,, */
3842 ht.set(StringUtil::ToUpper(fld_name), fld_val);
3844 state = 1;
3846 break;
3848 case 4:
3849 token.append(' ');
3850 state = 3;
3851 break;
3854 if (crlf_state == 1) {
3855 token.append('\r');
3858 token.append(*ps);
3860 crlf_state = 0;
3861 break;
3863 ps++, icnt--;
3865 out:
3866 if (state == 2) {
3867 token.clear();
3868 state = 3;
3870 if (state == 3) {
3871 fld_val = token.detach();
3872 if (!fld_name.empty() && !fld_val.empty()) {
3873 /* FIXME: some locale free implementation is
3874 * really required here,,, */
3875 ht.set(StringUtil::ToUpper(fld_name), fld_val);
3878 return state;
/**
 * Pipes one message into the sendmail binary.
 *
 * The command is "/usr/sbin/sendmail -t -i", followed by `extra_cmd` when it
 * is non-NULL. "To:" and "Subject:" lines are written first, then `headers`
 * (if non-NULL), an empty line and the message body.
 *
 * Returns 1 when the value returned by pclose() is an accepted one, 0
 * otherwise or when the program could not be started (a warning is raised).
 */
static int php_mail(const char *to, const char *subject, const char *message,
                    const char *headers, const char *extra_cmd) {
  const char *sendmail_path = "/usr/sbin/sendmail -t -i";

  String command_line = sendmail_path;
  if (extra_cmd != NULL) {
    command_line += " ";
    command_line += extra_cmd;
  }

  /* Since popen() doesn't indicate if the internal fork() doesn't work
   * (e.g. the shell can't be executed) we explicitly set it to 0 to be
   * sure we don't catch any older errno value. */
  errno = 0;
  FILE *mta = popen(command_line.data(), "w");
  if (mta == NULL) {
    raise_warning("Could not execute mail delivery program '%s'",
                  sendmail_path);
    return 0;
  }

  if (errno == EACCES) {
    raise_warning("Permission denied: unable to execute shell to run "
                  "mail delivery binary '%s'", sendmail_path);
    pclose(mta);
    return 0;
  }

  fprintf(mta, "To: %s\n", to);
  fprintf(mta, "Subject: %s\n", subject);
  if (headers != NULL) {
    fprintf(mta, "%s\n", headers);
  }
  fprintf(mta, "\n%s\n", message);

  int status = pclose(mta);
#if defined(EX_TEMPFAIL)
  const bool accepted = (status == EX_OK) || (status == EX_TEMPFAIL);
#elif defined(EX_OK)
  const bool accepted = (status == EX_OK);
#else
  const bool accepted = (status == 0);
#endif
  return accepted ? 1 : 0;
}
3925 bool f_mb_send_mail(CStrRef to, CStrRef subject, CStrRef message,
3926 CStrRef headers /* = null_string */,
3927 CStrRef extra_cmd /* = null_string */) {
3928 /* initialize */
3929 /* automatic allocateable buffer for additional header */
3930 mbfl_memory_device device;
3931 mbfl_memory_device_init(&device, 0, 0);
3932 mbfl_string orig_str, conv_str;
3933 mbfl_string_init(&orig_str);
3934 mbfl_string_init(&conv_str);
3936 /* character-set, transfer-encoding */
3937 mbfl_no_encoding
3938 tran_cs, /* transfar text charset */
3939 head_enc, /* header transfar encoding */
3940 body_enc; /* body transfar encoding */
3941 tran_cs = mbfl_no_encoding_utf8;
3942 head_enc = mbfl_no_encoding_base64;
3943 body_enc = mbfl_no_encoding_base64;
3944 const mbfl_language *lang = mbfl_no2language(MBSTRG(current_language));
3945 if (lang != NULL) {
3946 tran_cs = lang->mail_charset;
3947 head_enc = lang->mail_header_encoding;
3948 body_enc = lang->mail_body_encoding;
3951 Array ht_headers;
3952 if (!headers.empty()) {
3953 _php_mbstr_parse_mail_headers(ht_headers, headers.data(), headers.size());
3956 struct {
3957 int cnt_type:1;
3958 int cnt_trans_enc:1;
3959 } suppressed_hdrs = { 0, 0 };
3961 static const StaticString s_CONTENT_TYPE("CONTENT-TYPE");
3962 String s = ht_headers[s_CONTENT_TYPE].toString();
3963 if (!s.isNull()) {
3964 char *tmp;
3965 char *param_name;
3966 char *charset = NULL;
3968 char *p = const_cast<char*>(strchr(s.data(), ';'));
3969 if (p != NULL) {
3970 /* skipping the padded spaces */
3971 do {
3972 ++p;
3973 } while (*p == ' ' || *p == '\t');
3975 if (*p != '\0') {
3976 if ((param_name = strtok_r(p, "= ", &tmp)) != NULL) {
3977 if (strcasecmp(param_name, "charset") == 0) {
3978 mbfl_no_encoding _tran_cs = tran_cs;
3980 charset = strtok_r(NULL, "= ", &tmp);
3981 if (charset != NULL) {
3982 _tran_cs = mbfl_name2no_encoding(charset);
3985 if (_tran_cs == mbfl_no_encoding_invalid) {
3986 raise_warning("Unsupported charset \"%s\" - "
3987 "will be regarded as ascii", charset);
3988 _tran_cs = mbfl_no_encoding_ascii;
3990 tran_cs = _tran_cs;
3995 suppressed_hdrs.cnt_type = 1;
3998 static const StaticString
3999 s_CONTENT_TRANSFER_ENCODING("CONTENT-TRANSFER-ENCODING");
4000 s = ht_headers[s_CONTENT_TRANSFER_ENCODING];
4001 if (!s.isNull()) {
4002 mbfl_no_encoding _body_enc = mbfl_name2no_encoding(s.data());
4003 switch (_body_enc) {
4004 case mbfl_no_encoding_base64:
4005 case mbfl_no_encoding_7bit:
4006 case mbfl_no_encoding_8bit:
4007 body_enc = _body_enc;
4008 break;
4010 default:
4011 raise_warning("Unsupported transfer encoding \"%s\" - "
4012 "will be regarded as 8bit", s.data());
4013 body_enc = mbfl_no_encoding_8bit;
4014 break;
4016 suppressed_hdrs.cnt_trans_enc = 1;
4019 /* To: */
4020 char *to_r = NULL;
4021 int err = 0;
4022 if (!to.empty()) {
4023 int to_len = to.size();
4024 if (to_len > 0) {
4025 to_r = strndup(to.data(), to_len);
4026 for (; to_len; to_len--) {
4027 if (!isspace((unsigned char)to_r[to_len - 1])) {
4028 break;
4030 to_r[to_len - 1] = '\0';
4032 for (int i = 0; to_r[i]; i++) {
4033 if (iscntrl((unsigned char)to_r[i])) {
4035 * According to RFC 822, section 3.1.1 long headers may be
4036 * separated into parts using CRLF followed at least one
4037 * linear-white-space character ('\t' or ' ').
4038 * To prevent these separators from being replaced with a space,
4039 * we use the SKIP_LONG_HEADER_SEP_MBSTRING to skip over them.
4041 SKIP_LONG_HEADER_SEP_MBSTRING(to_r, i);
4042 to_r[i] = ' ';
4045 } else {
4046 to_r = (char*)to.data();
4048 } else {
4049 raise_warning("Missing To: field");
4050 err = 1;
4053 /* Subject: */
4054 String encoded_subject;
4055 if (!subject.isNull()) {
4056 orig_str.no_language = MBSTRG(current_language);
4057 orig_str.val = (unsigned char *)subject.data();
4058 orig_str.len = subject.size();
4059 orig_str.no_encoding = MBSTRG(current_internal_encoding);
4060 if (orig_str.no_encoding == mbfl_no_encoding_invalid
4061 || orig_str.no_encoding == mbfl_no_encoding_pass) {
4062 orig_str.no_encoding = mbfl_identify_encoding_no
4063 (&orig_str, MBSTRG(current_detect_order_list),
4064 MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4066 mbfl_string *pstr = mbfl_mime_header_encode
4067 (&orig_str, &conv_str, tran_cs, head_enc,
4068 "\n", sizeof("Subject: [PHP-jp nnnnnnnn]"));
4069 if (pstr != NULL) {
4070 encoded_subject = String((const char *)pstr->val, pstr->len,
4071 AttachString);
4073 } else {
4074 raise_warning("Missing Subject: field");
4075 err = 1;
4078 /* message body */
4079 String encoded_message;
4080 if (!message.empty()) {
4081 orig_str.no_language = MBSTRG(current_language);
4082 orig_str.val = (unsigned char*)message.data();
4083 orig_str.len = message.size();
4084 orig_str.no_encoding = MBSTRG(current_internal_encoding);
4086 if (orig_str.no_encoding == mbfl_no_encoding_invalid
4087 || orig_str.no_encoding == mbfl_no_encoding_pass) {
4088 orig_str.no_encoding = mbfl_identify_encoding_no
4089 (&orig_str, MBSTRG(current_detect_order_list),
4090 MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4093 mbfl_string *pstr = NULL;
4095 mbfl_string tmpstr;
4096 if (mbfl_convert_encoding(&orig_str, &tmpstr, tran_cs) != NULL) {
4097 tmpstr.no_encoding = mbfl_no_encoding_8bit;
4098 pstr = mbfl_convert_encoding(&tmpstr, &conv_str, body_enc);
4099 free(tmpstr.val);
4102 if (pstr != NULL) {
4103 encoded_message = String((const char *)pstr->val, pstr->len,
4104 AttachString);
4106 } else {
4107 /* this is not really an error, so it is allowed. */
4108 raise_warning("Empty message body");
4111 /* other headers */
4112 #define PHP_MBSTR_MAIL_MIME_HEADER1 "Mime-Version: 1.0"
4113 #define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4114 #define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4115 #define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4116 if (!headers.empty()) {
4117 const char *p = headers.data();
4118 int n = headers.size();
4119 mbfl_memory_device_strncat(&device, p, n);
4120 if (n > 0 && p[n - 1] != '\n') {
4121 mbfl_memory_device_strncat(&device, "\n", 1);
4125 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER1,
4126 sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4127 mbfl_memory_device_strncat(&device, "\n", 1);
4129 if (!suppressed_hdrs.cnt_type) {
4130 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER2,
4131 sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4133 char *p = (char *)mbfl_no2preferred_mime_name(tran_cs);
4134 if (p != NULL) {
4135 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER3,
4136 sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4137 mbfl_memory_device_strcat(&device, p);
4139 mbfl_memory_device_strncat(&device, "\n", 1);
4141 if (!suppressed_hdrs.cnt_trans_enc) {
4142 mbfl_memory_device_strncat(&device, PHP_MBSTR_MAIL_MIME_HEADER4,
4143 sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4144 const char *p = (char *)mbfl_no2preferred_mime_name(body_enc);
4145 if (p == NULL) {
4146 p = "7bit";
4148 mbfl_memory_device_strcat(&device, p);
4149 mbfl_memory_device_strncat(&device, "\n", 1);
4152 mbfl_memory_device_unput(&device);
4153 mbfl_memory_device_output('\0', &device);
4155 char *all_headers = (char *)device.buffer;
4157 String cmd = f_escapeshellcmd(extra_cmd);
4158 bool ret = (!err && php_mail(to_r, encoded_subject.data(),
4159 encoded_message.data(),
4160 all_headers, cmd.data()));
4161 mbfl_memory_device_clear(&device);
4162 return ret;
4165 ///////////////////////////////////////////////////////////////////////////////