2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1997-2010 The PHP Group |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 3.01 of the PHP license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.php.net/license/3_01.txt |
12 | If you did not receive a copy of the PHP license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@php.net so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #include <runtime/ext/ext_icu_ucnv.h>
19 #include <runtime/vm/translator/translator-inline.h>
22 ///////////////////////////////////////////////////////////////////////////////
// Helper macros that each define one int64 constant from an ICU enumerator:
//   UCNV_REASON_CONST(v) defines q_UConverter$$REASON_<v> with value UCNV_<v>
//   UCNV_TYPE_CONST(v)   defines q_UConverter$$<v>        with value UCNV_<v>
// Neither expansion's terminating ';' depends on the call site supplying one.
24 #define UCNV_REASON_CONST(v) \
25 const int64 q_UConverter$$REASON_ ## v = UCNV_ ## v ;
26 #define UCNV_TYPE_CONST(v) \
27 const int64 q_UConverter$$ ## v = UCNV_ ## v ;
// Callback-reason constants: one q_UConverter$$REASON_<name> per UCNV_<name>
// reason value (the same six names are handled by ti_reasontext()).
29 UCNV_REASON_CONST(UNASSIGNED
);
30 UCNV_REASON_CONST(ILLEGAL
);
31 UCNV_REASON_CONST(IRREGULAR
);
32 UCNV_REASON_CONST(RESET
);
33 UCNV_REASON_CONST(CLOSE
);
34 UCNV_REASON_CONST(CLONE
);
// Converter-type constants: one q_UConverter$$<name> per UCNV_<name> type
// value (the kind of value returned by ucnv_getType() further down).
36 UCNV_TYPE_CONST(UNSUPPORTED_CONVERTER
);
37 UCNV_TYPE_CONST(SBCS
);
38 UCNV_TYPE_CONST(DBCS
);
39 UCNV_TYPE_CONST(MBCS
);
40 UCNV_TYPE_CONST(LATIN_1
);
41 UCNV_TYPE_CONST(UTF8
);
42 UCNV_TYPE_CONST(UTF16_BigEndian
);
43 UCNV_TYPE_CONST(UTF16_LittleEndian
);
44 UCNV_TYPE_CONST(UTF32_BigEndian
);
45 UCNV_TYPE_CONST(UTF32_LittleEndian
);
46 UCNV_TYPE_CONST(EBCDIC_STATEFUL
);
47 UCNV_TYPE_CONST(ISO_2022
);
48 UCNV_TYPE_CONST(LMBCS_1
);
49 UCNV_TYPE_CONST(LMBCS_2
);
50 UCNV_TYPE_CONST(LMBCS_3
);
51 UCNV_TYPE_CONST(LMBCS_4
);
52 UCNV_TYPE_CONST(LMBCS_5
);
53 UCNV_TYPE_CONST(LMBCS_6
);
54 UCNV_TYPE_CONST(LMBCS_8
);
55 UCNV_TYPE_CONST(LMBCS_11
);
56 UCNV_TYPE_CONST(LMBCS_16
);
57 UCNV_TYPE_CONST(LMBCS_17
);
58 UCNV_TYPE_CONST(LMBCS_18
);
59 UCNV_TYPE_CONST(LMBCS_19
);
60 UCNV_TYPE_CONST(LMBCS_LAST
);
62 UCNV_TYPE_CONST(SCSU
);
63 UCNV_TYPE_CONST(ISCII
);
64 UCNV_TYPE_CONST(US_ASCII
);
65 UCNV_TYPE_CONST(UTF7
);
66 UCNV_TYPE_CONST(BOCU1
);
67 UCNV_TYPE_CONST(UTF16
);
68 UCNV_TYPE_CONST(UTF32
);
69 UCNV_TYPE_CONST(CESU8
);
70 UCNV_TYPE_CONST(IMAP_MAILBOX
);
// Method names passed to o_invoke() by ucnvToUCallback() and
// ucnvFromUCallback() respectively.
72 static StaticString
s_toUCallback("toUCallback");
73 static StaticString
s_fromUCallback("fromUCallback");
// Calls throwFailure(uerr, "<fname>", ierr), stringifying `fname` so the
// reported message names the function given at the call site.
// NOTE(review): the expansion already ends in ';', so `THROW_UFAILURE(...);`
// produces an extra empty statement — would break an unbraced if/else.
75 #define THROW_UFAILURE(fname, uerr, ierr) throwFailure(uerr, #fname, ierr);
// Constructor: forwards `cb` to ExtObjectData, starts with no source or
// destination converter (both pointers NULL) and a cleared error record
// (code U_ZERO_ERROR, empty message).
// NOTE(review): the closing brace is not visible in this listing.
77 c_UConverter::c_UConverter(const ObjectStaticCallbacks
*cb
)
78 : ExtObjectData(cb
), m_src(NULL
), m_dest(NULL
) {
79 m_error
.code
= U_ZERO_ERROR
;
80 m_error
.custom_error_message
= "";
// Destructor with an empty body.
// NOTE(review): m_src / m_dest are not released here; presumably that
// happens elsewhere (e.g. t___destruct) — confirm.
83 c_UConverter::~c_UConverter() { }
// Records a failure: formats "<fname>() returned error <code>: <ICU name>"
// into `message` and stores a copy in merror.custom_error_message.
// The format string appends "()" itself, so `fname` is expected to be a bare
// function name.
// NOTE(review): the third parameter and the declaration of `message` are not
// visible in this listing; `merror` is presumably that third parameter.
85 void c_UConverter::throwFailure(UErrorCode error
, const char *fname
,
88 snprintf(message
, sizeof(message
), "%s() returned error %ld: %s",
89 fname
, (long)error
, u_errorName(error
));
91 merror
.custom_error_message
= String((const char*)message
, CopyString
);
// UConverter::__construct: sets the destination converter (m_dest) from
// toEncoding and the source converter (m_src) from fromEncoding via
// setEncoding(), passing m_error as the error record.
// The bool results of both setEncoding() calls are ignored in the visible
// lines.
94 void c_UConverter::t___construct(CStrRef toEncoding
, CStrRef fromEncoding
) {
95 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::__construct
);
96 setEncoding(toEncoding
, &m_dest
, m_error
);
97 setEncoding(fromEncoding
, &m_src
, m_error
);
// UConverter::__destruct.
// NOTE(review): only the signature and the injection macro are visible in
// this listing; the body is not shown.
102 Variant
c_UConverter::t___destruct() {
103 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::__destruct
);
114 /* get/set source/dest encodings */
// Calls checkLimits() with the room left in the callback's target buffer
// (args->targetLimit - args->target, i.e. a count of target elements) and the
// number of elements `len` the caller wants to write.
116 #define TARGET_CHECK(args, len) \
117 checkLimits(args->targetLimit - args->target, len)
// Checks that `needed` output elements fit into `available`; when they do
// not, reports U_BUFFER_OVERFLOW_ERROR into m_error via THROW_UFAILURE.
// NOTE(review): the return statements are not visible in this listing.
// NOTE(review): the reported name "appendUTarget" matches no function in
// this file (appendToUTarget / appendFromUTarget) — confirm intended text.
118 bool c_UConverter::checkLimits(int64_t available
, int64_t needed
) {
119 if (needed
> available
) {
120 THROW_UFAILURE(appendUTarget
, U_BUFFER_OVERFLOW_ERROR
, m_error
);
// Appends a toUCallback() return value to the UChar target buffer in `args`.
// Handles integers, strings and arrays; the final THROW_UFAILURE reports
// U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): several lines (branch conditions, returns, closing braces,
// the declaration of `c`) are not visible in this listing.
126 void c_UConverter::appendToUTarget(Variant val
,
127 UConverterToUnicodeArgs
*args
) {
// Integer: treated as a code point; anything outside 0..0x10FFFF is rejected.
132 if (val
.isInteger()) {
133 int64_t lval
= val
.toInt64();
134 if (lval
< 0 || lval
> 0x10FFFF) {
135 THROW_UFAILURE(appendToUTarget
, U_ILLEGAL_ARGUMENT_ERROR
, m_error
);
// Surrogate-pair path (needs two free UChars): lead = 0xD800 | high 10 bits,
// trail = 0xDC00 | low 10 bits of (lval - 0x10000). The condition selecting
// this path is not visible in this listing.
139 if (TARGET_CHECK(args
, 2)) {
140 *(args
->target
++) = (UChar
)(((lval
- 0x10000) >> 10) | 0xD800);
141 *(args
->target
++) = (UChar
)(((lval
- 0x10000) & 0x3FF) | 0xDC00);
// Single-unit path: lval is stored as one UChar.
145 if (TARGET_CHECK(args
, 1)) {
146 *(args
->target
++) = (UChar
)lval
;
// String: decoded as UTF-8 with U8_NEXT, one target UChar per decoded value.
// NOTE(review): `c` is written into a single UChar slot; if it is a UChar32,
// values above 0xFFFF (and the negative value U8_NEXT yields for malformed
// input) would be truncated rather than encoded/rejected — confirm.
// NOTE(review): strval points into the result of val.toString(); presumably
// `val` keeps that data alive for the duration of the loop — confirm.
150 if (val
.isString()) {
151 const char *strval
= val
.toString().data();
152 int32_t i
= 0, strlen
= val
.toString().size();
153 while((i
!= strlen
) && TARGET_CHECK(args
, 1)) {
155 U8_NEXT(strval
, i
, strlen
, c
);
156 *(args
->target
++) = c
;
// Array: each element is appended recursively (keys are ignored).
161 for(ArrayIter
it(val
.toArray()); it
; ++it
) {
162 appendToUTarget(it
.second(), args
);
// NOTE(review): reported name is "appendToTarget" here but "appendToUTarget"
// in the range check above — confirm which text is intended.
166 THROW_UFAILURE(appendToTarget
, U_ILLEGAL_ARGUMENT_ERROR
, m_error
);
// ICU to-Unicode callback (registered in setCallback() with the object as
// context). Invokes objval's "toUCallback" method with
//   (reason, remaining source bytes, offending code units, error by ref),
// copies the possibly-updated error back to *pErrorCode, and appends the
// method's return value to the target buffer via appendToUTarget().
// NOTE(review): the closing of the o_invoke() call, the `else`, and the last
// throwFailure() argument are not visible in this listing.
169 void c_UConverter::ucnvToUCallback(c_UConverter
*objval
,
170 UConverterToUnicodeArgs
*args
,
171 const char *codeUnits
, int32_t length
,
172 UConverterCallbackReason reason
,
173 UErrorCode
*pErrorCode
) {
// Copy of the input from the current source position to the source limit.
174 String
source(args
->source
, args
->sourceLimit
- args
->source
, CopyString
);
175 VRefParam
errRef((long)*pErrorCode
);
176 Variant ret
= objval
->o_invoke(s_toUCallback
, CREATE_VECTOR4(
177 reason
, source
, String(codeUnits
, length
, CopyString
), ref(errRef
)
// Only an integer left in the by-ref error argument is accepted as the new
// error code.
179 if (errRef
.is(KindOfInt64
)) {
180 *pErrorCode
= (UErrorCode
)errRef
.toInt64();
// NOTE(review): throwFailure() formats its name argument as "%s()", so the
// literal "ucnvToUCallback()" yields "ucnvToUCallback()()" in the message.
182 throwFailure(U_ILLEGAL_ARGUMENT_ERROR
, "ucnvToUCallback()",
185 objval
->appendToUTarget(ret
, args
);
// Appends a fromUCallback() return value to the byte target buffer in `args`.
// Handles integers, strings and arrays; the final THROW_UFAILURE reports
// U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): returns and closing braces are not visible in this listing.
188 void c_UConverter::appendFromUTarget(Variant val
,
189 UConverterFromUnicodeArgs
*args
) {
// Integer: must be a single byte value (0..255); written as one char.
194 if (val
.isInteger()) {
195 int64_t lval
= val
.toInt64();
196 if (lval
< 0 || lval
> 255) {
197 THROW_UFAILURE(appendFromUTarget
, U_ILLEGAL_ARGUMENT_ERROR
, m_error
);
200 if (TARGET_CHECK(args
, 1)) {
201 *(args
->target
++) = (char)lval
;
// String: copied verbatim into the target when the whole string fits.
205 if (val
.isString()) {
206 int32_t strlen
= val
.toString().size();
207 if (TARGET_CHECK(args
, strlen
)) {
208 memcpy(args
->target
, val
.toString().data(), strlen
);
209 args
->target
+= strlen
;
// Array: each element is appended recursively (keys are ignored).
214 for(ArrayIter
it(val
.toArray()); it
; ++it
) {
215 appendFromUTarget(it
.second(), args
);
219 THROW_UFAILURE(appendFromUTarget
, U_ILLEGAL_ARGUMENT_ERROR
, m_error
);
// ICU from-Unicode callback (registered in setCallback() with the object as
// context). Invokes objval's "fromUCallback" method with
//   (reason, array of code points from codeUnits, codePoint, error by ref),
// copies the possibly-updated error back to *pErrorCode, and appends the
// method's return value to the target buffer via appendFromUTarget().
// NOTE(review): the `codePoint` parameter, the declaration of `c`, the
// closing of the o_invoke() call, the `else`, and the last throwFailure()
// argument are not visible in this listing.
222 void c_UConverter::ucnvFromUCallback(c_UConverter
*objval
,
223 UConverterFromUnicodeArgs
*args
,
224 const UChar
*codeUnits
, int32_t length
,
226 UConverterCallbackReason reason
,
227 UErrorCode
*pErrorCode
) {
228 Array source
= Array::Create();
// NOTE(review): U16_NEXT already advances `i` past the code point it reads,
// and the loop header increments `i` again, so a code unit appears to be
// skipped after every decoded code point — confirm and drop the `i++`.
229 for(int i
= 0; i
< length
; i
++) {
231 U16_NEXT(codeUnits
, i
, length
, c
);
232 source
.append((int64_t)c
);
234 VRefParam
errRef((int64_t)*pErrorCode
);
235 Variant ret
= objval
->o_invoke(s_fromUCallback
, CREATE_VECTOR4(
236 reason
, source
, (int64_t)codePoint
, ref(errRef
)
// Only an integer left in the by-ref error argument is accepted as the new
// error code.
238 if (errRef
.is(KindOfInt64
)) {
239 *pErrorCode
= (UErrorCode
)errRef
.toInt64();
// NOTE(review): throwFailure() formats its name argument as "%s()", so the
// literal "ucnvFromUCallback()" yields "ucnvFromUCallback()()" in the message.
241 throwFailure(U_ILLEGAL_ARGUMENT_ERROR
, "ucnvFromUCallback()",
244 objval
->appendFromUTarget(ret
, args
);
// Installs ucnvToUCallback / ucnvFromUCallback on `cnv`, passing `this` as
// the callback context; a failing ICU call is reported into m_error.
// NOTE(review): the body of the class-name check, the returns and the closing
// braces are not visible in this listing. Presumably the check skips the
// installation when the object is exactly "UConverter" (no user subclass
// overriding the callbacks) — confirm.
247 bool c_UConverter::setCallback(UConverter
*cnv
) {
248 if (o_getClassName().get()->isame(String("UConverter").get())) {
252 UErrorCode error
= U_ZERO_ERROR
;
253 ucnv_setToUCallBack(cnv
, (UConverterToUCallback
)ucnvToUCallback
,
254 (const void*)this, NULL
, NULL
, &error
);
255 if (U_FAILURE(error
)) {
256 THROW_UFAILURE(ucnv_setToUCallback
, error
, m_error
);
// Reset before the second registration so a stale code is not re-reported.
260 error
= U_ZERO_ERROR
;
261 ucnv_setFromUCallBack(cnv
, (UConverterFromUCallback
)ucnvFromUCallback
,
262 (const void*)this, NULL
, NULL
, &error
);
263 if (U_FAILURE(error
)) {
264 THROW_UFAILURE(ucnv_setFromUCallback
, error
, m_error
);
// Opens an ICU converter for `encoding` with ucnv_open().
//  - U_AMBIGUOUS_ALIAS_WARNING: raises a warning naming the converter ICU
//    actually picked ("(unknown)" if ucnv_getName() itself fails).
//  - any failure code: reported via THROW_UFAILURE into `err`.
// NOTE(review): the third parameter (presumably `err`), the code that stores
// `cnv` through `pcnv`, the returns and the closing braces are not visible in
// this listing.
272 bool c_UConverter::setEncoding(CStrRef encoding
, UConverter
**pcnv
,
274 UErrorCode error
= U_ZERO_ERROR
;
275 UConverter
*cnv
= ucnv_open(encoding
.data(), &error
);
277 if (error
== U_AMBIGUOUS_ALIAS_WARNING
) {
278 UErrorCode getname_error
= U_ZERO_ERROR
;
279 const char *actual_encoding
= ucnv_getName(cnv
, &getname_error
);
280 if (U_FAILURE(getname_error
)) {
281 actual_encoding
= "(unknown)";
283 raise_warning("Ambiguous encoding specified, using %s", actual_encoding
);
284 } else if (U_FAILURE(error
)) {
285 THROW_UFAILURE(ucnv_open
, error
, err
);
// UConverter::setSourceEncoding: re-targets m_src to `encoding` through
// setEncoding(), with m_error as the error record. The bool result of
// setEncoding() is ignored in the visible lines.
297 void c_UConverter::t_setsourceencoding(CStrRef encoding
) {
298 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::setsourceencoding
);
299 setEncoding(encoding
, &m_src
, m_error
);
// UConverter::setDestinationEncoding: re-targets m_dest to `encoding` through
// setEncoding(), with m_error as the error record. The bool result of
// setEncoding() is ignored in the visible lines.
302 void c_UConverter::t_setdestinationencoding(CStrRef encoding
) {
303 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::setdestinationencoding
);
304 setEncoding(encoding
, &m_dest
, m_error
);
// UConverter::getSourceEncoding: asks ICU (ucnv_getName) for the name of the
// source converter m_src; an ICU failure is reported into m_error.
// NOTE(review): the lines between the injection macro and the ICU call, and
// the return statements, are not visible in this listing.
307 String
c_UConverter::t_getsourceencoding() {
308 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::getsourceencoding
);
313 UErrorCode error
= U_ZERO_ERROR
;
314 const char *name
= ucnv_getName(m_src
, &error
);
315 if (U_FAILURE(error
)) {
316 THROW_UFAILURE(ucnv_getName
, error
, m_error
);
// UConverter::getDestinationEncoding: asks ICU (ucnv_getName) for the name of
// the destination converter m_dest; an ICU failure is reported into m_error.
// NOTE(review): the lines between the injection macro and the ICU call, and
// the return statements, are not visible in this listing.
323 String
c_UConverter::t_getdestinationencoding() {
324 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::getdestinationencoding
);
329 UErrorCode error
= U_ZERO_ERROR
;
330 const char *name
= ucnv_getName(m_dest
, &error
);
331 if (U_FAILURE(error
)) {
332 THROW_UFAILURE(ucnv_getName
, error
, m_error
);
339 /* Get algorithmic types */
// UConverter::getSourceType: returns ucnv_getType(m_src), or
// UCNV_UNSUPPORTED_CONVERTER on an earlier guarded path.
// NOTE(review): the guard condition is not visible in this listing;
// presumably it tests for a missing m_src — confirm.
341 int64
c_UConverter::t_getsourcetype() {
342 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::getsourcetype
);
344 return UCNV_UNSUPPORTED_CONVERTER
;
347 return ucnv_getType(m_src
);
// UConverter::getDestinationType: returns ucnv_getType(m_dest), or
// UCNV_UNSUPPORTED_CONVERTER on an earlier guarded path.
// NOTE(review): the guard condition is not visible in this listing;
// presumably it tests for a missing m_dest — confirm.
350 int64
c_UConverter::t_getdestinationtype() {
351 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::getdestinationtype
);
353 return UCNV_UNSUPPORTED_CONVERTER
;
356 return ucnv_getType(m_dest
);
359 /* Basic substitution */
// Sets the substitution bytes of converter `cnv` to the contents of `chars`
// via ucnv_setSubstChars(); an ICU failure is reported into `err`.
// NOTE(review): the third parameter (presumably `err`), the returns and the
// closing braces are not visible in this listing.
361 bool c_UConverter::setSubstChars(String chars
, UConverter
*cnv
,
363 UErrorCode error
= U_ZERO_ERROR
;
364 ucnv_setSubstChars(cnv
, chars
.data(), chars
.size(), &error
);
365 if (U_FAILURE(error
)) {
366 THROW_UFAILURE(ucnv_setSubstChars
, error
, err
);
// UConverter::setSubstChars: applies `chars` to the destination converter and
// then to the source converter. Because of the short-circuiting &&, m_src is
// left untouched when setting m_dest fails; returns true only if both succeed.
372 bool c_UConverter::t_setsubstchars(CStrRef chars
) {
373 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::setsubstchars
);
374 return setSubstChars(chars
, m_dest
, m_error
) &&
375 setSubstChars(chars
, m_src
, m_error
);
// UConverter::getSubstChars: reads the substitution bytes of the *source*
// converter (m_src) with ucnv_getSubstChars() and returns them as a copied
// String; an ICU failure is reported into m_error.
// NOTE(review): the declarations of `chars` / `chars_len` and the failure
// return are not visible in this listing.
378 String
c_UConverter::t_getsubstchars() {
379 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::getsubstchars
);
380 UErrorCode error
= U_ZERO_ERROR
;
384 ucnv_getSubstChars(m_src
, chars
, &chars_len
, &error
);
385 if (U_FAILURE(error
)) {
386 THROW_UFAILURE(ucnv_getSubstChars
, error
, m_error
);
390 return String(chars
, chars_len
, CopyString
);
// Shared default behaviour for t_fromucallback() / t_toucallback(): for the
// handled reasons it clears the by-ref `error` to U_ZERO_ERROR and returns the
// current substitution characters.
// NOTE(review): the switch header, any case labels other than
// UCNV_UNASSIGNED, and the default path are not visible in this listing.
395 Variant
c_UConverter::defaultCallback(int64 reason
, VRefParam error
) {
397 case UCNV_UNASSIGNED
:
400 error
= U_ZERO_ERROR
;
401 return t_getsubstchars();
// UConverter::fromUCallback: delegates to defaultCallback(reason, error);
// `source` and `codepoint` are not used in the visible lines.
// NOTE(review): the declaration of the `error` parameter is not visible in
// this listing.
407 Variant
c_UConverter::t_fromucallback(int64 reason
,
408 CArrRef source
, int64 codepoint
,
410 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::fromucallback
);
411 return defaultCallback(reason
, error
);
// UConverter::toUCallback: delegates to defaultCallback(reason, error);
// `source` and `codeunits` are not used in the visible lines.
// NOTE(review): the declaration of the `error` parameter is not visible in
// this listing.
414 Variant
c_UConverter::t_toucallback(int64 reason
,
415 CStrRef source
, CStrRef codeunits
,
417 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::toucallback
);
418 return defaultCallback(reason
, error
);
421 /* Main workhorse functions */
// UConverter::convert: converts `str` with doConvert(str, toCnv, fromCnv, err).
//   reverse == false: from m_src to m_dest
//   reverse == true : from m_dest to m_src
// Errors are recorded in m_error.
423 Variant
c_UConverter::t_convert(CStrRef str
, bool reverse
) {
424 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::convert
);
425 SYNC_VM_REGS_SCOPED();
426 return doConvert(str
, reverse
? m_src
: m_dest
,
427 reverse
? m_dest
: m_src
, m_error
);
// Converts `str` from fromCnv's encoding to toCnv's encoding in two stages,
// using UChar (UTF-16) as the pivot:
//   1. ucnv_toUChars()   : source bytes -> UChar buffer
//   2. ucnv_fromUChars() : UChar buffer -> destination bytes
// Each stage is called twice: first with a NULL/0 buffer to obtain the
// required length (U_BUFFER_OVERFLOW_ERROR is tolerated on that call), then
// with a buffer of that size. Any other failure is reported into `err`.
// NOTE(review): the fourth parameter (presumably `err`), the failure returns
// and the closing braces are not visible in this listing.
430 String
c_UConverter::doConvert(CStrRef str
,
431 UConverter
*toCnv
, UConverter
*fromCnv
,
433 UErrorCode error
= U_ZERO_ERROR
;
// Both converters must have been opened before converting.
435 if (!fromCnv
|| !toCnv
) {
436 err
.code
= U_INVALID_STATE_ERROR
;
437 err
.custom_error_message
= "Internal converters not initialized";
441 /* Convert to UChar pivot encoding */
// Sizing call: no output buffer, returns the number of UChars required.
442 int32_t temp_len
= ucnv_toUChars(fromCnv
, NULL
, 0,
443 str
.c_str(), str
.size(), &error
);
444 if (U_FAILURE(error
) && error
!= U_BUFFER_OVERFLOW_ERROR
) {
445 THROW_UFAILURE(ucnv_toUChars
, error
, err
);
448 // Explicitly include the space for a \u0000 UChar since String
449 // only allocates one extra byte (not the 2 needed)
450 String
tempStr(sizeof(UChar
) * (temp_len
+ 1), ReserveString
);
451 UChar
*temp
= (UChar
*) tempStr
.mutableSlice().ptr
;
// Real conversion into the pivot buffer; the error is reset because the
// sizing call normally leaves U_BUFFER_OVERFLOW_ERROR behind.
453 error
= U_ZERO_ERROR
;
454 temp_len
= ucnv_toUChars(fromCnv
, temp
, temp_len
,
455 str
.c_str(), str
.size(), &error
);
456 if (U_FAILURE(error
)) {
457 THROW_UFAILURE(ucnv_toUChars
, error
, err
);
462 /* Convert to final encoding */
// Sizing call for the destination byte length.
463 error
= U_ZERO_ERROR
;
464 int32_t dest_len
= ucnv_fromUChars(toCnv
, NULL
, 0,
465 temp
, temp_len
, &error
);
466 if (U_FAILURE(error
) && error
!= U_BUFFER_OVERFLOW_ERROR
) {
467 THROW_UFAILURE(ucnv_fromUChars
, error
, err
);
470 String
destStr(dest_len
, ReserveString
);
471 char *dest
= (char*) destStr
.mutableSlice().ptr
;
// Real conversion into the destination buffer.
473 error
= U_ZERO_ERROR
;
474 dest_len
= ucnv_fromUChars(toCnv
, dest
, dest_len
,
475 temp
, temp_len
, &error
);
476 if (U_FAILURE(error
)) {
477 THROW_UFAILURE(ucnv_fromUChars
, error
, err
);
// Fix the String's length to the number of bytes actually produced.
480 return destStr
.setSize(dest_len
);
// UConverter::transcode (static): one-shot conversion of `str` from
// fromEncoding to toEncoding. Opens both converters with setEncoding(),
// optionally applies the "from_subst" / "to_subst" entries of `options` as
// substitution characters, then runs doConvert(). Errors are recorded in
// s_intl_error->m_error.
// NOTE(review): the `options` parameter, the bodies of the failure branches,
// the final return and any release of fromCnv / toCnv are not visible in this
// listing — confirm both converters are closed on every exit path.
483 Variant
c_UConverter::ti_transcode(const char* cls
, CStrRef str
,
484 CStrRef toEncoding
, CStrRef fromEncoding
,
486 STATIC_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::transcode
);
487 UConverter
*fromCnv
= NULL
, *toCnv
= NULL
;
488 if (!setEncoding(fromEncoding
, &fromCnv
, s_intl_error
->m_error
)) {
491 if (!setEncoding(toEncoding
, &toCnv
, s_intl_error
->m_error
)) {
494 if (options
.exists("from_subst") &&
495 !setSubstChars(options
["from_subst"].toString(), fromCnv
,
496 s_intl_error
->m_error
)) {
499 if (options
.exists("to_subst") &&
500 !setSubstChars(options
["to_subst"].toString(), toCnv
,
501 s_intl_error
->m_error
)) {
504 Variant ret
= doConvert(str
, toCnv
, fromCnv
, s_intl_error
->m_error
);
510 /* ext/intl error handling */
// UConverter::getErrorCode.
// NOTE(review): the return statement is not visible in this listing;
// presumably it returns m_error.code — confirm.
512 int64
c_UConverter::t_geterrorcode() {
513 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::geterrorcode
);
// UConverter::getErrorMessage: returns the message stored in
// m_error.custom_error_message (written by throwFailure()).
517 String
c_UConverter::t_geterrormessage() {
518 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::geterrormessage
);
519 return m_error
.custom_error_message
;
522 /* Enumerators and lookups */
// UCNV_REASON_CASE(v) expands to a case label for UCNV_<v> that returns the
// string "REASON_<v>".
524 #define UCNV_REASON_CASE(v) case UCNV_ ## v : return String("REASON_" #v );
// UConverter::reasonText (static): maps a callback reason value to its
// "REASON_<name>" text; an unrecognised value raises a warning.
// NOTE(review): the switch header and the value returned after the warning
// are not visible in this listing.
525 String
c_UConverter::ti_reasontext(const char* cls
, int64 reason
) {
526 STATIC_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::reasontext
);
528 UCNV_REASON_CASE(UNASSIGNED
)
529 UCNV_REASON_CASE(ILLEGAL
)
530 UCNV_REASON_CASE(IRREGULAR
)
531 UCNV_REASON_CASE(RESET
)
532 UCNV_REASON_CASE(CLOSE
)
533 UCNV_REASON_CASE(CLONE
)
535 raise_warning("Unknown UConverterCallbackReason: %ld", (long)reason
);
// UConverter::getAvailable (static): builds an array with the name of every
// converter ICU reports, in index order 0 .. ucnv_countAvailable() - 1.
// NOTE(review): the return statement is not visible in this listing.
540 Array
c_UConverter::ti_getavailable(const char* cls
) {
541 STATIC_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::getavailable
);
542 int32_t i
, count
= ucnv_countAvailable();
543 Array ret
= Array::Create();
545 for(i
= 0; i
< count
; ++i
) {
546 ret
.append(ucnv_getAvailableName(i
));
// UConverter::getAliases (static): collects the aliases ICU knows for
// `encoding`. Counts them with ucnv_countAliases(), then fetches each with
// ucnv_getAlias(); ICU failures are reported into s_intl_error->m_error.
// NOTE(review): the failure returns, the append of `alias` and the final
// return are not visible in this listing.
552 Array
c_UConverter::ti_getaliases(const char* cls
, CStrRef encoding
) {
553 STATIC_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::getaliases
);
554 UErrorCode error
= U_ZERO_ERROR
;
555 int16_t i
, count
= ucnv_countAliases(encoding
.data(), &error
);
// NOTE(review): the reported name is "ucnv_getAliases" although the failing
// call is ucnv_countAliases() — confirm intended text.
557 if (U_FAILURE(error
)) {
558 THROW_UFAILURE(ucnv_getAliases
, error
, s_intl_error
->m_error
);
562 Array ret
= Array::Create();
563 for(i
= 0; i
< count
; ++i
) {
564 error
= U_ZERO_ERROR
;
// NOTE(review): `encoding` is passed directly here but as encoding.data() in
// the count call above; presumably an implicit conversion applies — confirm.
565 const char *alias
= ucnv_getAlias(encoding
, i
, &error
);
566 if (U_FAILURE(error
)) {
567 THROW_UFAILURE(ucnv_getAlias
, error
, s_intl_error
->m_error
);
// UConverter::getStandards (static): iterates over the standards ICU reports
// (0 .. ucnv_countStandards() - 1) and looks up each name with
// ucnv_getStandard(); an ICU failure is reported into s_intl_error->m_error.
// NOTE(review): the failure return, the append of `name` and the final return
// are not visible in this listing.
575 Array
c_UConverter::ti_getstandards(const char* cls
) {
576 STATIC_METHOD_INJECTION_BUILTIN(UConverter
, UConverter::getstandards
);
577 int16_t i
, count
= ucnv_countStandards();
578 Array ret
= Array::Create();
580 for(i
= 0; i
< count
; ++i
) {
// A fresh error code per iteration, since ICU calls do nothing when handed a
// failing status.
581 UErrorCode error
= U_ZERO_ERROR
;
582 const char *name
= ucnv_getStandard(i
, &error
);
583 if (U_FAILURE(error
)) {
584 THROW_UFAILURE(ucnv_getStandard
, error
, s_intl_error
->m_error
);
592 ///////////////////////////////////////////////////////////////////////////////