Add UConverter to intl extension to match Zend
[hiphop-php.git] / src / runtime / ext / ext_icu_ucnv.cpp
blob362e8157c904b2f2082741f8ce283771548036a2
/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010- Facebook, Inc. (http://www.facebook.com)         |
   | Copyright (c) 1997-2010 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
18 #include <runtime/ext/ext_icu_ucnv.h>
19 #include <runtime/vm/translator/translator-inline.h>
21 namespace HPHP {
22 ///////////////////////////////////////////////////////////////////////////////
24 #define UCNV_REASON_CONST(v) \
25 const int64 q_UConverter$$REASON_ ## v = UCNV_ ## v ;
26 #define UCNV_TYPE_CONST(v) \
27 const int64 q_UConverter$$ ## v = UCNV_ ## v ;
29 UCNV_REASON_CONST(UNASSIGNED);
30 UCNV_REASON_CONST(ILLEGAL);
31 UCNV_REASON_CONST(IRREGULAR);
32 UCNV_REASON_CONST(RESET);
33 UCNV_REASON_CONST(CLOSE);
34 UCNV_REASON_CONST(CLONE);
36 UCNV_TYPE_CONST(UNSUPPORTED_CONVERTER);
37 UCNV_TYPE_CONST(SBCS);
38 UCNV_TYPE_CONST(DBCS);
39 UCNV_TYPE_CONST(MBCS);
40 UCNV_TYPE_CONST(LATIN_1);
41 UCNV_TYPE_CONST(UTF8);
42 UCNV_TYPE_CONST(UTF16_BigEndian);
43 UCNV_TYPE_CONST(UTF16_LittleEndian);
44 UCNV_TYPE_CONST(UTF32_BigEndian);
45 UCNV_TYPE_CONST(UTF32_LittleEndian);
46 UCNV_TYPE_CONST(EBCDIC_STATEFUL);
47 UCNV_TYPE_CONST(ISO_2022);
48 UCNV_TYPE_CONST(LMBCS_1);
49 UCNV_TYPE_CONST(LMBCS_2);
50 UCNV_TYPE_CONST(LMBCS_3);
51 UCNV_TYPE_CONST(LMBCS_4);
52 UCNV_TYPE_CONST(LMBCS_5);
53 UCNV_TYPE_CONST(LMBCS_6);
54 UCNV_TYPE_CONST(LMBCS_8);
55 UCNV_TYPE_CONST(LMBCS_11);
56 UCNV_TYPE_CONST(LMBCS_16);
57 UCNV_TYPE_CONST(LMBCS_17);
58 UCNV_TYPE_CONST(LMBCS_18);
59 UCNV_TYPE_CONST(LMBCS_19);
60 UCNV_TYPE_CONST(LMBCS_LAST);
61 UCNV_TYPE_CONST(HZ);
62 UCNV_TYPE_CONST(SCSU);
63 UCNV_TYPE_CONST(ISCII);
64 UCNV_TYPE_CONST(US_ASCII);
65 UCNV_TYPE_CONST(UTF7);
66 UCNV_TYPE_CONST(BOCU1);
67 UCNV_TYPE_CONST(UTF16);
68 UCNV_TYPE_CONST(UTF32);
69 UCNV_TYPE_CONST(CESU8);
70 UCNV_TYPE_CONST(IMAP_MAILBOX);
72 static StaticString s_toUCallback("toUCallback");
73 static StaticString s_fromUCallback("fromUCallback");
75 #define THROW_UFAILURE(fname, uerr, ierr) throwFailure(uerr, #fname, ierr);
77 c_UConverter::c_UConverter(const ObjectStaticCallbacks *cb)
78 : ExtObjectData(cb), m_src(NULL), m_dest(NULL) {
79 m_error.code = U_ZERO_ERROR;
80 m_error.custom_error_message = "";
83 c_UConverter::~c_UConverter() { }
85 void c_UConverter::throwFailure(UErrorCode error, const char *fname,
86 intl_error &merror) {
87 char message[1024];
88 snprintf(message, sizeof(message), "%s() returned error %ld: %s",
89 fname, (long)error, u_errorName(error));
90 merror.code = error;
91 merror.custom_error_message = String((const char*)message, CopyString);
94 void c_UConverter::t___construct(CStrRef toEncoding, CStrRef fromEncoding) {
95 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::__construct);
96 setEncoding(toEncoding, &m_dest, m_error);
97 setEncoding(fromEncoding, &m_src, m_error);
98 setCallback(m_dest);
99 setCallback(m_src);
102 Variant c_UConverter::t___destruct() {
103 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::__destruct);
104 if (m_src) {
105 ucnv_close(m_src);
107 if (m_dest) {
108 ucnv_close(m_dest);
111 return null;
114 /* get/set source/dest encodings */
116 #define TARGET_CHECK(args, len) \
117 checkLimits(args->targetLimit - args->target, len)
118 bool c_UConverter::checkLimits(int64_t available, int64_t needed) {
119 if (needed > available) {
120 THROW_UFAILURE(appendUTarget, U_BUFFER_OVERFLOW_ERROR, m_error);
121 return false;
123 return true;
126 void c_UConverter::appendToUTarget(Variant val,
127 UConverterToUnicodeArgs *args) {
128 if (val.isNull()) {
129 // Ignore
130 return;
132 if (val.isInteger()) {
133 int64_t lval = val.toInt64();
134 if (lval < 0 || lval > 0x10FFFF) {
135 THROW_UFAILURE(appendToUTarget, U_ILLEGAL_ARGUMENT_ERROR, m_error);
136 return;
138 if (lval > 0xFFFF) {
139 if (TARGET_CHECK(args, 2)) {
140 *(args->target++) = (UChar)(((lval - 0x10000) >> 10) | 0xD800);
141 *(args->target++) = (UChar)(((lval - 0x10000) & 0x3FF) | 0xDC00);
143 return;
145 if (TARGET_CHECK(args, 1)) {
146 *(args->target++) = (UChar)lval;
148 return;
150 if (val.isString()) {
151 const char *strval = val.toString().data();
152 int32_t i = 0, strlen = val.toString().size();
153 while((i != strlen) && TARGET_CHECK(args, 1)) {
154 UChar c;
155 U8_NEXT(strval, i, strlen, c);
156 *(args->target++) = c;
158 return;
160 if (val.isArray()) {
161 for(ArrayIter it(val.toArray()); it; ++it) {
162 appendToUTarget(it.second(), args);
164 return;
166 THROW_UFAILURE(appendToTarget, U_ILLEGAL_ARGUMENT_ERROR, m_error);
169 void c_UConverter::ucnvToUCallback(c_UConverter *objval,
170 UConverterToUnicodeArgs *args,
171 const char *codeUnits, int32_t length,
172 UConverterCallbackReason reason,
173 UErrorCode *pErrorCode) {
174 String source(args->source, args->sourceLimit - args->source, CopyString);
175 VRefParam errRef((long)*pErrorCode);
176 Variant ret = objval->o_invoke(s_toUCallback, CREATE_VECTOR4(
177 reason, source, String(codeUnits, length, CopyString), ref(errRef)
178 ), -1);
179 if (errRef.is(KindOfInt64)) {
180 *pErrorCode = (UErrorCode)errRef.toInt64();
181 } else {
182 throwFailure(U_ILLEGAL_ARGUMENT_ERROR, "ucnvToUCallback()",
183 objval->m_error);
185 objval->appendToUTarget(ret, args);
188 void c_UConverter::appendFromUTarget(Variant val,
189 UConverterFromUnicodeArgs *args) {
190 if (val.isNull()) {
191 // ignore
192 return;
194 if (val.isInteger()) {
195 int64_t lval = val.toInt64();
196 if (lval < 0 || lval > 255) {
197 THROW_UFAILURE(appendFromUTarget, U_ILLEGAL_ARGUMENT_ERROR, m_error);
198 return;
200 if (TARGET_CHECK(args, 1)) {
201 *(args->target++) = (char)lval;
203 return;
205 if (val.isString()) {
206 int32_t strlen = val.toString().size();
207 if (TARGET_CHECK(args, strlen)) {
208 memcpy(args->target, val.toString().data(), strlen);
209 args->target += strlen;
211 return;
213 if (val.isArray()) {
214 for(ArrayIter it(val.toArray()); it; ++it) {
215 appendFromUTarget(it.second(), args);
217 return;
219 THROW_UFAILURE(appendFromUTarget, U_ILLEGAL_ARGUMENT_ERROR, m_error);
222 void c_UConverter::ucnvFromUCallback(c_UConverter *objval,
223 UConverterFromUnicodeArgs *args,
224 const UChar *codeUnits, int32_t length,
225 UChar32 codePoint,
226 UConverterCallbackReason reason,
227 UErrorCode *pErrorCode) {
228 Array source = Array::Create();
229 for(int i = 0; i < length; i++) {
230 UChar32 c;
231 U16_NEXT(codeUnits, i, length, c);
232 source.append((int64_t)c);
234 VRefParam errRef((int64_t)*pErrorCode);
235 Variant ret = objval->o_invoke(s_fromUCallback, CREATE_VECTOR4(
236 reason, source, (int64_t)codePoint, ref(errRef)
237 ), -1);
238 if (errRef.is(KindOfInt64)) {
239 *pErrorCode = (UErrorCode)errRef.toInt64();
240 } else {
241 throwFailure(U_ILLEGAL_ARGUMENT_ERROR, "ucnvFromUCallback()",
242 objval->m_error);
244 objval->appendFromUTarget(ret, args);
247 bool c_UConverter::setCallback(UConverter *cnv) {
248 if (o_getClassName().get()->isame(String("UConverter").get())) {
249 return true;
252 UErrorCode error = U_ZERO_ERROR;
253 ucnv_setToUCallBack(cnv, (UConverterToUCallback)ucnvToUCallback,
254 (const void*)this, NULL, NULL, &error);
255 if (U_FAILURE(error)) {
256 THROW_UFAILURE(ucnv_setToUCallback, error, m_error);
257 ucnv_close(cnv);
258 return false;
260 error = U_ZERO_ERROR;
261 ucnv_setFromUCallBack(cnv, (UConverterFromUCallback)ucnvFromUCallback,
262 (const void*)this, NULL, NULL, &error);
263 if (U_FAILURE(error)) {
264 THROW_UFAILURE(ucnv_setFromUCallback, error, m_error);
265 ucnv_close(cnv);
266 return false;
269 return true;
272 bool c_UConverter::setEncoding(CStrRef encoding, UConverter **pcnv,
273 intl_error &err) {
274 UErrorCode error = U_ZERO_ERROR;
275 UConverter *cnv = ucnv_open(encoding.data(), &error);
277 if (error == U_AMBIGUOUS_ALIAS_WARNING) {
278 UErrorCode getname_error = U_ZERO_ERROR;
279 const char *actual_encoding = ucnv_getName(cnv, &getname_error);
280 if (U_FAILURE(getname_error)) {
281 actual_encoding = "(unknown)";
283 raise_warning("Ambiguous encoding specified, using %s", actual_encoding);
284 } else if (U_FAILURE(error)) {
285 THROW_UFAILURE(ucnv_open, error, err);
286 return false;
289 if (*pcnv) {
290 ucnv_close(*pcnv);
292 *pcnv = cnv;
294 return true;
297 void c_UConverter::t_setsourceencoding(CStrRef encoding) {
298 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::setsourceencoding);
299 setEncoding(encoding, &m_src, m_error);
302 void c_UConverter::t_setdestinationencoding(CStrRef encoding) {
303 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::setdestinationencoding);
304 setEncoding(encoding, &m_dest, m_error);
307 String c_UConverter::t_getsourceencoding() {
308 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::getsourceencoding);
309 if (!m_src) {
310 return null;
313 UErrorCode error = U_ZERO_ERROR;
314 const char *name = ucnv_getName(m_src, &error);
315 if (U_FAILURE(error)) {
316 THROW_UFAILURE(ucnv_getName, error, m_error);
317 return null;
320 return String(name);
323 String c_UConverter::t_getdestinationencoding() {
324 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::getdestinationencoding);
325 if (!m_dest) {
326 return null;
329 UErrorCode error = U_ZERO_ERROR;
330 const char *name = ucnv_getName(m_dest, &error);
331 if (U_FAILURE(error)) {
332 THROW_UFAILURE(ucnv_getName, error, m_error);
333 return null;
336 return String(name);
339 /* Get algorithmic types */
341 int64 c_UConverter::t_getsourcetype() {
342 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::getsourcetype);
343 if (!m_src) {
344 return UCNV_UNSUPPORTED_CONVERTER;
347 return ucnv_getType(m_src);
350 int64 c_UConverter::t_getdestinationtype() {
351 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::getdestinationtype);
352 if (!m_dest) {
353 return UCNV_UNSUPPORTED_CONVERTER;
356 return ucnv_getType(m_dest);
359 /* Basic substitution */
361 bool c_UConverter::setSubstChars(String chars, UConverter *cnv,
362 intl_error &err) {
363 UErrorCode error = U_ZERO_ERROR;
364 ucnv_setSubstChars(cnv, chars.data(), chars.size(), &error);
365 if (U_FAILURE(error)) {
366 THROW_UFAILURE(ucnv_setSubstChars, error, err);
367 return false;
369 return true;
372 bool c_UConverter::t_setsubstchars(CStrRef chars) {
373 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::setsubstchars);
374 return setSubstChars(chars, m_dest, m_error) &&
375 setSubstChars(chars, m_src, m_error);
378 String c_UConverter::t_getsubstchars() {
379 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::getsubstchars);
380 UErrorCode error = U_ZERO_ERROR;
381 char chars[127];
382 int8_t chars_len;
384 ucnv_getSubstChars(m_src, chars, &chars_len, &error);
385 if (U_FAILURE(error)) {
386 THROW_UFAILURE(ucnv_getSubstChars, error, m_error);
387 return null;
390 return String(chars, chars_len, CopyString);
393 /* Callbacks */
395 Variant c_UConverter::defaultCallback(int64 reason, VRefParam error) {
396 switch(reason) {
397 case UCNV_UNASSIGNED:
398 case UCNV_ILLEGAL:
399 case UCNV_IRREGULAR:
400 error = U_ZERO_ERROR;
401 return t_getsubstchars();
404 return null;
407 Variant c_UConverter::t_fromucallback(int64 reason,
408 CArrRef source, int64 codepoint,
409 VRefParam error) {
410 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::fromucallback);
411 return defaultCallback(reason, error);
414 Variant c_UConverter::t_toucallback(int64 reason,
415 CStrRef source, CStrRef codeunits,
416 VRefParam error) {
417 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::toucallback);
418 return defaultCallback(reason, error);
421 /* Main workhorse functions */
423 Variant c_UConverter::t_convert(CStrRef str, bool reverse) {
424 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::convert);
425 SYNC_VM_REGS_SCOPED();
426 return doConvert(str, reverse ? m_src : m_dest,
427 reverse ? m_dest : m_src, m_error);
430 String c_UConverter::doConvert(CStrRef str,
431 UConverter *toCnv, UConverter *fromCnv,
432 intl_error &err) {
433 UErrorCode error = U_ZERO_ERROR;
435 if (!fromCnv || !toCnv) {
436 err.code = U_INVALID_STATE_ERROR;
437 err.custom_error_message = "Internal converters not initialized";
438 return null;
441 /* Convert to UChar pivot encoding */
442 int32_t temp_len = ucnv_toUChars(fromCnv, NULL, 0,
443 str.c_str(), str.size(), &error);
444 if (U_FAILURE(error) && error != U_BUFFER_OVERFLOW_ERROR) {
445 THROW_UFAILURE(ucnv_toUChars, error, err);
446 return null;
448 // Explicitly include the space for a \u0000 UChar since String
449 // only allocates one extra byte (not the 2 needed)
450 String tempStr(sizeof(UChar) * (temp_len + 1), ReserveString);
451 UChar *temp = (UChar*) tempStr.mutableSlice().ptr;
453 error = U_ZERO_ERROR;
454 temp_len = ucnv_toUChars(fromCnv, temp, temp_len,
455 str.c_str(), str.size(), &error);
456 if (U_FAILURE(error)) {
457 THROW_UFAILURE(ucnv_toUChars, error, err);
458 return null;
460 temp[temp_len] = 0;
462 /* Convert to final encoding */
463 error = U_ZERO_ERROR;
464 int32_t dest_len = ucnv_fromUChars(toCnv, NULL, 0,
465 temp, temp_len, &error);
466 if (U_FAILURE(error) && error != U_BUFFER_OVERFLOW_ERROR) {
467 THROW_UFAILURE(ucnv_fromUChars, error, err);
468 return null;
470 String destStr(dest_len, ReserveString);
471 char *dest = (char*) destStr.mutableSlice().ptr;
473 error = U_ZERO_ERROR;
474 dest_len = ucnv_fromUChars(toCnv, dest, dest_len,
475 temp, temp_len, &error);
476 if (U_FAILURE(error)) {
477 THROW_UFAILURE(ucnv_fromUChars, error, err);
478 return null;
480 return destStr.setSize(dest_len);
483 Variant c_UConverter::ti_transcode(const char* cls , CStrRef str,
484 CStrRef toEncoding, CStrRef fromEncoding,
485 CArrRef options) {
486 STATIC_METHOD_INJECTION_BUILTIN(UConverter, UConverter::transcode);
487 UConverter *fromCnv = NULL, *toCnv = NULL;
488 if (!setEncoding(fromEncoding, &fromCnv, s_intl_error->m_error)) {
489 return null;
491 if (!setEncoding(toEncoding, &toCnv, s_intl_error->m_error)) {
492 return null;
494 if (options.exists("from_subst") &&
495 !setSubstChars(options["from_subst"].toString(), fromCnv,
496 s_intl_error->m_error)) {
497 return null;
499 if (options.exists("to_subst") &&
500 !setSubstChars(options["to_subst"].toString(), toCnv,
501 s_intl_error->m_error)) {
502 return null;
504 Variant ret = doConvert(str, toCnv, fromCnv, s_intl_error->m_error);
505 ucnv_close(toCnv);
506 ucnv_close(fromCnv);
507 return ret;
510 /* ext/intl error handling */
512 int64 c_UConverter::t_geterrorcode() {
513 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::geterrorcode);
514 return m_error.code;
517 String c_UConverter::t_geterrormessage() {
518 INSTANCE_METHOD_INJECTION_BUILTIN(UConverter, UConverter::geterrormessage);
519 return m_error.custom_error_message;
/* Enumerators and lookups */
524 #define UCNV_REASON_CASE(v) case UCNV_ ## v : return String("REASON_" #v );
525 String c_UConverter::ti_reasontext(const char* cls , int64 reason) {
526 STATIC_METHOD_INJECTION_BUILTIN(UConverter, UConverter::reasontext);
527 switch (reason) {
528 UCNV_REASON_CASE(UNASSIGNED)
529 UCNV_REASON_CASE(ILLEGAL)
530 UCNV_REASON_CASE(IRREGULAR)
531 UCNV_REASON_CASE(RESET)
532 UCNV_REASON_CASE(CLOSE)
533 UCNV_REASON_CASE(CLONE)
534 default:
535 raise_warning("Unknown UConverterCallbackReason: %ld", (long)reason);
536 return null;
540 Array c_UConverter::ti_getavailable(const char* cls ) {
541 STATIC_METHOD_INJECTION_BUILTIN(UConverter, UConverter::getavailable);
542 int32_t i, count = ucnv_countAvailable();
543 Array ret = Array::Create();
545 for(i = 0; i < count; ++i) {
546 ret.append(ucnv_getAvailableName(i));
549 return ret;
552 Array c_UConverter::ti_getaliases(const char* cls , CStrRef encoding) {
553 STATIC_METHOD_INJECTION_BUILTIN(UConverter, UConverter::getaliases);
554 UErrorCode error = U_ZERO_ERROR;
555 int16_t i, count = ucnv_countAliases(encoding.data(), &error);
557 if (U_FAILURE(error)) {
558 THROW_UFAILURE(ucnv_getAliases, error, s_intl_error->m_error);
559 return null;
562 Array ret = Array::Create();
563 for(i = 0; i < count; ++i) {
564 error = U_ZERO_ERROR;
565 const char *alias = ucnv_getAlias(encoding, i, &error);
566 if (U_FAILURE(error)) {
567 THROW_UFAILURE(ucnv_getAlias, error, s_intl_error->m_error);
568 return null;
570 ret.append(alias);
572 return ret;
575 Array c_UConverter::ti_getstandards(const char* cls ) {
576 STATIC_METHOD_INJECTION_BUILTIN(UConverter, UConverter::getstandards);
577 int16_t i, count = ucnv_countStandards();
578 Array ret = Array::Create();
580 for(i = 0; i < count; ++i) {
581 UErrorCode error = U_ZERO_ERROR;
582 const char *name = ucnv_getStandard(i, &error);
583 if (U_FAILURE(error)) {
584 THROW_UFAILURE(ucnv_getStandard, error, s_intl_error->m_error);
585 return null;
587 ret.append(name);
589 return ret;
592 ///////////////////////////////////////////////////////////////////////////////