Adding extra charsets for ActionMailer unit tests, if you're looking to parse incomin...
[akelos.git] / lib / AkCharset.php
blob4df77b1ec614282615bd3bb696813c627309d305
1 <?php
2 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
4 // +----------------------------------------------------------------------+
5 // | Akelos Framework - http://www.akelos.org |
6 // +----------------------------------------------------------------------+
7 // | Copyright (c) 2002-2006, Akelos Media, S.L. & Bermi Ferrer Martinez |
8 // | Released under the GNU Lesser General Public License, see LICENSE.txt|
9 // +----------------------------------------------------------------------+
11 /**
12 * @package ActiveSupport
13 * @subpackage I18n-L10n
14 * @author Bermi Ferrer <bermi a.t akelos c.om>
15 * @copyright Copyright (c) 2002-2006, Akelos Media, S.L. http://www.akelos.org
16 * @license GNU Lesser General Public License <http://www.gnu.org/copyleft/lesser.html>
20 /**
21 * Charset conversion using UTF-8 mapping tables.
23 * Charset conversion using 4 different methods. Pure PHP
24 * conversion or one of this PHP extensions iconv, recode and
25 * multibyte.
27 * Supported charsets are:
28 * ASCII, ISO 8859-1, ISO 8859-2, ISO 8859-3, ISO 8859-4, ISO
29 * 8859-5, ISO 8859-6, ISO 8859-7, ISO 8859-8, ISO 8859-9, ISO
30 * 8859-10, ISO 8859-11, ISO 8859-13, ISO 8859-14, ISO 8859-15,
31 * ISO 8859-16, CP437, CP737, CP850, CP852, CP855, CP857,
32 * CP858, CP860, CP861, CP863, CP865, CP866, CP869,
33 * Windows-1250, Windows-1251, Windows-1252, Windows-1253,
34 * Windows-1254, Windows-1255, Windows-1256, Windows-1257,
35 * Windows-1258, KOI8-R, KOI8-U, ISCII, VISCII, Big5, HKSCS,
36 * GB2312, GB18030, Shift-JIS, EUC
38 * More information about charsets at
39 * http://en.wikipedia.org/wiki/Character_encoding
41 * @author Bermi Ferrer <bermi@akelos.org>
42 * @copyright Copyright (c) 2002-2005, Akelos Media, S.L. http://www.akelos.org
43 * @license GNU Lesser General Public License <http://www.gnu.org/copyleft/lesser.html>
44 * @since 0.1
45 * @version $Revision 0.1 $
47 class AkCharset
51 // ------ CLASS ATTRIBUTES ------ //
55 // ---- Public attributes ---- //
58 /**
59 * Allow charset recoding.
61 * @access public
62 * @var bool $enableCharsetRecoding
64 var $enableCharsetRecoding = true;
66 /**
67 * Allow or disallow PHP Based charset conversion.
69 * @access public
70 * @var boolean $usePhpRecoding
72 var $usePhpRecoding = true;
74 /**
75 * Default charset
77 * @access public
78 * @var string $defaultCharset
80 var $defaultCharset = 'ISO-8859-1';
82 /**
83 * UTF-8 error character
85 * Char that will be used when no matches are found on the UTF8
86 * mapping table
88 * @access public
89 * @var string $utf8ErrorChar
91 var $utf8ErrorChar = '?';
94 // ---- Private attributes ---- //
97 /**
98 * Current encoding engine
100 * @see GetRecodingEngine
101 * @see SetRecodingEngine
102 * @access private
103 * @var string $_recodingEngine
105 var $_recodingEngine = null;
108 * Extra parameters for invoking the encoding engine (useful
109 * for iconv)
111 * @see GetRecodingEngineExtraParams
112 * @see SetRecodingEngineExtraParams
113 * @access private
114 * @var string $_recodingEngineExtraParams
116 var $_recodingEngineExtraParams = '';
119 * Holds the charset currently being processed.
121 * @see GetCurrentCharset
122 * @access private
123 * @var string $_currentCharset
125 var $_currentCharset = 'ISO-8859-1';
129 // ------------------------------
133 // ------ CLASS METHODS ------ //
137 // ---- Getters ---- //
141 * $this->_recodingEngine getter
143 * Use this method to get $this->_recodingEngine value
145 * @access public
146 * @see set$recodingEngine
147 * @return string Returns Current encoding engine value.
/**
 * Reads back the recoding engine stored on this instance.
 *
 * @access public
 * @return mixed Current value of $this->_recodingEngine.
 */
function GetRecodingEngine()
{
    $current_engine = $this->_recodingEngine;

    return $current_engine;
}// -- end of GetRecodingEngine -- //
156 * $this->_recodingEngineExtraParams getter
158 * Use this method to get $this->_recodingEngineExtraParams
159 * value
161 * @access public
162 * @see set$recodingEngineExtraParams
163 * @return string Returns Extra parameters for invoking the encoding
164 * engine (useful for iconv) value.
/**
 * Reads back the extra parameters stored for the recoding engine.
 *
 * @access public
 * @return string Current value of $this->_recodingEngineExtraParams.
 */
function GetRecodingEngineExtraParams()
{
    $extra_params = $this->_recodingEngineExtraParams;

    return $extra_params;
}// -- end of GetRecodingEngineExtraParams -- //
173 * $this->_currentCharset getter
175 * Use this method to get $this->_currentCharset value
177 * @access public
178 * @see set$currentCharset
179 * @return string Returns Holds current procesing charset. value.
/**
 * Reads back the charset stored in $this->_currentCharset.
 *
 * @access public
 * @return mixed Current value of $this->_currentCharset.
 */
function GetCurrentCharset()
{
    $current_charset = $this->_currentCharset;

    return $current_charset;
}// -- end of GetCurrentCharset -- //
188 // ---- Setters ---- //
192 * Sets the default recoding engine.
194 * @access public
195 * @uses _LoadExtension
196 * @param string $engine Possible engines are:
197 * - iconv (http://php.net/iconv)
198 * - mbstring (http://php.net/mb_convert_encoding)
199 * - recode (http://php.net/recode_string)
200 * @param string $extra_params Extra parameters for invoking the encoding engine
201 * (useful for iconv)
202 * @return string Name of current recoding engine
/**
 * Selects the recoding engine used by this instance.
 *
 * When $engine is null the known engines are probed in order (iconv,
 * mbstring, recode) and the first one whose conversion function exists
 * (loading the extension if needed) is selected. When $engine names a
 * known engine it is selected only if its conversion function exists;
 * otherwise a notice is raised and no engine is selected.
 *
 * The outcome of each ($engine, $extra_params) request is memoized. The
 * memo stores the resolved engine itself (name or false), so a repeated
 * call returns the same value as the first one and also re-applies it to
 * this instance.
 *
 * @access public
 * @uses _LoadExtension
 * @param string $engine       'iconv', 'mbstring', 'recode' or null to auto-detect
 * @param string $extra_params Extra parameters for invoking the engine (useful for iconv)
 * @return mixed Name of the selected engine, or false if none is available
 */
function SetRecodingEngine($engine = null, $extra_params = null)
{
    static $memory = array();

    $engines = array('iconv'=>'iconv','mbstring'=>'mb_convert_encoding','recode'=>'recode_string');

    // The key is built from the *requested* values, before auto-detection
    // picks a concrete engine, so an auto-detect request can hit the cache.
    $memory_key = $engine.'|'.$extra_params;

    if(isset($extra_params)){
        $this->_recodingEngineExtraParams = $extra_params;
    }

    if(isset($memory[$memory_key])){
        $this->_recodingEngine = $memory[$memory_key];
        return $this->_recodingEngine;
    }

    // Some builds only expose libiconv(); provide iconv() as a thin wrapper.
    if (!function_exists('iconv') && function_exists('libiconv')) {
        function iconv($input_encoding, $output_encoding, $string) {
            return libiconv($input_encoding, $output_encoding, $string);
        }
    }

    $selected = false;
    if(!isset($engine)){
        foreach ($engines as $candidate=>$function){
            if(function_exists($function) || ($this->_LoadExtension($candidate) && function_exists($function))){
                $selected = $candidate;
                break;
            }
        }
    }elseif(isset($engines[$engine])){
        if(function_exists($engines[$engine])){
            $selected = $engine;
        }else{
            user_error(Ak::t('Could not set AkCharset::SetRecodingEngine("%engine");',array('%engine'=>$engine)),E_USER_NOTICE);
        }
    }

    $this->_recodingEngine = $selected;
    $memory[$memory_key] = $selected;

    return $selected;
}// -- end of SetRecodingEngine -- //
246 * $this->_recodingEngineExtraParams setter
248 * Use this method to set $this->_recodingEngineExtraParams
249 * value
251 * @access public
252 * @see get$recodingEngineExtraParams
253 * @param string $recoding__engine__extra__params Extra parameters for invoking the encoding engine
254 * (useful for iconv)
255 * @return bool Returns true if $this->_recodingEngineExtraParams
256 * has been set correctly.
/**
 * Stores the extra parameters passed to the recoding engine.
 *
 * @access public
 * @param string $recoding__engine__extra__params Extra parameters for invoking
 *                                                 the encoding engine (useful for iconv)
 * @return bool Always true once the value has been stored, as the documented
 *              contract of this setter promises a boolean result.
 */
function SetRecodingEngineExtraParams($recoding__engine__extra__params)
{
    $this->_recodingEngineExtraParams = $recoding__engine__extra__params;

    return true;
}// -- end of SetRecodingEngineExtraParams -- //
265 // ---- Public methods ---- //
269 * Changes the charset encoding of one string to other charset.
271 * This function will convert a string from one charset to
272 * another.
273 * Unfortunately PHP has not native Unicode support, so in
274 * order to display and handle different charsets, this
275 * function wraps 3 non standard PHP extensions plus an
276 * additional Pure PHP conversion utility for systems that do
277 * not have this extensions enabled.
279 * @access public
280 * @param string $string String to recode
281 * @param string $target_charset Target charset. AkCharset availability may vary
282 * depending on your system configuration.
283 * @param string $origin_charset Input string charset. AkCharset availability may
284 * vary depending on your system configuration.
285 * This parameter is optional if you are using
286 * multibyte extension.
287 * @param string $engine Possible engines are:
288 * - iconv (http://php.net/iconv)
289 * - mbstring (http://php.net/mb_convert_encoding)
290 * - recode (http://php.net/recode_string)
291 * @param string $engine_extra_params Extra parameters for invoking the encoding engine
292 * (useful for iconv)
293 * @return string Recoded string if possible, otherwise it will
294 * return the string without modifications.
/**
 * Converts a string from one charset to another.
 *
 * Non-string input, disabled recoding, or identical origin/target charsets
 * return the input untouched. Otherwise the engine is resolved through
 * SetRecodingEngine() and the matching private "_<Engine>StringRecode"
 * method is invoked; when no engine name is available the pure PHP
 * implementation is used, provided $this->usePhpRecoding allows it.
 *
 * @access public
 * @param mixed  $string              String to recode
 * @param string $target_charset      Target charset
 * @param string $origin_charset      Input string charset
 * @param string $engine              'iconv', 'mbstring', 'recode' or null to auto-detect
 * @param string $engine_extra_params Extra parameters for the engine (useful for iconv)
 * @return mixed Recoded string if possible, otherwise the input without modifications
 */
function RecodeString($string, $target_charset, $origin_charset = null, $engine = null, $engine_extra_params = null)
{
    if(!is_string($string)){
        return $string;
    }
    if(!$this->enableCharsetRecoding || $target_charset==$origin_charset){
        return $string;
    }

    // The engine is always resolved here; SetRecodingEngine() does its own
    // memoization, so no local cache is kept.
    $engine = $this->SetRecodingEngine($engine, $engine_extra_params);

    if(!$engine && !$this->usePhpRecoding){
        return $string;
    }

    // Only a real engine name selects an engine-specific method; false/true
    // fall back to the pure PHP implementation.
    $has_engine_name = is_string($engine) && strlen($engine) > 1;
    $method = $has_engine_name ? '_'.ucfirst($engine).'StringRecode' : '_PhpStringRecode';

    if(!method_exists($this,$method)){
        user_error(Ak::t('Could not invoque AkCharset::%method();',array('%method'=>$method)),E_USER_NOTICE);
        return $string;
    }

    return $this->$method($string, $target_charset, $origin_charset, $engine_extra_params);
}// -- end of RecodeString -- //
324 * Fetch an array with UTF8 charset equivalence table.
326 * @access public
327 * @uses _LoadInverseMap
328 * @uses _GetCharset
329 * @param string $charset Desired charset
330 * @return mixed Multilevel array with selected mapping:
331 * array(
332 * 'to_utf' => array(CHARS_VAL=>UTF_VAL),
333 * 'from_utf' => array(UTF_VAL=>CHARS_VAL)
334 * );
336 * False if mapping is not found.
/**
 * Builds the UTF-8 equivalence tables for a charset.
 *
 * The charset name is normalised with _GetCharset(), its mapping file is
 * included from the utf8_mappings directory, and the mapping object's
 * _toUtfMap / _fromUtfMap tables are collected.
 *
 * @access public
 * @uses _GetCharset
 * @uses _LoadInverseMap
 * @param string $charset Desired charset
 * @return mixed array('to_utf' => ..., 'from_utf' => ...) or false when the
 *               charset is unknown or its mapping class is not available
 */
function GetMapping($charset)
{
    $charset = $this->_GetCharset($charset,false);
    if($charset==false){
        return false;
    }

    $mapping = array();
    include_once(AK_LIB_DIR.DS.'AkCharset'.DS.'utf8_mappings'.DS.$charset.'.php');
    if(!class_exists($charset)){
        return false;
    }

    $mappingObject =& Ak::singleton($charset,$charset);
    $mapping["to_utf"] = $mappingObject->_toUtfMap;
    // The inverse table is loaded on demand before it is read.
    $mappingObject->_LoadInverseMap();
    $mapping["from_utf"] = $mappingObject->_fromUtfMap;

    return $mapping;
}// -- end of GetMapping -- //
357 // ---- Private methods ---- //
361 * Tries to load required extension.
363 * @access private
364 * @see SetRecodingEngine
365 * @param string $extension Extension name
366 * @return boolean Returns true on success false on failure.
/**
 * Tries to make a PHP extension available, loading it at runtime if needed.
 *
 * The result for each extension name is memoized for the rest of the
 * request. Runtime loading is skipped when safe_mode is on or when dl()
 * is not available in this build/SAPI.
 *
 * @access private
 * @see SetRecodingEngine
 * @param string $extension Extension name
 * @return boolean True if the extension is loaded, false otherwise
 */
function _LoadExtension($extension)
{
    static $memory = array();

    if(!isset($memory[$extension])){
        if(extension_loaded($extension)){
            $memory[$extension] = true;
        }elseif(ini_get('safe_mode') || !function_exists('dl')){
            $memory[$extension] = false;
        }else{
            // PHP_SHLIB_SUFFIX has no leading dot ("so" / "dll"), so the
            // separator must be added explicitly: e.g. "iconv.so" or
            // "php_iconv.dll".
            $prefix = (PHP_SHLIB_SUFFIX == 'dll') ? 'php_' : '';
            $memory[$extension] = (bool)@dl($prefix.$extension.'.'.PHP_SHLIB_SUFFIX);
        }
    }

    return $memory[$extension];
}// -- end of _LoadExtension -- //
387 * AkCharset::RecodeString() iconv implementation
389 * @access private
390 * @see RecodeString
391 * @return string Recoded string if possible, otherwise it will
392 * return the string without modifications.
/**
 * AkCharset::RecodeString() iconv implementation.
 *
 * On PHP5 (AK_PHP5) the call is delegated to the pure PHP implementation.
 * Otherwise iconv() is invoked as iconv(from, to, string): the origin
 * charset is the source encoding and the target charset, followed by any
 * extra parameters (such as //TRANSLIT), is the destination encoding.
 *
 * @access private
 * @see RecodeString
 * @param string $string              String to recode
 * @param string $target_charset      Charset to convert to
 * @param string $origin_charset      Charset the string is currently in
 * @param string $engine_extra_params Suffix appended to the target charset;
 *                                    defaults to $this->_recodingEngineExtraParams
 * @return string Recoded string if possible, otherwise the string without modifications
 */
function _IconvStringRecode($string, $target_charset, $origin_charset, $engine_extra_params=null)
{
    /**
    * @todo Fix iconv bug on PHP
    */
    if(AK_PHP5){
        return $this->_PhpStringRecode($string, $target_charset, $origin_charset);
    }
    if(!$this->_ConversionIsNeeded($origin_charset, $target_charset) && !$this->isUtf8($string)){
        return $string;
    }
    if(!isset($engine_extra_params)){
        $engine_extra_params = $this->_recodingEngineExtraParams;
    }

    $recoded = iconv($origin_charset, $target_charset.$engine_extra_params, $string);

    // iconv() returns false on failure; honour the "unmodified string" contract.
    return $recoded === false ? $string : $recoded;
}// -- end of _IconvStringRecode -- //
410 * AkCharset::RecodeString() recode_string implementation
412 * @access private
413 * @see RecodeString
414 * @return string Recoded string if possible, otherwise it will
415 * return the string without modifications.
/**
 * AkCharset::RecodeString() recode_string implementation.
 *
 * recode_string() takes a single request of the form "origin..target"
 * followed by the string to convert.
 *
 * @access private
 * @see RecodeString
 * @param string $string         String to recode
 * @param string $target_charset Charset to convert to
 * @param string $origin_charset Charset the string is currently in
 * @return string Recoded string if possible, otherwise the string without modifications
 */
function _RecodeStringRecode($string, $target_charset, $origin_charset)
{
    $recoded = recode_string($origin_charset.'..'.$target_charset, $string);

    // recode_string() returns false when the request cannot be performed.
    return $recoded === false ? $string : $recoded;
}// -- end of _RecodeStringRecode -- //
423 * AkCharset::RecodeString() mb_convert_encoding implementation
425 * @access private
426 * @see RecodeString
427 * @return string Recoded string if possible, otherwise it will
428 * return the string without modifications.
/**
 * AkCharset::RecodeString() mb_convert_encoding implementation.
 *
 * When no origin charset is given it is detected from the string with
 * mb_detect_encoding(). When an origin charset is given, the string is
 * returned untouched if _ConversionIsNeeded() says no conversion applies
 * and the string is not valid UTF-8.
 *
 * @access private
 * @see RecodeString
 * @param string $string         String to recode
 * @param string $target_charset Charset to convert to
 * @param string $origin_charset Charset the string is currently in (optional)
 * @return string Recoded string if possible, otherwise the string without modifications
 */
function _MbstringStringRecode($string, $target_charset, $origin_charset=null)
{
    if(!is_null($origin_charset)
        && !$this->_ConversionIsNeeded($origin_charset, $target_charset)
        && !$this->isUtf8($string)){
        return $string;
    }

    if(empty($origin_charset)){
        // Detect the source encoding from the data itself.
        $origin_charset = mb_detect_encoding($string);
        if($origin_charset === false){
            return $string;
        }
    }

    $recoded = mb_convert_encoding($string, $target_charset, $origin_charset);

    return $recoded === false ? $string : $recoded;
}// -- end of _MbstringStringRecode -- //
444 * AkCharset::RecodeString() Pure PHP implementation
446 * @access private
447 * @uses _Utf8StringEncode
448 * @uses _Utf8StringDecode
449 * @see RecodeString
450 * @see _Utf8StringEncode
451 * @see _Utf8StringDecode
452 * @return string Recoded string if possible, otherwise it will
453 * return the string without modifications.
/**
 * AkCharset::RecodeString() pure PHP implementation.
 *
 * Conversions to or from UTF-8 are delegated to the mapping object of the
 * non-UTF-8 charset (_Utf8StringDecode when the origin is UTF-8,
 * _Utf8StringEncode when the target is UTF-8). Any other pair is converted
 * in two steps using UTF-8 as the intermediate charset.
 *
 * @access private
 * @uses _GetCharset
 * @uses _ConversionIsNeeded
 * @see RecodeString
 * @param string $string         String to recode
 * @param string $target_charset Charset to convert to
 * @param string $origin_charset Charset the string is currently in
 * @return string Recoded string if possible, otherwise the string without modifications
 */
function _PhpStringRecode($string, $target_charset, $origin_charset)
{
    $target_charset = $this->_GetCharset($target_charset,false);
    $origin_charset = $this->_GetCharset($origin_charset,false);

    $skip_candidate = !$this->_ConversionIsNeeded($origin_charset, $target_charset) | !$this->usePhpRecoding;
    if($skip_candidate && !$this->isUtf8($string)){
        return $string;
    }

    $from_utf8 = ($origin_charset=='utf8');
    $to_utf8 = ($target_charset=='utf8');

    if(!$from_utf8 && !$to_utf8){
        // Neither side is UTF-8: go origin -> UTF-8 -> target.
        $utf8String = $this->_PhpStringRecode($string,'utf8',$origin_charset);
        return $this->_PhpStringRecode($utf8String,$target_charset,'utf8');
    }

    // A UTF-8 origin takes precedence over a UTF-8 target.
    if($from_utf8){
        $map_name = $target_charset;
        $map_method = '_Utf8StringDecode';
    }else{
        $map_name = $origin_charset;
        $map_method = '_Utf8StringEncode';
    }

    include_once(AK_LIB_DIR.DS.'AkCharset'.DS.'utf8_mappings'.DS.$map_name.'.php');
    if(!class_exists($map_name)){
        return $string;
    }

    $mappingObject =& Ak::singleton($map_name, $map_name);
    if(!method_exists($mappingObject,$map_method)){
        return $string;
    }

    return $mappingObject->$map_method($string);
}// -- end of _PhpStringRecode -- //
496 * Checks for possibility or need of charset conversion.
498 * @access private
499 * @uses _GetCharset
500 * @param string $origin_charset
501 * @param string $target_charset
502 * @return boolean
/**
 * Checks for possibility or need of charset conversion.
 *
 * Returns false when either charset is unknown or both resolve to the same
 * charset. Returns true when one side is UTF-8, or when both charsets
 * belong to the same group of similar charsets listed below.
 *
 * @access private
 * @uses _GetCharset
 * @param string $origin_charset
 * @param string $target_charset
 * @return boolean
 */
function _ConversionIsNeeded($origin_charset, $target_charset)
{
    $target_charset = $this->_GetCharset($target_charset,false);
    $origin_charset = $this->_GetCharset($origin_charset,false);

    if(!$target_charset || !$origin_charset || $origin_charset==$target_charset){
        return false;
    }
    if($origin_charset == 'utf8' || $target_charset == 'utf8'){
        return true;
    }

    $similar_charsets = array(
        array('cp1257','iso885913','iso88594'),
        array('koi8u','cp1251','iso88595','koi8r')
    );

    // _GetCharset() returns internal names that may contain underscores
    // (iso_8859_13, koi8_r, ...) while the groups above are written without
    // them, so underscores are stripped before looking the names up.
    $origin_key = str_replace('_','',$origin_charset);
    $target_key = str_replace('_','',$target_charset);

    foreach ($similar_charsets as $group){
        if(in_array($origin_key,$group,true) && in_array($target_key,$group,true)){
            return true;
        }
    }

    return false;
}// -- end of _ConversionIsNeeded -- //
528 * Filters input charset and returns a custom formated value
529 * for class wide usage.
531 * @access private
532 * @param string $charset AkCharset name
533 * @param boolean $set_charset If true will set $this->defaultCharset value
534 * @return mixed AkCharset internal name or FALSE if charset is not
535 * found.
/**
 * Filters input charset and returns the internal name used class wide.
 *
 * The name is lower-cased, stripped of '-', '_', '.' and spaces, has
 * 'windows' / 'ibm' replaced by 'cp', is passed through a cross-reference
 * of known aliases and finally looked up in the table of supported
 * charsets. An empty/null $charset resolves $this->defaultCharset.
 *
 * Results are memoized per requested name. $set_charset is honoured on
 * every call, including calls answered from the memo, and the default
 * charset is memoized under its own name so that changing
 * $this->defaultCharset is picked up.
 *
 * @access private
 * @param string  $charset     Charset name
 * @param boolean $set_charset If true, stores the result in $this->_currentCharset
 * @return mixed Internal charset name or FALSE if the charset is not found
 */
function _GetCharset($charset = null, $set_charset = true)
{
    static $memory = array();

    $requested = ($charset == null) ? $this->defaultCharset : $charset;
    $memory_key = (string)$requested;

    if(!isset($memory[$memory_key])){
        $procesed_charset = str_replace(array('-','_','.',' '),'',strtolower(trim($requested)));
        $procesed_charset = str_replace(array('windows','ibm'),'cp',$procesed_charset);

        $alias_xref = array('437'=>'cp437','850'=>'cp850','852'=>'cp852','855'=>'cp855','857'=>'cp857',
        '860'=>'cp860','861'=>'cp861','862'=>'cp862','863'=>'cp863','865'=>'cp865','866'=>'cp866','869'=>'cp869',
        'ansix341968'=>'ascii','ansix341986'=>'ascii','arabic'=>'iso88596','asmo708'=>'iso88596','big5cp950'=>'big5',
        'cp367'=>'ascii','cp819'=>'iso88591','cpgr'=>'cp869','cpis'=>'cp861','csascii'=>'ascii','csbig5'=>'big5',
        'cscp855'=>'cp855','cscp857'=>'cp857','cscp860'=>'cp860','cscp861'=>'cp861','cscp863'=>'cp863','cscp864'=>'cp864',
        'cscp865'=>'cp865','cscp866'=>'cp866','cscp869'=>'cp869','cseuckr'=>'euckr','cseucpkdfmtjapanese'=>'eucjp',
        'csgb2312'=>'gb18030','csisolatin1'=>'iso88591','csisolatin2'=>'iso88592','csisolatin3'=>'iso88593',
        'csisolatin4'=>'iso88594','csisolatin5'=>'iso88599','csisolatinarabic'=>'iso88596',
        'csisolatincyrillic'=>'iso88595','csisolatingreek'=>'iso88597','csisolatinhebrew'=>'iso88598','cskoi8r'=>'koi8r',
        'cspc850multilingual'=>'cp850','cspc862latinhebrew'=>'cp862','cspc8codepage437'=>'cp437','cspcp852'=>'cp852',
        'csshiftjis'=>'shiftjis','cyrillic'=>'iso88595','ecma114'=>'iso88596','ecma118'=>'iso88597','elot928'=>'iso88597',
        'extendedunixcodepackedformatforjapanese'=>'eucjp','gb2312'=>'gb18030','greek'=>'iso88597','greek8'=>'iso88597',
        'hebrew'=>'iso88598','hkscsbig5'=>'big5hkscs','iso646irv:1991'=>'ascii','iso646us'=>'ascii',
        'iso885914:1998'=>'iso885914','iso88591:1987'=>'iso88591','iso88592:1987'=>'iso88592','iso88593:1988'=>'iso88593',
        'iso88594:1988'=>'iso88594','iso88595:1988'=>'iso88595','iso88596:1987'=>'iso88596','iso88597:1987'=>'iso88597',
        'iso88598:1988'=>'iso88598','iso88599:1989'=>'iso88599','isoceltic'=>'iso885914','isoir100'=>'iso88591',
        'isoir101'=>'iso88592','isoir109'=>'iso88593','isoir110'=>'iso88594','isoir126'=>'iso88597','isoir127'=>'iso88596',
        'isoir138'=>'iso88598','isoir144'=>'iso88595','isoir148'=>'iso88599','isoir166'=>'tis620','isoir179'=>'iso885913',
        'isoir199'=>'iso885914','isoir226'=>'iso885916','isoir6'=>'ascii','l1'=>'iso88591','l10'=>'iso885916','l2'=>'iso88592',
        'l3'=>'iso88593','l4'=>'iso88594','l5'=>'iso88599','l7'=>'iso885913','l8'=>'iso885914','latin1'=>'iso88591',
        'latin10'=>'iso885916','latin2'=>'iso88592','latin3'=>'iso88593','latin4'=>'iso88594','latin5'=>'iso88599',
        'latin7'=>'iso885913','latin8'=>'iso885914','mscyrl'=>'cp1251','mshebr'=>'cp1255','mskanji'=>'shiftjis',
        'sjis'=>'shiftjis','tcabig5'=>'big5','tis6200'=>'tis620','tis62025291'=>'tis620','tis62025330'=>'tis620',
        'us'=>'ascii','usascii'=>'ascii');

        $alias = array(
        'armscii8'=>'armscii_8','ascii'=>'ascii','big5hkscs'=>'big5_hkscs','utf8'=>'utf8',
        'big5'=>'big5','cp1046'=>'cp1046','cp1124'=>'cp1124','cp1125'=>'cp1125','cp1129'=>'cp1129',
        'cp1133'=>'cp1133','cp1161'=>'cp1161','cp1162'=>'cp1162','cp1163'=>'cp1163','cp1250'=>'cp1250',
        'cp1251'=>'cp1251','cp1252'=>'cp1252','cp1253'=>'cp1253','cp1254'=>'cp1254','cp1255'=>'cp1255',
        'cp1256'=>'cp1256','cp1257'=>'cp1257','cp1258'=>'cp1258','cp437'=>'cp437','cp737'=>'cp737',
        'cp775'=>'cp775','cp850'=>'cp850','cp852'=>'cp852','cp853'=>'cp853','cp855'=>'cp855','cp856'=>'cp856',
        'cp857'=>'cp857','cp858'=>'cp858','cp860'=>'cp860','cp861'=>'cp861','cp862'=>'cp862','cp863'=>'cp863',
        'cp864'=>'cp864','cp865'=>'cp865','cp866'=>'cp866','cp869'=>'cp869','cp874'=>'cp874','cp922'=>'cp922',
        'cp932'=>'cp932','cp949'=>'cp949','cp950'=>'cp950','dechanyu'=>'dec_hanyu','deckanji'=>'dec_kanji',
        'euccn'=>'euc_cn','eucjisx0213'=>'euc_jisx0213','eucjp'=>'euc_jp','euckr'=>'euc_kr','euctw'=>'euc_tw',
        'gb18030'=>'gb18030','gbk'=>'gbk','georgianacademy'=>'georgian_academy','georgianps'=>'georgian_ps',
        'hproman8'=>'hp_roman8','iso88591'=>'iso_8859_1','iso885910'=>'iso_8859_10','iso885913'=>'iso_8859_13',
        'iso885914'=>'iso_8859_14','iso885915'=>'iso_8859_15','iso885916'=>'iso_8859_16','iso88592'=>'iso_8859_2',
        'iso88593'=>'iso_8859_3','iso88594'=>'iso_8859_4','iso88595'=>'iso_8859_5','iso88596'=>'iso_8859_6',
        'iso88597'=>'iso_8859_7','iso88598'=>'iso_8859_8','iso88599'=>'iso_8859_9','isoir165'=>'iso_ir_165',
        'iso646cn'=>'iso646_cn','iso646jp'=>'iso646_jp','jisx0201'=>'jis_x0201','johab'=>'johab','koi8r'=>'koi8_r',
        'koi8ru'=>'koi8_ru','koi8t'=>'koi8_t','koi8u'=>'koi8_u','macarabic'=>'macarabic',
        'maccentraleurope'=>'maccentraleurope','maccroatian'=>'maccroatian','maccyrillic'=>'maccyrillic',
        'macgreek'=>'macgreek','machebrew'=>'machebrew','maciceland'=>'maciceland','macroman'=>'macroman',
        'macromania'=>'macromania','macthai'=>'macthai','macturkish'=>'macturkish','macukraine'=>'macukraine',
        'mulelao1'=>'mulelao_1','nextstep'=>'nextstep','riscoslatin1'=>'riscos_latin1','shiftjis'=>'shift_jis',
        'shiftjisx0213'=>'shift_jisx0213','tcvn'=>'tcvn','tds565'=>'tds565','tis620'=>'tis_620','viscii'=>'viscii'
        );

        // First resolve secondary aliases, then map to the internal name.
        if(isset($alias_xref[$procesed_charset])){
            $procesed_charset = $alias_xref[$procesed_charset];
        }
        $memory[$memory_key] = isset($alias[$procesed_charset]) ? $alias[$procesed_charset] : FALSE;
    }

    if($set_charset){
        $this->_currentCharset = $memory[$memory_key];
    }

    return $memory[$memory_key];
}// -- end of _GetCharset -- //
603 * Encodes given string as UTF8 text.
605 * Given string and charset mapping, returns input string as
606 * UTF8 text
608 * @access private
609 * @uses _CharToUtf8
610 * @see _PhpStringRecode
611 * @see _Utf8StringDecode
612 * @param string $string Text to be converted to UTF8
613 * @param array $mapping_array Array containing the charset mapping.
614 * @return string UTF8 String
/**
 * Encodes given string as UTF8 text.
 *
 * Every byte of $string is looked up in $mapping_array and the code point
 * found there is emitted as UTF-8. Bytes without a mapping are passed
 * through unchanged when they are below 0x80 (already valid UTF-8) and
 * replaced by $this->utf8ErrorChar otherwise.
 *
 * @access private
 * @uses _CharToUtf8
 * @see _PhpStringRecode
 * @see _Utf8StringDecode
 * @param string $string        Text to be converted to UTF8
 * @param array  $mapping_array Charset mapping (byte value => code point)
 * @return string UTF8 String
 */
function _Utf8StringEncode($string, $mapping_array)
{
    $string = (string)$string;
    if($string === ''){
        return '';
    }

    $utf8_string = '';
    foreach (unpack('C*', $string) as $byte){
        if(isset($mapping_array[$byte])){
            $utf8_string .= $this->_CharToUtf8((int)$mapping_array[$byte]);
        }elseif($byte < 0x80){
            // Unmapped 7-bit byte: emit the byte itself, not its decimal value.
            $utf8_string .= chr($byte);
        }else{
            $utf8_string .= $this->utf8ErrorChar;
        }
    }

    return $utf8_string;
}// -- end of _Utf8StringEncode -- //
632 * Decodes data, assumed to be UTF-8 encoded given its
633 * equivalence map.
635 * @access private
636 * @uses _Utf8
637 * @uses ToChar
638 * @see _PhpStringRecode
639 * @see _Utf8StringEncode
640 * @param string $utf_string UTF8 string
641 * @param array $mapping_array Mapping array
642 * @return string Decoded string
/**
 * Decodes data, assumed to be UTF-8 encoded, using the given equivalence map.
 *
 * The string is split into byte values and handed to _Utf8ToChar() one
 * sequence at a time; _Utf8ToChar() advances the position past any
 * continuation bytes it consumes.
 *
 * @access private
 * @uses _Utf8ToChar
 * @see _PhpStringRecode
 * @see _Utf8StringEncode
 * @param string $utf_string    UTF8 string
 * @param array  $mapping_array Mapping array
 * @return string Decoded string
 */
function _Utf8StringDecode($utf_string, $mapping_array)
{
    $bytes = unpack('C*', $utf_string);
    $total = count($bytes);
    $decoded = '';

    // unpack() numbers its elements from 1.
    $position = 1;
    while($position <= $total){
        $decoded .= $this->_Utf8ToChar($bytes, $position, $mapping_array);
        $position++;
    }

    return $decoded;
}// -- end of _Utf8StringDecode -- //
657 * Converts a single character to its UTF8 representation
659 * @access protected
660 * @see _Utf8StringEncode
661 * @param string $char Char to be converted
662 * @return string UTF8 char
/**
 * Converts a single code point to its UTF8 representation.
 *
 * Values that are not valid Unicode scalar values (negative, inside the
 * surrogate range 0xD800-0xDFFF, or above 0x10FFFF) cannot be encoded and
 * yield $this->utf8ErrorChar instead.
 *
 * @access protected
 * @see _Utf8StringEncode
 * @param integer $char Code point to be converted
 * @return string UTF8 char
 */
function _CharToUtf8($char)
{
    $char = (int)$char;

    if($char < 0 || $char > 0x10FFFF || ($char >= 0xD800 && $char <= 0xDFFF)){
        return $this->utf8ErrorChar;
    }
    // 1 byte
    if($char < 0x80){
        return chr($char);
    }
    // 2 bytes
    if($char < 0x800){
        return chr(0xC0 | ($char >> 6)) . chr(0x80 | ($char & 0x3F));
    }
    // 3 bytes
    if($char < 0x10000){
        return chr(0xE0 | ($char >> 12)) . chr(0x80 | (($char >> 6) & 0x3F)) . chr(0x80 | ($char & 0x3F));
    }
    // 4 bytes
    return chr(0xF0 | ($char >> 18)) . chr(0x80 | (($char >> 12) & 0x3F)) . chr(0x80 | (($char >> 6) & 0x3F)) . chr(0x80 | ($char & 0x3F));
}// -- end of _CharToUtf8 -- //
684 * Decodes a single UTF8 char to it's representation as
685 * specified in the mapping array
687 * @access private
688 * @see _Utf8StringDecode
689 * @param array $chars Assoc array with chars to be decoded
690 * @param integer &$id Current char position
691 * @param array $mapping_array Mapping Array
692 * @return string Decoded char
/**
 * Decodes a single UTF8 sequence to its representation as specified in
 * the mapping array.
 *
 * The byte at $chars[$id] is taken as the lead byte. For a well-formed
 * multi-byte sequence $id is advanced to the last byte of the sequence.
 * For a malformed sequence (invalid lead byte, stray continuation byte,
 * missing or invalid continuation bytes) only the lead byte is consumed
 * and $this->utf8ErrorChar is returned.
 *
 * @access private
 * @see _Utf8StringDecode
 * @param array   $chars         Byte values of the string, as produced by unpack('C*')
 * @param integer &$id           Current position in $chars
 * @param array   $mapping_array Mapping Array (code point => byte value)
 * @return string Decoded char, or $this->utf8ErrorChar when there is no match
 */
function _Utf8ToChar($chars, &$id, $mapping_array)
{
    $lead = $chars[$id];

    if($lead < 128){
        $extra = 0;
        $utf = $lead;
    }elseif($lead >= 192 && $lead <= 223){
        $extra = 1;
        $utf = $lead - 192;
    }elseif($lead >= 224 && $lead <= 239){
        $extra = 2;
        $utf = $lead - 224;
    }elseif($lead >= 240 && $lead <= 247){
        $extra = 3;
        $utf = $lead - 240;
    }else{
        // Stray continuation byte (128-191) or invalid lead byte (248-255).
        return $this->utf8ErrorChar;
    }

    for($n = 1; $n <= $extra; $n++){
        // Each continuation byte must exist and have the form 10xxxxxx.
        if(!isset($chars[$id + $n]) || ($chars[$id + $n] & 0xC0) != 0x80){
            return $this->utf8ErrorChar;
        }
        $utf = ($utf << 6) | ($chars[$id + $n] & 0x3F);
    }
    $id += $extra;

    if(array_key_exists($utf,$mapping_array)){
        return chr($mapping_array[$utf]);
    }

    return $this->utf8ErrorChar;
}// -- end of _Utf8ToChar -- //
713 function isUtf8($text = '')
715 // From http://w3.org/International/questions/qa-forms-utf-8.html
716 return preg_match('%^(?:[\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*$%xs', $text);