Created a new folder in _templates to house site icons. Put the new RSS standard...
[elgg.git] / lib / typo3 / class.t3lib_cs.php
blobde7b88c50691916e075df542afd9c2c2527931cf
1 <?php
2 /***************************************************************
3 * Copyright notice
5 * (c) 2003-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
6 * All rights reserved
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
27 * $Id: class.t3lib_cs.php,v 1.54 2005/12/12 21:47:50 masi Exp $
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
37 * 136: class t3lib_cs
38 * 503: function parse_charset($charset)
39 * 522: function get_locale_charset($locale)
41 * SECTION: Charset Conversion functions
42 * 575: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 615: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 632: function utf8_encode($str,$charset)
45 * 678: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 721: function utf8_to_entities($str)
47 * 754: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 788: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 838: function UnumberToChar($cbyte)
50 * 883: function utf8CharToUnumber($str,$hex=0)
52 * SECTION: Init functions
53 * 926: function initCharset($charset)
54 * 988: function initUnicodeData($mode=null)
55 * 1213: function initCaseFolding($charset)
56 * 1275: function initToASCII($charset)
58 * SECTION: String operation functions
59 * 1346: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1412: function crop($charset,$string,$len,$crop='')
62 * 1465: function strtrunc($charset,$string,$len)
63 * 1499: function conv_case($charset,$string,$case)
64 * 1525: function specCharsToASCII($charset,$string)
66 * SECTION: Internal string operation functions
67 * 1565: function sb_char_mapping($str,$charset,$mode,$opt='')
69 * SECTION: Internal UTF-8 string operation functions
70 * 1620: function utf8_substr($str,$start,$len=null)
71 * 1653: function utf8_strlen($str)
72 * 1674: function utf8_strtrunc($str,$len)
73 * 1696: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1719: function utf8_strrpos($haystack,$needle)
75 * 1739: function utf8_char2byte_pos($str,$pos)
76 * 1780: function utf8_byte2char_pos($str,$pos)
77 * 1803: function utf8_char_mapping($str,$mode,$opt='')
79 * SECTION: Internal EUC string operation functions
80 * 1879: function euc_strtrunc($str,$len,$charset)
81 * 1908: function euc_substr($str,$start,$charset,$len=null)
82 * 1933: function euc_strlen($str,$charset)
83 * 1960: function euc_char2byte_pos($str,$pos,$charset)
84 * 2001: function euc_char_mapping($str,$charset,$mode,$opt='')
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
98 /**
99 * Notes on UTF-8
101 * Functions working on UTF-8 strings:
103 * - strchr/strstr
104 * - strrchr
105 * - substr_count
106 * - implode/explode/join
108 * Functions nearly working on UTF-8 strings:
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
115 * Functions NOT working on UTF-8 strings:
117 * - str*cmp
118 * - stristr
119 * - stripos
120 * - substr
121 * - strrev
122 * - ereg/eregi
123 * - split/spliti
124 * - preg_*
125 * - ...
129 * Class for conversion between charsets
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
133 * @package TYPO3
134 * @subpackage t3lib
136 class t3lib_cs {
137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
139 // This is the array where parsed conversion tables are stored (cached)
140 var $parsedCharsets=array();
142 // An array where case folding data will be stored (cached)
143 var $caseFolding=array();
145 // An array where charset-to-ASCII mappings are stored (cached)
146 var $toASCII=array();
148 // This tells the converter which charsets have two bytes per char:
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
153 // This tells the converter which charsets have four bytes per char:
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
159 // This tells the converter which charsets use a scheme like the Extended Unix Code:
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
168 // http://czyborra.com/charsets/iso8859.html
169 var $synonyms=array(
170 'us' => 'ascii',
171 'us-ascii'=> 'ascii',
172 'cp819' => 'iso-8859-1',
173 'ibm819' => 'iso-8859-1',
174 'iso-ir-100' => 'iso-8859-1',
175 'iso-ir-109' => 'iso-8859-2',
176 'iso-ir-148' => 'iso-8859-9',
177 'iso-ir-199' => 'iso-8859-14',
178 'iso-ir-203' => 'iso-8859-15',
179 'csisolatin1' => 'iso-8859-1',
180 'csisolatin2' => 'iso-8859-2',
181 'csisolatin3' => 'iso-8859-3',
182 'csisolatin5' => 'iso-8859-9',
183 'csisolatin8' => 'iso-8859-14',
184 'csisolatin9' => 'iso-8859-15',
185 'csisolatingreek' => 'iso-8859-7',
186 'iso-celtic' => 'iso-8859-14',
187 'latin1' => 'iso-8859-1',
188 'latin2' => 'iso-8859-2',
189 'latin3' => 'iso-8859-3',
190 'latin5' => 'iso-8859-9',
191 'latin6' => 'iso-8859-10',
192 'latin8' => 'iso-8859-14',
193 'latin9' => 'iso-8859-15',
194 'l1' => 'iso-8859-1',
195 'l2' => 'iso-8859-2',
196 'l3' => 'iso-8859-3',
197 'l5' => 'iso-8859-9',
198 'l6' => 'iso-8859-10',
199 'l8' => 'iso-8859-14',
200 'l9' => 'iso-8859-15',
201 'cyrillic' => 'iso-8859-5',
202 'arabic' => 'iso-8859-6',
203 'tis-620' => 'iso-8859-11',
204 'win874' => 'windows-874',
205 'win1250' => 'windows-1250',
206 'win1251' => 'windows-1251',
207 'win1252' => 'windows-1252',
208 'win1253' => 'windows-1253',
209 'win1254' => 'windows-1254',
210 'win1255' => 'windows-1255',
211 'win1256' => 'windows-1256',
212 'win1257' => 'windows-1257',
213 'win1258' => 'windows-1258',
214 'cp1250' => 'windows-1250',
215 'cp1251' => 'windows-1251',
216 'cp1252' => 'windows-1252',
217 'ms-ee' => 'windows-1250',
218 'ms-ansi' => 'windows-1252',
219 'ms-greek' => 'windows-1253',
220 'ms-turk' => 'windows-1254',
221 'winbaltrim' => 'windows-1257',
222 'koi-8ru' => 'koi-8r',
223 'koi8r' => 'koi-8r',
224 'cp878' => 'koi-8r',
225 'mac' => 'macroman',
226 'macintosh' => 'macroman',
227 'euc-cn' => 'gb2312',
228 'x-euc-cn' => 'gb2312',
229 'euccn' => 'gb2312',
230 'cp936' => 'gb2312',
231 'big-5' => 'big5',
232 'cp950' => 'big5',
233 'eucjp' => 'euc-jp',
234 'sjis' => 'shift_jis',
235 'shift-jis' => 'shift_jis',
236 'cp932' => 'shift_jis',
237 'cp949' => 'euc-kr',
238 'utf7' => 'utf-7',
239 'utf8' => 'utf-8',
240 'utf16' => 'utf-16',
241 'utf32' => 'utf-32',
242 'utf8' => 'utf-8',
243 'ucs2' => 'ucs-2',
244 'ucs4' => 'ucs-4',
247 // mapping of iso-639:2 language codes to language (family) names
248 var $lang_to_langfamily=array(
249 // iso-639:2 language codes, see:
250 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
251 // http://www.unicode.org/onlinedat/languages.html
252 'ar' => 'arabic',
253 'bg' => 'cyrillic',
254 'cs' => 'east_european',
255 'da' => 'west_european',
256 'de' => 'west_european',
257 'es' => 'west_european',
258 'et' => 'estonian',
259 'eu' => 'west_european',
260 'fi' => 'west_european',
261 'fr' => 'west_european',
262 'gr' => 'greek',
263 'hr' => 'east_european',
264 'hu' => 'east_european',
265 'iw' => 'hebrew',
266 'is' => 'west_european',
267 'it' => 'west_european',
268 'ja' => 'japanese',
269 'kl' => 'west_european',
270 'ko' => 'korean',
271 'lt' => 'lithuanian',
272 'lv' => 'west_european', // Latvian/Lettish
273 'nl' => 'west_european',
274 'no' => 'west_european',
275 'pl' => 'east_european',
276 'pt' => 'west_european',
277 'ro' => 'east_european',
278 'ru' => 'cyrillic',
279 'sk' => 'east_european',
280 'sl' => 'east_european',
281 'sv' => 'west_european',
282 'th' => 'thai',
283 'uk' => 'cyrillic',
284 'vi' => 'vietnamese',
285 'zh' => 'chinese',
286 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
287 'chs' => 'simpl_chinese',
288 'cht' => 'trad_chinese',
289 'csy' => 'east_european',
290 'dan' => 'west_european',
291 'deu' => 'west_european',
292 'dea' => 'west_european',
293 'des' => 'west_european',
294 'ena' => 'west_european',
295 'enc' => 'west_european',
296 'eng' => 'west_european',
297 'enz' => 'west_european',
298 'enu' => 'west_european',
299 'nld' => 'west_european',
300 'nlb' => 'west_european',
301 'fin' => 'west_european',
302 'fra' => 'west_european',
303 'frb' => 'west_european',
304 'frc' => 'west_european',
305 'frs' => 'west_european',
306 'ell' => 'greek',
307 'hun' => 'east_european',
308 'isl' => 'west_euorpean',
309 'ita' => 'west_european',
310 'its' => 'west_european',
311 'jpn' => 'japanese',
312 'kor' => 'korean',
313 'nor' => 'west_european',
314 'non' => 'west_european',
315 'plk' => 'east_european',
316 'ptg' => 'west_european',
317 'ptb' => 'west_european',
318 'rus' => 'east_european',
319 'sky' => 'east_european',
320 'esp' => 'west_european',
321 'esm' => 'west_european',
322 'esn' => 'west_european',
323 'sve' => 'west_european',
324 'trk' => 'turkish',
325 // English language names
326 'bulgarian' => 'east_european',
327 'catalan' => 'west_european',
328 'croatian' => 'east_european',
329 'czech' => 'east_european',
330 'danish' => 'west_european',
331 'dutch' => 'west_european',
332 'english' => 'west_european',
333 'finnish' => 'west_european',
334 'french' => 'west_european',
335 'galician' => 'west_european',
336 'german' => 'west_european',
337 'hungarian' => 'east_european',
338 'icelandic' => 'west_european',
339 'italian' => 'west_european',
340 'latvian' => 'west_european',
341 'lettish' => 'west_european',
342 'norwegian' => 'west_european',
343 'polish' => 'east_european',
344 'portuguese' => 'west_european',
345 'russian' => 'cyrillic',
346 'romanian' => 'east_european',
347 'slovak' => 'east_european',
348 'slovenian' => 'east_european',
349 'spanish' => 'west_european',
350 'svedish' => 'west_european',
351 'turkish' => 'east_european',
352 'ukrainian' => 'cyrillic',
355 // mapping of language (family) names to charsets on Unix
356 var $lang_to_charset_unix=array(
357 'west_european' => 'iso-8859-1',
358 'estonian' => 'iso-8859-1',
359 'east_european' => 'iso-8859-2',
360 'baltic' => 'iso-8859-4',
361 'cyrillic' => 'iso-8859-5',
362 'arabic' => 'iso-8859-6',
363 'greek' => 'iso-8859-7',
364 'hebrew' => 'iso-8859-8',
365 'turkish' => 'iso-8859-9',
366 'thai' => 'iso-8859-11', // = TIS-620
367 'lithuanian' => 'iso-8859-13',
368 'chinese' => 'gb2312', // = euc-cn
369 'japanese' => 'euc-jp',
370 'korean' => 'euc-kr',
371 'simpl_chinese' => 'gb2312',
372 'trad_chinese' => 'big5',
373 'vietnamese' => '',
376 // mapping of language (family) names to charsets on Windows
377 var $lang_to_charset_windows=array(
378 'east_european' => 'windows-1250',
379 'cyrillic' => 'windows-1251',
380 'west_european' => 'windows-1252',
381 'greek' => 'windows-1253',
382 'turkish' => 'windows-1254',
383 'hebrew' => 'windows-1255',
384 'arabic' => 'windows-1256',
385 'baltic' => 'windows-1257',
386 'estonian' => 'windows-1257',
387 'lithuanian' => 'windows-1257',
388 'vietnamese' => 'windows-1258',
389 'thai' => 'cp874',
390 'korean' => 'cp949',
391 'chinese' => 'gb2312',
392 'japanese' => 'shift_jis',
393 'simpl_chinese' => 'gb2312',
394 'trad_chinese' => 'big5',
397 // mapping of locale names to charsets
398 var $locale_to_charset=array(
399 'japanese.euc' => 'euc-jp',
400 'ja_jp.ujis' => 'euc-jp',
401 'korean.euc' => 'euc-kr',
402 'zh_cn' => 'gb2312',
403 'zh_hk' => 'big5',
404 'zh_tw' => 'big5',
407 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
408 // An empty value means "iso-8859-1"
409 var $charSetArray = array(
410 'dk' => '',
411 'de' => '',
412 'no' => '',
413 'it' => '',
414 'fr' => '',
415 'es' => '',
416 'nl' => '',
417 'cz' => 'windows-1250',
418 'pl' => 'iso-8859-2',
419 'si' => 'windows-1250',
420 'fi' => '',
421 'tr' => 'iso-8859-9',
422 'se' => '',
423 'pt' => '',
424 'ru' => 'windows-1251',
425 'ro' => 'iso-8859-2',
426 'ch' => 'gb2312',
427 'sk' => 'windows-1250',
428 'lt' => 'windows-1257',
429 'is' => 'utf-8',
430 'hr' => 'windows-1250',
431 'hu' => 'iso-8859-2',
432 'gl' => '',
433 'th' => 'iso-8859-11',
434 'gr' => 'iso-8859-7',
435 'hk' => 'big5',
436 'eu' => '',
437 'bg' => 'windows-1251',
438 'br' => '',
439 'et' => 'iso-8859-4',
440 'ar' => 'iso-8859-6',
441 'he' => 'utf-8',
442 'ua' => 'windows-1251',
443 'jp' => 'shift_jis',
444 'lv' => 'utf-8',
445 'vn' => 'utf-8',
446 'ca' => 'iso-8859-15',
447 'ba' => 'iso-8859-2',
448 'kr' => 'euc-kr',
449 'eo' => 'utf-8',
450 'my' => '',
451 'hi' => 'utf-8',
454 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
455 // An empty value means the ISO name is the same as the TYPO3 language key
456 var $isoArray = array(
457 'dk' => 'da',
458 'de' => '',
459 'no' => '',
460 'it' => '',
461 'fr' => '',
462 'es' => '',
463 'nl' => '',
464 'cz' => 'cs',
465 'pl' => '',
466 'si' => 'sl',
467 'fi' => '',
468 'tr' => '',
469 'se' => 'sv',
470 'pt' => '',
471 'ru' => '',
472 'ro' => '',
473 'ch' => 'zh_CN',
474 'sk' => '',
475 'lt' => '',
476 'is' => '',
477 'hr' => '',
478 'hu' => '',
479 'gl' => '', // Greenlandic
480 'th' => '',
481 'gr' => 'el',
482 'hk' => 'zh_HK',
483 'eu' => '',
484 'bg' => '',
485 'br' => 'pt_BR',
486 'et' => '',
487 'ar' => '',
488 'he' => 'iw',
489 'ua' => 'uk',
490 'jp' => 'ja',
491 'lv' => '',
492 'vn' => 'vi',
493 'ca' => '',
494 'ba' => '', // Bosnian
495 'kr' => '',
/**
 * Normalizes a charset name: the name is lowercased and, if it is a known
 * alias listed in $this->synonyms, replaced by the canonical name.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = strtolower($charset);

	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
/**
 * Get the charset of a locale.
 *
 * Supported locale formats:
 * ln			language
 * ln_CN		language / country
 * ln_CN.cs		language / country / charset
 * ln_CN.cs@mod	language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part, the "euro" modifier, then the language family tables for the current OS.
 * Falls back to "iso-8859-1" when nothing matches.
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];

		// get modifier (array_pad() supplies the second element when there is no '@')
	list($locale,$modifier) = array_pad(explode('@',$locale),2,'');

		// locale contains charset: use it
	list($locale,$charset) = array_pad(explode('.',$locale),2,'');
	if ($charset) return $this->parse_charset($charset);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier === 'euro') return 'iso-8859-15';

		// get language (the country part is not needed for the lookup)
	list($language) = explode('_',$locale);
	if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language];

		// look up the family in the table for the current OS; unknown families yield ''
	if (TYPO3_OS == 'WIN') {
		$cs = isset($this->lang_to_charset_windows[$language]) ? $this->lang_to_charset_windows[$language] : '';
	} else {
		$cs = isset($this->lang_to_charset_unix[$language]) ? $this->lang_to_charset_unix[$language] : '';
	}

	return $cs ? $cs : 'iso-8859-1';
}
561 /********************************************
563 * Charset Conversion functions
565 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * If $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] names one of
 * "mbstring", "iconv" or "recode" and the corresponding function is available,
 * that library is tried first. If it is not configured, not loaded, or returns
 * FALSE, the conversion is done with utf8_encode()/utf8_decode() of this class.
 *
 * @param	string		Input string
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Converted string
 * @see convArray()
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) return $str;

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
			// The setting is optional; treat a missing value as "no external library"
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] : '';

		switch($convMethod) {
			case 'mbstring':
					// function_exists() guard: fall back instead of dying if the extension is not loaded
				if (function_exists('mb_convert_encoding')) {
					$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
					if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets
				}
			break;

			case 'iconv':
				if (function_exists('iconv')) {
					$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;

			case 'recode':
				if (function_exists('recode_string')) {
					$conv_str = recode_string($fromCS.'..'.$toCS,$str);
					if (false !== $conv_str) return $conv_str;
				}
			break;
		}
			// fallback to TYPO3 conversion
	}

	if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
	return $str;
}
/**
 * Converts every scalar element of an array from one charset to another,
 * descending into nested arrays.
 * NOTICE: The array is passed by reference and modified in place!
 *
 * @param	string		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 * @see conv()
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $idx => $unused) {
		if (!is_array($array[$idx])) {
				// leaf value: convert in place
			$array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}
			// nested array: recurse
		$this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
	}
}
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged for single-byte and EUC-like
 * charsets. Everything else is looked up in the parsed conversion table;
 * characters without a table entry are replaced by chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. Nothing (NULL) is returned if the conversion table for $charset could not be initialized.
 */
function utf8_encode($str,$charset) {

	if ($charset === 'utf-8') return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) { // Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);

		for ($a=0;$a<$strLen;$a++) { // Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);
			if ($isTwoByteSet) { // If the charset has two bytes per char
					// A dangling last byte is treated as if followed by a zero byte
				$ord2 = ($a+1 < $strLen) ? ord($str[$a+1]) : 0;
					// Combine both bytes, big endian assumed. This has to be a bitwise OR:
					// "($ord<<8) & $ord2" is always 0 because the low byte of $ord<<8 is empty.
				$ord = ($ord << 8) | $ord2;

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that...
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} elseif ($ord < 128) { // Conversion tables only hold values above 127 (see initCharset()), so ASCII values are passed through
					$outStr .= chr($ord);
				} else $outStr .= chr($this->noCharByteVal); // No char exists
				$a++; // Skip the second byte
			} elseif ($ord>127) { // If char has value over 127 it's a multibyte char in UTF-8
				if ($isEucBasedSet) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that...
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else $outStr .= chr($this->noCharByteVal); // No char exists
			} else $outStr .= $chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
		}
		return $outStr;
	}
}
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied as they are. Each multibyte sequence is collected and
 * looked up in the parsed conversion table of $charset. Sequences without a
 * table entry become either a numeric entity or chr($this->noCharByteVal).
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) { // Makes sure the conversion table is loaded
		$total = strlen($str);
		$result = '';

		for ($pos=0; $pos<$total; $pos++) { // Walk through the UTF-8 string byte by byte
			$byte = substr($str,$pos,1);
			$code = ord($byte);

			if ($code <= 127) { // Plain ASCII, one byte: transparent
				$result .= $byte;
				continue;
			}
			if (!($code & 64)) { // A lead byte must have the 7th bit set, so this is the MIDDLE of a sequence
				$result .= chr($this->noCharByteVal);
				continue;
			}

				// Collect the whole sequence: every set high bit after the first announces one more byte
			$seq = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$seq .= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$seq])) { // Sequence is known in the target charset
				$local = $this->parsedCharsets[$charset]['utf8'][$seq]; // The local number
					// Numbers above 255 are written as two bytes (16bit word assumed)
				$result .= ($local > 255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
			} elseif ($useEntityForNoChar) { // Create num entity:
				$result .= '&#'.$this->utf8CharToUnumber($seq,1).';';
			} else {
				$result .= chr($this->noCharByteVal); // No char exists
			}
		}
		return $result;
	}
}
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each UTF-8 multibyte sequence is replaced by a hexadecimal numeric entity.
 * Bytes that can only be the middle of a sequence are replaced by
 * chr($this->noCharByteVal).
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$total = strlen($str);
	$result = '';

	for ($pos=0; $pos<$total; $pos++) { // Walk through the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) { // Plain ASCII, one byte: transparent
			$result .= $byte;
		} elseif (!($code & 64)) { // A lead byte must have the 7th bit set, so this is the MIDDLE of a sequence
			$result .= chr($this->noCharByteVal);
		} else {
				// Collect the whole sequence: every set high bit after the first announces one more byte
			$seq = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) break;
				$pos++;
				$seq .= substr($str,$pos,1);
			}
			$result .= '&#'.$this->utf8CharToUnumber($seq,1).';';
		}
	}

	return $result;
}
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * The string is split with preg_split() (the ereg functions no longer exist in PHP 7).
 * Numeric entities are only converted if they are well-formed (digits only, or
 * "x"/"X" followed by hex digits); anything else is left in the string untouched.
 * Named entities are decoded directly to UTF-8 with html_entity_decode(); unknown
 * names are left untouched.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		// With PREG_SPLIT_DELIM_CAPTURE the result alternates: literal text on even keys,
		// the entity content (between "&" and ";") on odd keys.
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);

	foreach($parts as $k => $v) {
		if (!($k%2)) continue; // Literal text, nothing to do

		$entity = '&'.$v.';';
		if (substr($v,0,1)=='#') { // Dec or hex entities:
			$number = substr($v,1);
			if (preg_match('/^[xX][0-9a-fA-F]+$/',$number)) {
				$parts[$k] = $this->UnumberToChar(hexdec(substr($number,1)));
			} elseif (preg_match('/^[0-9]+$/',$number)) {
				$parts[$k] = $this->UnumberToChar(intval($number));
			} else { // Malformed number: no conversion
				$parts[$k] = $entity;
			}
		} elseif ($alsoStdHtmlEnt) { // Other entities; returned unchanged if the name is unknown
			$parts[$k] = html_entity_decode($entity,ENT_COMPAT,'UTF-8');
		} else { // No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('',$parts);
}
/**
 * Splits a UTF-8 string into its characters and returns them in an array,
 * either as UNICODE integer numbers or as the UTF-8 characters themselves.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// Entities are turned into real UTF-8 characters first, if requested:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$total = strlen($str);
	$chars = array();

	for ($pos=0; $pos<$total; $pos++) { // Walk through the UTF-8 string byte by byte
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) { // Plain ASCII, one byte: transparent
			$chars[] = $retChar ? $byte : $code;
			continue;
		}
		if (!($code & 64)) { // A lead byte must have the 7th bit set, so this is the MIDDLE of a sequence
			$chars[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the whole sequence: every set high bit after the first announces one more byte
		$seq = $byte;
		for ($n=0; $n<8; $n++) {
			$code = $code << 1;
			if (!($code & 128)) break;
			$pos++;
			$seq .= substr($str,$pos,1);
		}

		$chars[] = $retChar ? $seq : $this->utf8CharToUnumber($seq);
	}

	return $chars;
}
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The bits of the number are spread across the bytes of the sequence. The number
 * of high bits set in the lead byte announces the length of the sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	if ($cbyte < 0x80) { // 1 byte
		return chr($cbyte);
	}

	if ($cbyte < 0x800) { // 2 bytes
		return chr(0xC0 | ($cbyte >> 6))
			.chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x10000) { // 3 bytes
		return chr(0xE0 | ($cbyte >> 12))
			.chr(0x80 | (($cbyte >> 6) & 0x3F))
			.chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x200000) { // 4 bytes
		return chr(0xF0 | ($cbyte >> 18))
			.chr(0x80 | (($cbyte >> 12) & 0x3F))
			.chr(0x80 | (($cbyte >> 6) & 0x3F))
			.chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x4000000) { // 5 bytes
		return chr(0xF8 | ($cbyte >> 24))
			.chr(0x80 | (($cbyte >> 18) & 0x3F))
			.chr(0x80 | (($cbyte >> 12) & 0x3F))
			.chr(0x80 | (($cbyte >> 6) & 0x3F))
			.chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x80000000) { // 6 bytes
		return chr(0xFC | ($cbyte >> 30))
			.chr(0x80 | (($cbyte >> 24) & 0x3F))
			.chr(0x80 | (($cbyte >> 18) & 0x3F))
			.chr(0x80 | (($cbyte >> 12) & 0x3F))
			.chr(0x80 | (($cbyte >> 6) & 0x3F))
			.chr(0x80 | ($cbyte & 0x3F));
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * The payload bits of the lead byte and of every continuation byte are
 * concatenated as a string of binary digits, which is then turned into a number.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as a string prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1)); // First byte

	if (($lead & 192) != 192) { // Not a lead byte of a multibyte sequence: the value is the byte itself
		$int = $lead;
	} else {
		$bits = '';
		$shifted = $lead;
		for ($n=0; $n<8; $n++) { // One round per announced continuation byte
			$shifted = $shifted << 1;
			if (!($shifted & 128)) break; // No more high bits set: end of sequence
				// The lower 6 bits of the continuation byte carry the payload
			$bits .= substr('00000000'.decbin(ord(substr($str,$n+1,1))),-6);
		}
			// $n is now the number of continuation bytes; the lead byte contributes its lowest (6-$n) bits
		$bits = substr('00000000'.decbin($lead),-(6-$n)).$bits;

		$int = bindec($bits);
	}

	return $hex ? 'x'.dechex($int) : $int;
}
912 /********************************************
914 * Init functions
916 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is kept in $this->parsedCharsets[$charset] with two maps:
 * 'local' (local char number => UTF-8 sequence) and 'utf8' (the reverse).
 * Only local values above 127 are stored. The parsed table is cached as a
 * serialized file below typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';

		// If the conversion table is not found:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	$table = false;
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() of file content - acceptable only as long as typo3temp/cs/ is writable by trusted code alone.
		$table = unserialize(t3lib_div::getUrl($cacheFile));
	}

		// No cache file, or its content is not a usable table: parse the conversion table
	if (!is_array($table)) {
			// Parse conversion table into lines:
		$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		$table = array('local'=>array(),'utf8'=>array());

			// The "whitespaced" type is on the syntax "0x0A	0x000A	#LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
		$detectedType = '';

		foreach($lines as $value) {
			if (!trim($value) || substr($value,0,1)=='#') continue; // Comment line or blanks are ignored.

				// Detect type if not done yet: (Done on first real line)
			if (!$detectedType) $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';

				// Lines that do not fit the detected type yield empty strings, i.e. value 0, and are skipped below
			$hexbyte = '';
			$utf8 = '';
			if ($detectedType=='ms-token') {
				list($hexbyte,$utf8) = array_pad(preg_split('/=|:/',$value,3),2,'');
			} elseif ($detectedType=='whitespaced') {
				$regA = array();
				if (preg_match($whitespacedPattern,$value,$regA)) {
					$hexbyte = $regA[1];
					$utf8 = 'U+'.$regA[2];
				}
			}

			$decval = hexdec(trim($hexbyte));
			if ($decval>127) {
				$utf8decval = hexdec(substr(trim($utf8),2)); // Strips the "U+" prefix
				$utf8Char = $this->UnumberToChar($utf8decval);
				$table['local'][$decval] = $utf8Char;
				$table['utf8'][$utf8Char] = $decval;
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($table));
		}
	}

	$this->parsedCharsets[$charset] = $table;
	return 2;
}
/**
 * This function initializes all UTF-8 character data tables
 * ($this->caseFolding['utf-8'] and $this->toASCII['utf-8']).
 * Both tables are always built together from PATH_t3lib.'unidata/' and written to
 * cache files in typo3temp/cs/; $mode only selects which "already loaded"/"cached" shortcut applies.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode=null) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();			// array of chars that are marks (eg. composing accents)
	$number = array();			// array of chars that are numbers (eg. digits)
	$omit = array();			// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === false) break;				// read error / end of file
		if (rtrim($line) == '') continue;		// blank lines carry no data

			// has a lot of info; pad so that short lines do not produce undefined offsets
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch ((string)substr($cat, 0, 1)) {
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
				break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
					break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
					break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) continue 2;
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === false) break;
				if (substr($line, 0, 1) == '#' || trim($line) == '') continue;

				list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// only unconditional mappings are used (condition empty or just a trailing comment)
				if ($cond != '' && substr($cond, 0, 1) != '#') continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				if ($char != $lower) {
					$arr = explode(' ', $lower);
					foreach ($arr as $k => $hex) $arr[$k] = $this->UnumberToChar(hexdec($hex));
					$utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
				}
				if ($char != $title && $title != $upper) {
					$arr = explode(' ', $title);
					foreach ($arr as $k => $hex) $arr[$k] = $this->UnumberToChar(hexdec($hex));
					$utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
				}
				if ($char != $upper) {
					$arr = explode(' ', $upper);
					foreach ($arr as $k => $hex) $arr[$k] = $this->UnumberToChar(hexdec($hex));
					$utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === false) break;
				if (substr($line, 0, 1) == '#' || trim($line) == '') continue;

				list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {	// do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp, chr($ord));
		}
		$ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
	}

		// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec($from));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
/**
 * Builds the case folding table ($this->caseFolding[$charset]) for a charset other than UTF-8.
 * The table is derived from the UTF-8 folding table via the charset's conversion table
 * and cached in typo3temp/cs/cscase_[charset].tbl.
 * This function is automatically called by the case folding functions.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Nothing to do when the table is already in memory
	if (is_array($this->caseFolding[$charset])) return 1;

		// Prefer the serialized table from the cache directory
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset conversion table and the UTF-8 folding table are required
	if (!$this->initCharset($charset)) return false;
	if (!$this->initUnicodeData('case')) return false;

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8Char, $charset);

		foreach ($directions as $direction) {
			$folded = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8Char], $charset);
				// keep only mappings that can be represented in the target charset
			if ($folded != '' && $folded != $nochar) {
				$this->caseFolding[$charset][$direction][$localChar] = $folded;
			}
		}
	}

		// add the ASCII case table (letters differ by 32 between the two cases)
	for ($offset = 0; $offset < 26; $offset++) {
		$lowerLetter = chr(ord('a') + $offset);
		$upperLetter = chr(ord('A') + $offset);
		$this->caseFolding[$charset]['toUpper'][$lowerLetter] = $upperLetter;
		$this->caseFolding[$charset]['toLower'][$upperLetter] = $lowerLetter;
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
/**
 * Builds the to-ASCII transliteration table ($this->toASCII[$charset]) for a charset other than UTF-8.
 * Entries are copied from the UTF-8 transliteration table for every character of the charset's
 * conversion table; the result is cached in typo3temp/cs/csascii_[charset].tbl.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Nothing to do when the table is already in memory
	if (is_array($this->toASCII[$charset])) return 1;

		// Prefer the serialized table from the cache directory
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset conversion table and the UTF-8 transliteration table are required
	if (!$this->initCharset($charset)) return false;
	if (!$this->initUnicodeData('ascii')) return false;

	$nochar = chr($this->noCharByteVal);
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8Char, $charset);

		if (!isset($this->toASCII['utf-8'][$utf8Char])) continue;
		$this->toASCII[$charset][$localChar] = $this->toASCII['utf-8'][$utf8Char];
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
1330 /********************************************
1332 * String operation functions
1334 ********************************************/
/**
 * Returns a part of a string.
 * Dispatches to mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise to the internal
 * UTF-8 / EUC / fixed-width implementations; anything else is treated as single-byte.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); omit (NULL) to get everything from $start on
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len === 0) return '';

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len == null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string, $start, $len, $charset);
	} elseif ($utils == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len == null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			iconv_set_encoding('internal_encoding', $enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string, $start, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	} elseif ($this->eucBasedSets[$charset]) {
		return $this->euc_substr($string, $start, $charset, $len);
	} elseif ($this->twoByteSets[$charset]) {
			// an omitted length must not be multiplied (NULL*2 is 0 and would yield an empty string)
		return $len === null ? substr($string, $start*2) : substr($string, $start*2, $len*2);
	} elseif ($this->fourByteSets[$charset]) {
		return $len === null ? substr($string, $start*4) : substr($string, $start*4, $len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string, $start) : substr($string, $start, $len);
}
/**
 * Counts the number of characters of a string in the given charset.
 * Uses mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise the internal implementations.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring') {
		return mb_strlen($string, $charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}

		// fixed-width encodings: byte length divided by the character width
	if ($this->twoByteSets[$charset]) {
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset]) {
		return strlen($string)/4;
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
/**
 * Truncates a string and pre-/appends a string.
 * A positive $len keeps the first $len characters and appends $crop; a negative $len
 * keeps the last abs($len) characters and prepends $crop. The string is returned
 * unchanged when $len is 0 or the string is not longer than abs($len).
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

		// Translate the character count into a byte offset
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} else {
		if ($len > 0) {
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i <= 0) $i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

		// Explicit range checks replace the former strlen($string{$i}) probing
		// (curly-brace offsets no longer parse in PHP 8 and negative offsets wrap since PHP 7.1)
	$byteLen = strlen($string);
	if ($len > 0) {
		if ($i >= 0 && $i < $byteLen) {
			return substr($string, 0, $i).$crop;
		}
	} else {
		if ($i-1 >= 0 && $i-1 < $byteLen) {
			return $crop.substr($string, $i);
		}
	}

	return $string;
}
/**
 * Cuts a string short at a given byte length without splitting a multi-byte character.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	} elseif ($this->eucBasedSets[$charset]) {
			// euc_strtrunc() expects ($str, $len, $charset); the byte length must be passed along
		return $this->euc_strtrunc($string, $len, $charset);
	} elseif ($this->twoByteSets[$charset]) {
		if ($len % 2) $len--;	// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset]) {
		$len -= $len % 4;	// realign to position dividable by four
	}

		// fixed-width sets fall through with the realigned length; everything else is treated as single-byte
	return substr($string, 0, $len);
}
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German ß (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, use "toUpper" for uppercase (with mbstring anything but "toLower" is uppercase)
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
	$useMbstring = ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3);
	if ($useMbstring) {
		return $case == 'toLower' ? mb_strtolower($string, $charset) : mb_strtoupper($string, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
/**
 * Converts special chars (accented letters, umlauts etc.) to ASCII equivalents
 * (often two characters, e.g. a-umlaut => "ae").
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
1568 /********************************************
1570 * Internal string operation functions
1572 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 * Bytes without an entry in the selected table are copied unchanged.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the input itself if the table cannot be initialized or the mode is unknown)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$out = '';
	$byteLen = strlen($str);
		// iterate by length instead of probing $str{$i} past the end (syntax removed in PHP 8)
	for ($i = 0; $i < $byteLen; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1622 /********************************************
1624 * Internal UTF-8 string operation functions
1626 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Character positions are translated to byte positions with utf8_char2byte_pos().
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring (FALSE if a positive $start lies outside the string)
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
	if (!strcmp($len, '0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === false && $start > 0) {
		return false;	// $start outside string length
	}

		// a negative $start reaching before the beginning means "from the first byte"
	$tail = substr($str, $byte_start === false ? 0 : $byte_start);

	if ($len == null) {
		return $tail;
	}

	$byte_end = $this->utf8_char2byte_pos($tail, $len);
	if ($byte_end !== false) {
		return substr($tail, 0, $byte_end);
	}

		// $len outside actual string length: a negative length that exceeds yields a blank string
	return $len < 0 ? '' : $tail;
}
/**
 * Counts the number of characters of a string in UTF-8.
 * Every single-byte character and every multi-byte start byte counts as one character;
 * continuation bytes (10xxxxxx) are skipped.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
	$byteLen = strlen($str);
	for ($i = 0; $i < $byteLen; $i++) {
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
			$n++;
		}
	}
	return $n;
}
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi-byte character that would be cut in half is dropped completely.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// walk back over continuation bytes to find the first byte
		for (; $i > 0 && !(ord($str[$i]) & 0x40); $i--) ;

			// sanity check: the sequence must begin with a start byte (11xxxxxx).
			// A start byte at offset 0 is valid and must not be rejected.
		if (!(ord($str[$i]) & 0x40)) return '';

			// calculate number of bytes from the leading 1-bits of the start byte
		for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) $bc++;

		if ($bc+$i > $len) return substr($str, 0, $i);
			// fallthru: multibyte char fits into length
	}
	return substr($str, 0, $len);
}
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 * Uses mbstring or iconv when configured, otherwise a byte search whose offsets
 * are translated to and from character positions.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search
 * @return	integer		The character position (FALSE if not found or offset beyond the string)
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($utils == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
	if ($byte_offset === false) {
		return false;	// offset beyond string length
	}

	$byte_pos = strpos($haystack, $needle, $byte_offset);
	return $byte_pos === false
		? false	// needle not found
		: $this->utf8_byte2char_pos($haystack, $byte_pos);
}
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position (FALSE if not found)
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
			// third argument is the offset; the encoding is the fourth (PHP 5.2+)
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($utils == 'iconv') {
			// iconv_strrpos() takes no offset argument
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === false) return false;	// needle not found

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
/**
 * Translates a character position into an 'absolute' byte position.
 * For a non-negative $pos the characters are counted from the start, for a negative
 * $pos from the end of the string.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position, FALSE if the position lies at or beyond the end (or before the start) of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$byteLen = strlen($str);
	$n = 0;				// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLen-1;
		$d = -1;
	}

		// explicit range checks: negative string offsets wrap around since PHP 7.1,
		// so probing $str[$i] can no longer detect "before the start"
	for ( ; $i >= 0 && $i < $byteLen && $n < $p; $i += $d) {
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $byteLen) return false;	// offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes (10xxxxxx)
		while ($i < $byteLen && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
/**
 * Translates an 'absolute' byte position into a character position.
 * The position is expected to point at the start of a character (as returned by
 * strpos()/strrpos()) or at the end of the string.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, FALSE for an empty string or a position outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$byteLen = strlen($str);
	if ($byteLen == 0 || $pos < 0 || $pos > $byteLen) {
		return false;	// offset beyond string length
	}

	$n = 0;	// number of characters
	for ($i = $pos; $i > 0; $i--) {
		if ($i == $byteLen) {
				// the end-of-string position counts like a character boundary
			$n++;
			continue;
		}
		$c = ord($str[$i]);
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
			$n++;
		}
	}

	return $n;
}
/**
 * Maps all characters of an UTF-8 string.
 * Characters without an entry in the selected table are copied unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the input itself if the tables cannot be initialized or the mode is unknown)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str;	// do nothing

	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$byteLen = strlen($str);
	for ($i = 0; $i < $byteLen; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
				// calculate number of bytes from the leading 1-bits
			for ($bc = 0; $c & 0x80; $c = $c << 1) { $bc++; }
			$mbc = substr($str, $i, $bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray continuation byte is passed through as-is
				// instead of repeating the previously processed character
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1880 /********************************************
1882 * Internal EUC string operation functions
1884 * Extended Unix Code:
1885 * ASCII compatible 7bit single bytes chars
1886 * 8bit two byte chars
1888 * Shift-JIS is treated as a special case.
1890 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be cut in half is dropped completely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);

	for ($i = 0; $i < $byteLen && $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}
	}

		// The overshoot test comes first: a double-byte char straddling $len must be
		// dropped even when it is the last character of the string.
	if ($i > $len) {
		return substr($str, 0, $len-1);	// we ended on a first byte
	}

		// also covers strings shorter than the supplied length
	return substr($str, 0, $len);
}
/**
 * Returns a part of a string in the EUC charset family.
 * Character positions are translated to byte positions with euc_char2byte_pos().
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters)
 * @return	string		the substring (FALSE if $start lies outside the string)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === false) {
		return false;	// $start outside string length
	}

	$tail = substr($str, $byte_start);
	if ($len == null) {
		return $tail;
	}

	$byte_end = $this->euc_char2byte_pos($tail, $len, $charset);
		// $len outside actual string length: return everything from $start on
	return $byte_end === false ? $tail : substr($tail, 0, $byte_end);
}
/**
 * Counts the number of characters of a string in the EUC charset family.
 * Bytes >= 0x80 start a double-byte character; for 'shift_jis' only the ranges
 * 0x80-0x9F and 0xE0-0xFF do.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$n = 0;

	for ($i = 0; $i < $byteLen; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
/**
 * Translates a character position into an 'absolute' byte position.
 * For a non-negative $pos the characters are counted from the start, for a negative
 * $pos from the end of the string.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, FALSE if the position lies at or beyond the end (or before the start) of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$n = 0;				// number of characters seen
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLen-1;
		$d = -1;
	}

		// explicit range checks: negative string offsets wrap around since PHP 7.1,
		// so probing $str[$i] can no longer detect "before the start"
	for ( ; $i >= 0 && $i < $byteLen && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i += $d;	// advance a double-byte char
		} else {
			if ($c >= 0x80) $i += $d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i < 0 || $i >= $byteLen) return false;	// offset beyond string length

	if ($pos < 0) $i++;	// correct offset

	return $i;
}
/**
 * Maps all characters of a string in the EUC charset family.
 * Characters without an entry in the selected table are copied unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the input itself if the table cannot be initialized or the mode is unknown)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$out = '';
		// iterate by length instead of probing $str{$i} past the end (syntax removed in PHP 8)
	for ($i = 0; $i < $byteLen; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {	// a double-byte char
			$mbc = substr($str, $i, 2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
// Include the XCLASS extension file for this class when one is configured for the current TYPO3_MODE.
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
	}
}