2 /***************************************************************
5 * (c) 2003-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
25 * Class for conversion between charsets.
27 * $Id: class.t3lib_cs.php,v 1.54 2005/12/12 21:47:50 masi Exp $
29 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
33 * [CLASS/FUNCTION INDEX of SCRIPT]
38 * 503: function parse_charset($charset)
39 * 522: function get_locale_charset($locale)
41 * SECTION: Charset Conversion functions
42 * 575: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 615: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 632: function utf8_encode($str,$charset)
45 * 678: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 721: function utf8_to_entities($str)
47 * 754: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 788: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 838: function UnumberToChar($cbyte)
50 * 883: function utf8CharToUnumber($str,$hex=0)
52 * SECTION: Init functions
53 * 926: function initCharset($charset)
54 * 988: function initUnicodeData($mode=null)
55 * 1213: function initCaseFolding($charset)
56 * 1275: function initToASCII($charset)
58 * SECTION: String operation functions
59 * 1346: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1412: function crop($charset,$string,$len,$crop='')
62 * 1465: function strtrunc($charset,$string,$len)
63 * 1499: function conv_case($charset,$string,$case)
64 * 1525: function specCharsToASCII($charset,$string)
66 * SECTION: Internal string operation functions
67 * 1565: function sb_char_mapping($str,$charset,$mode,$opt='')
69 * SECTION: Internal UTF-8 string operation functions
70 * 1620: function utf8_substr($str,$start,$len=null)
71 * 1653: function utf8_strlen($str)
72 * 1674: function utf8_strtrunc($str,$len)
73 * 1696: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1719: function utf8_strrpos($haystack,$needle)
75 * 1739: function utf8_char2byte_pos($str,$pos)
76 * 1780: function utf8_byte2char_pos($str,$pos)
77 * 1803: function utf8_char_mapping($str,$mode,$opt='')
79 * SECTION: Internal EUC string operation functions
80 * 1879: function euc_strtrunc($str,$len,$charset)
81 * 1908: function euc_substr($str,$start,$charset,$len=null)
82 * 1933: function euc_strlen($str,$charset)
83 * 1960: function euc_char2byte_pos($str,$pos,$charset)
84 * 2001: function euc_char_mapping($str,$charset,$mode,$opt='')
87 * (This index is automatically created/updated by the extension "extdeveval")
101 * Functions working on UTF-8 strings:
106 * - implode/explode/join
108 * Functions nearly working on UTF-8 strings:
110 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
111 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
112 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
113 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
115 * Functions NOT working on UTF-8 strings:
129 * Class for conversion between charsets
131 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
132 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
139 // This is the array where parsed conversion tables are stored (cached)
140 var $parsedCharsets=array();
142 // An array where case folding data will be stored (cached)
143 var $caseFolding=array();
145 // An array where charset-to-ASCII mappings are stored (cached)
146 var $toASCII=array();
148 // This tells the converter which charsets has two bytes per char:
149 var $twoByteSets=array(
150 'ucs-2'=>1, // 2-byte Unicode
153 // This tells the converter which charsets has four bytes per char:
154 var $fourByteSets=array(
155 'ucs-4'=>1, // 4-byte Unicode
156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
159 // This tells the converter which charsets use a scheme like the Extended Unix Code:
160 var $eucBasedSets=array(
161 'gb2312'=>1, // Chinese, simplified.
162 'big5'=>1, // Chinese, traditional.
163 'euc-kr'=>1, // Korean
164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
168 // http://czyborra.com/charsets/iso8859.html
171 'us-ascii'=> 'ascii',
172 'cp819' => 'iso-8859-1',
173 'ibm819' => 'iso-8859-1',
174 'iso-ir-100' => 'iso-8859-1',
175 'iso-ir-109' => 'iso-8859-2',
176 'iso-ir-148' => 'iso-8859-9',
177 'iso-ir-199' => 'iso-8859-14',
178 'iso-ir-203' => 'iso-8859-15',
179 'csisolatin1' => 'iso-8859-1',
180 'csisolatin2' => 'iso-8859-2',
181 'csisolatin3' => 'iso-8859-3',
182 'csisolatin5' => 'iso-8859-9',
183 'csisolatin8' => 'iso-8859-14',
184 'csisolatin9' => 'iso-8859-15',
185 'csisolatingreek' => 'iso-8859-7',
186 'iso-celtic' => 'iso-8859-14',
187 'latin1' => 'iso-8859-1',
188 'latin2' => 'iso-8859-2',
189 'latin3' => 'iso-8859-3',
190 'latin5' => 'iso-8859-9',
191 'latin6' => 'iso-8859-10',
192 'latin8' => 'iso-8859-14',
193 'latin9' => 'iso-8859-15',
194 'l1' => 'iso-8859-1',
195 'l2' => 'iso-8859-2',
196 'l3' => 'iso-8859-3',
197 'l5' => 'iso-8859-9',
198 'l6' => 'iso-8859-10',
199 'l8' => 'iso-8859-14',
200 'l9' => 'iso-8859-15',
201 'cyrillic' => 'iso-8859-5',
202 'arabic' => 'iso-8859-6',
203 'tis-620' => 'iso-8859-11',
204 'win874' => 'windows-874',
205 'win1250' => 'windows-1250',
206 'win1251' => 'windows-1251',
207 'win1252' => 'windows-1252',
208 'win1253' => 'windows-1253',
209 'win1254' => 'windows-1254',
210 'win1255' => 'windows-1255',
211 'win1256' => 'windows-1256',
212 'win1257' => 'windows-1257',
213 'win1258' => 'windows-1258',
214 'cp1250' => 'windows-1250',
215 'cp1251' => 'windows-1251',
216 'cp1252' => 'windows-1252',
217 'ms-ee' => 'windows-1250',
218 'ms-ansi' => 'windows-1252',
219 'ms-greek' => 'windows-1253',
220 'ms-turk' => 'windows-1254',
221 'winbaltrim' => 'windows-1257',
222 'koi-8ru' => 'koi-8r',
226 'macintosh' => 'macroman',
227 'euc-cn' => 'gb2312',
228 'x-euc-cn' => 'gb2312',
234 'sjis' => 'shift_jis',
235 'shift-jis' => 'shift_jis',
236 'cp932' => 'shift_jis',
247 // mapping of iso-639:2 language codes to language (family) names
// Maps a language identifier to a language family name.
// get_locale_charset() looks up the lowercased language part of a locale here and then uses the
// resulting family name as key into $lang_to_charset_unix / $lang_to_charset_windows, so every
// value in this table must be spelled exactly like a key of those two tables.
var $lang_to_langfamily=array(
		// iso-639:2 language codes, see:
		//  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
		//  http://www.unicode.org/onlinedat/languages.html
	'cs' => 'east_european',
	'da' => 'west_european',
	'de' => 'west_european',
	'es' => 'west_european',
	'eu' => 'west_european',
	'fi' => 'west_european',
	'fr' => 'west_european',
	'hr' => 'east_european',
	'hu' => 'east_european',
	'is' => 'west_european',
	'it' => 'west_european',
	'kl' => 'west_european',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european',
	'no' => 'west_european',
	'pl' => 'east_european',
	'pt' => 'west_european',
	'ro' => 'east_european',
	'sk' => 'east_european',
	'sl' => 'east_european',
	'sv' => 'west_european',
	'vi' => 'vietnamese',

		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',
	'dan' => 'west_european',
	'deu' => 'west_european',
	'dea' => 'west_european',
	'des' => 'west_european',
	'ena' => 'west_european',
	'enc' => 'west_european',
	'eng' => 'west_european',
	'enz' => 'west_european',
	'enu' => 'west_european',
	'nld' => 'west_european',
	'nlb' => 'west_european',
	'fin' => 'west_european',
	'fra' => 'west_european',
	'frb' => 'west_european',
	'frc' => 'west_european',
	'frs' => 'west_european',
	'hun' => 'east_european',
	'isl' => 'west_european',	// was misspelled 'west_euorpean', which is not a key of the charset tables
	'ita' => 'west_european',
	'its' => 'west_european',
	'nor' => 'west_european',
	'non' => 'west_european',
	'plk' => 'east_european',
	'ptg' => 'west_european',
	'ptb' => 'west_european',
	'rus' => 'east_european',	// NOTE(review): 'russian' below maps to 'cyrillic' - confirm which one is intended
	'sky' => 'east_european',
	'esp' => 'west_european',
	'esm' => 'west_european',
	'esn' => 'west_european',
	'sve' => 'west_european',

		// English language names
	'bulgarian' => 'east_european',
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'german' => 'west_european',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'norwegian' => 'west_european',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'svedish' => 'west_european',	// historic misspelling, kept so existing configurations keep working
	'swedish' => 'west_european',
	'turkish' => 'east_european',
	'ukrainian' => 'cyrillic',
);
355 // mapping of language (family) names to charsets on Unix
356 var $lang_to_charset_unix=array(
357 'west_european' => 'iso-8859-1',
358 'estonian' => 'iso-8859-1',
359 'east_european' => 'iso-8859-2',
360 'baltic' => 'iso-8859-4',
361 'cyrillic' => 'iso-8859-5',
362 'arabic' => 'iso-8859-6',
363 'greek' => 'iso-8859-7',
364 'hebrew' => 'iso-8859-8',
365 'turkish' => 'iso-8859-9',
366 'thai' => 'iso-8859-11', // = TIS-620
367 'lithuanian' => 'iso-8859-13',
368 'chinese' => 'gb2312', // = euc-cn
369 'japanese' => 'euc-jp',
370 'korean' => 'euc-kr',
371 'simpl_chinese' => 'gb2312',
372 'trad_chinese' => 'big5',
376 // mapping of language (family) names to charsets on Windows
377 var $lang_to_charset_windows=array(
378 'east_european' => 'windows-1250',
379 'cyrillic' => 'windows-1251',
380 'west_european' => 'windows-1252',
381 'greek' => 'windows-1253',
382 'turkish' => 'windows-1254',
383 'hebrew' => 'windows-1255',
384 'arabic' => 'windows-1256',
385 'baltic' => 'windows-1257',
386 'estonian' => 'windows-1257',
387 'lithuanian' => 'windows-1257',
388 'vietnamese' => 'windows-1258',
391 'chinese' => 'gb2312',
392 'japanese' => 'shift_jis',
393 'simpl_chinese' => 'gb2312',
394 'trad_chinese' => 'big5',
397 // mapping of locale names to charsets
398 var $locale_to_charset=array(
399 'japanese.euc' => 'euc-jp',
400 'ja_jp.ujis' => 'euc-jp',
401 'korean.euc' => 'euc-kr',
407 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
408 // Empty values means "iso-8859-1"
409 var $charSetArray = array(
417 'cz' => 'windows-1250',
418 'pl' => 'iso-8859-2',
419 'si' => 'windows-1250',
421 'tr' => 'iso-8859-9',
424 'ru' => 'windows-1251',
425 'ro' => 'iso-8859-2',
427 'sk' => 'windows-1250',
428 'lt' => 'windows-1257',
430 'hr' => 'windows-1250',
431 'hu' => 'iso-8859-2',
433 'th' => 'iso-8859-11',
434 'gr' => 'iso-8859-7',
437 'bg' => 'windows-1251',
439 'et' => 'iso-8859-4',
440 'ar' => 'iso-8859-6',
442 'ua' => 'windows-1251',
446 'ca' => 'iso-8859-15',
447 'ba' => 'iso-8859-2',
454 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
455 // Empty values mean the same as the TYPO3 language key
456 var $isoArray = array(
479 'gl' => '', // Greenlandic
494 'ba' => '', // Bosnian
499 * Normalize - changes input character set to lowercase letters.
501 * @param string Input charset
502 * @return string Normalized charset
503 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Normalizes a charset name: the name is lowercased and, if it is a known
 * alias (a key of $this->synonyms), replaced by the name registered for it.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 */
function parse_charset($charset)	{
	$normalized = strtolower($charset);

		// Aliases are stored with lowercase keys, so the lookup happens after lowercasing
	return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
513 * Get the charset of a locale.
516 * ln_CN language / country
517 * ln_CN.cs language / country / charset
518 * ln_CN.cs@mod language / country / charset / modifier
520 * @param string Locale string
521 * @return string Charset resolved for locale string
522 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Get the charset of a locale.
 *
 * Accepted locale formats:
 *   ln            language
 *   ln_CN         language / country
 *   ln_CN.cs      language / country / charset
 *   ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part, the "euro" modifier, then the language (via its language family) in
 * the OS specific table. 'iso-8859-1' is returned if nothing matches.
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 */
function get_locale_charset($locale)	{
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale]))	return $this->locale_to_charset[$locale];

		// get modifier; array_pad() guarantees both list() targets exist when there is no "@"
	list($locale,$modifier) = array_pad(explode('@',$locale),2,'');

		// locale contains charset: use it
	list($locale,$charset) = array_pad(explode('.',$locale),2,'');
	if ($charset)	return $this->parse_charset($charset);

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro')	return 'iso-8859-15';

		// get language (the country part is not needed for the lookup)
	list($language) = explode('_',$locale);
	if (isset($this->lang_to_langfamily[$language]))	$language = $this->lang_to_langfamily[$language];

		// pick the table for the current operating system
	if (TYPO3_OS == 'WIN')	{
		$charsetMap = $this->lang_to_charset_windows;
	} else {
		$charsetMap = $this->lang_to_charset_unix;
	}

		// unknown languages / families fall back to iso-8859-1
	return (isset($charsetMap[$language]) && $charsetMap[$language]) ? $charsetMap[$language] : 'iso-8859-1';
}
561 /********************************************
563 * Charset Conversion functions
565 ********************************************/
568 * Convert from one charset to another charset.
570 * @param string Input string
571 * @param string From charset (the current charset of the string)
572 * @param string To charset (the output charset wanted)
573 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
574 * @return string Converted string
577 function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
578 if ($fromCS==$toCS) return $str;
580 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
581 if ($toCS=='utf-8' ||
!$useEntityForNoChar) {
582 switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
584 $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
585 if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets
589 $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
590 if (false !== $conv_str) return $conv_str;
594 $conv_str = recode_string($fromCS.'..'.$toCS,$str);
595 if (false !== $conv_str) return $conv_str;
598 // fallback to TYPO3 conversion
601 if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
602 if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
607 * Convert all elements in ARRAY from one charset to another charset.
608 * NOTICE: Array is passed by reference!
610 * @param string Input array, possibly multidimensional
611 * @param string From charset (the current charset of the string)
612 * @param string To charset (the output charset wanted)
613 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
/**
 * Converts every element of an array from one charset to another.
 * The array is passed by reference and modified in place; nested arrays are
 * processed recursively, all other values are handed to $this->conv().
 *
 * @param	array		Input array, possibly multidimensional (passed by reference)
 * @param	string		From charset (the current charset of the strings)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)	{
	foreach (array_keys($array) as $key)	{
		if (is_array($array[$key]))	{
				// Descend into sub-array (also by reference)
			$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
		} else {
			$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
		}
	}
}
628 * Converts $str from $charset to UTF-8
630 * @param string String in local charset to convert to UTF-8
631 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
632 * @return string Output string, converted to UTF-8
/**
 * Converts $str from $charset to UTF-8.
 *
 * Bytes 0-127 are copied unchanged. Other characters are looked up in the
 * parsed conversion table of the charset; characters without an entry are
 * replaced by chr($this->noCharByteVal).
 *
 * @param	string		String in local charset to convert to UTF-8
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return	string		Output string, converted to UTF-8. Nothing (NULL) is returned if the conversion table could not be initialized.
 */
function utf8_encode($str,$charset)	{

	if ($charset === 'utf-8')	return $str;

		// Charset is case-insensitive.
	if ($this->initCharset($charset))	{	// Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';

		for ($a=0; $a<$strLen; $a++)	{	// Traverse each char in string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if (isset($this->twoByteSets[$charset]))	{	// If the charset has two bytes per char
					// A missing trailing byte (odd string length) counts as 0
				$ord2 = ($a+1 < $strLen) ? ord(substr($str,$a+1,1)) : 0;
					// Combine both bytes, assuming big endian. This has to be a bitwise OR:
					// the former "&" always produced 0 because the two operands share no bits.
				$ord = ($ord << 8) | $ord2;

				if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{	// Local char-number found in parsed conv. table
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal);	// No char exists
				}
				$a++;	// Skip the second byte of the pair
			} elseif ($ord>127)	{	// Value over 127: not plain ASCII, needs the conversion table
				if (isset($this->eucBasedSets[$charset]))	{	// EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))	{	// Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256 + $ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord]))	{	// Local char-number found in parsed conv. table
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal);	// No char exists
				}
			} else {
				$outStr .= $chr;	// ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
		}
		return $outStr;
	}
}
673 * Converts $str from UTF-8 to $charset
675 * @param string String in UTF-8 to convert to local charset
676 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
677 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
678 * @return string Output string, converted to local charset
/**
 * Converts $str from UTF-8 to $charset.
 *
 * ASCII bytes are copied unchanged. Each multibyte sequence is looked up in
 * the 'utf8' part of the parsed conversion table. Sequences without an entry
 * become a hexadecimal numeric entity (if $useEntityForNoChar is set) or
 * chr($this->noCharByteVal). A continuation byte found where a lead byte is
 * expected is replaced by chr($this->noCharByteVal) as well.
 *
 * @param	string		String in UTF-8 to convert to local charset
 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	string		Output string, converted to local charset. Nothing (NULL) is returned if the conversion table could not be initialized.
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0)	{
		// Charset is case-insensitive. Parse conv. table if not already done.
	if (!$this->initCharset($charset))	return;

	$noChar = chr($this->noCharByteVal);
	$result = '';
	$length = strlen($str);

	for ($pos=0; $pos<$length; $pos++)	{	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127: one byte, transparent
		if ($code <= 127)	{
			$result .= $byte;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64))	{
			$result .= $noChar;
			continue;
		}

			// Collect the sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n=0; $n<8; $n++)	{
			$code = $code << 1;
			if (!($code & 128))	break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence]))	{
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence];	// The local number
				// Numbers above 255 are written as two bytes (16bit word assumed)
			$result .= ($local > 255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
		} elseif ($useEntityForNoChar)	{
			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';	// Create num entity
		} else {
			$result .= $noChar;	// No char exists
		}
	}

	return $result;
}
718 * Converts all chars > 127 to numeric entities.
720 * @param string Input string
721 * @return string Output string
/**
 * Converts all chars > 127 in a UTF-8 string to hexadecimal numeric entities.
 * A continuation byte found where a lead byte is expected is replaced by
 * chr($this->noCharByteVal).
 *
 * @param	string		Input string, UTF-8
 * @return	string		Output string
 */
function utf8_to_entities($str)	{
	$length = strlen($str);
	$result = '';

	for ($pos=0; $pos<$length; $pos++)	{	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127: one byte, transparent
		if ($code <= 127)	{
			$result .= $byte;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64))	{
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n=0; $n<8; $n++)	{
			$code = $code << 1;
			if (!($code & 128))	break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
	}

	return $result;
}
750 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
752 * @param string Input string, UTF-8
753 * @param boolean If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
754 * @return string Output string
/**
 * Converts numeric entities (decimal or hexadecimal notation) to UTF-8
 * multibyte chars. Named HTML entities are converted as well if
 * $alsoStdHtmlEnt is set; every other "&...;" token is left untouched.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (named entities) will be converted as well
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0)	{
	$trans_tbl = array();
	if ($alsoStdHtmlEnt)	{
			// Table is expected in iso-8859-1, which is why the values are run through utf8_encode() below.
			// NOTE(review): since PHP 5.4 get_html_translation_table() defaults to UTF-8 - confirm the re-encoding is still wanted there.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
	}

		// Split on "&...;" tokens. With PREG_SPLIT_DELIM_CAPTURE every odd element holds the text
		// between "&" and ";", every even element holds the plain text around the entities.
		// (Replaces ereg_replace(), which does not exist in PHP 7+.)
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);

	foreach ($parts as $k => $v)	{
		if ($k%2)	{
			if (substr($v,0,1)=='#')	{	// Dec or hex entities:
				if (substr($v,1,1)=='x')	{
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
				} else {
						// intval() so that an empty or non-numeric value is handled as 0 instead of being passed on as a string
					$parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';']) && $trans_tbl['&'.$v.';'])	{	// Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
			} else {	// No conversion: restore the token as it was
				$parts[$k] = '&'.$v.';';
			}
		}
	}

	return implode('',$parts);
}
783 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
785 * @param string Input string, UTF-8
786 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
787 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
788 * @return array Output array with the char numbers
/**
 * Splits a UTF-8 string into its characters and returns them in an array,
 * either as integer numbers or (with $retChar) as UTF-8 character strings.
 * A continuation byte found where a lead byte is expected yields
 * $this->noCharByteVal (or the corresponding char).
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (named and numeric) are converted to characters first.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
		// If entities must be registered as well...:
	if ($convEntities)	{
		$str = $this->entities_to_utf8($str,1);
	}

	$length = strlen($str);
	$chars = array();

	for ($pos=0; $pos<$length; $pos++)	{	// Traverse each byte in UTF-8 string.
		$byte = substr($str,$pos,1);
		$code = ord($byte);

			// ASCII 0-127: one byte, transparent
		if ($code <= 127)	{
			$chars[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64))	{
			$chars[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the sequence: every further leading 1-bit of the first byte announces one more byte
		$sequence = $byte;
		for ($n=0; $n<8; $n++)	{
			$code = $code << 1;
			if (!($code & 128))	break;
			$pos++;
			$sequence .= substr($str,$pos,1);
		}

		$chars[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $chars;
}
822 * Converts a UNICODE number to a UTF-8 multibyte character
823 * Algorithm based on script found at From: http://czyborra.com/utf/
824 * Unit-tested by Kasper
826 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
828 * bytes | bits | representation
830 * 2 | 11 | 110vvvvv 10vvvvvv
831 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
832 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
833 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
834 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
836 * @param integer UNICODE integer
837 * @return string UTF-8 multibyte character string
838 * @see utf8CharToUnumber()
/**
 * Converts a UNICODE number to a UTF-8 multibyte character.
 *
 * The value bits are spread over the bytes; the number of high bits set in
 * the lead byte announces the length of the sequence:
 *
 *  bytes | bits | lead byte
 *    1   |   7  | 0vvvvvvv
 *    2   |  11  | 110vvvvv
 *    3   |  16  | 1110vvvv
 *    4   |  21  | 11110vvv
 *    5   |  26  | 111110vv
 *    6   |  31  | 1111110v
 *
 * Every following byte has the form 10vvvvvv (6 value bits).
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte)	{
		// 7-bit values are stored as a single byte
	if ($cbyte < 0x80)	return chr($cbyte);

		// Find the number of continuation bytes and the marker bits of the lead byte
	if ($cbyte < 0x800)	{
		$tailBytes = 1;
		$leadMarker = 0xC0;
	} elseif ($cbyte < 0x10000)	{
		$tailBytes = 2;
		$leadMarker = 0xE0;
	} elseif ($cbyte < 0x200000)	{
		$tailBytes = 3;
		$leadMarker = 0xF0;
	} elseif ($cbyte < 0x4000000)	{
		$tailBytes = 4;
		$leadMarker = 0xF8;
	} elseif ($cbyte < 0x80000000)	{
		$tailBytes = 5;
		$leadMarker = 0xFC;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte: marker plus the bits above all continuation bytes
	$str = chr($leadMarker | ($cbyte >> (6*$tailBytes)));

		// Continuation bytes: 6 bits each, most significant group first
	for ($shift = 6*($tailBytes-1); $shift >= 0; $shift -= 6)	{
		$str .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $str;
}
877 * Converts a UTF-8 Multibyte character to a UNICODE number
878 * Unit-tested by Kasper
880 * @param string UTF-8 multibyte character string
881 * @param boolean If set, then a hex. number is returned.
882 * @return integer UNICODE integer
883 * @see UnumberToChar()
/**
 * Converts a UTF-8 multibyte character to a UNICODE number.
 * Only the first character of $str is evaluated.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as string, prefixed with "x").
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0)	{
	$lead = ord(substr($str,0,1));	// First byte

	if (($lead & 192) != 192)	{
			// Not the start of a multibyte sequence: the byte value itself is the number
		$int = $lead;
	} else {
		$shifted = $lead;
		$tailBits = '';
		for ($b=0; $b<8; $b++)	{	// One round for each leading 1-bit after the first
			$shifted = $shifted << 1;
			if (!($shifted & 128))	break;
				// Append the lower 6 bits of the next byte as binary digits
			$tailBits .= substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
		}

			// The lead byte contributes its lowest (6 - $b) bits
		$leadBits = substr('00000000'.decbin($lead),-(6-$b));
		$int = bindec($leadBits.$tailBits);
	}

	return $hex ? 'x'.dechex($int) : $int;
}
912 /********************************************
916 ********************************************/
919 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
920 * This function is automatically called by the conversion functions
922 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
924 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
925 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
928 function initCharset($charset) {
929 // Only process if the charset is not yet loaded:
930 if (!is_array($this->parsedCharsets
[$charset])) {
932 // Conversion table filename:
933 $charsetConvTableFile = PATH_t3lib
.'csconvtbl/'.$charset.'.tbl';
935 // If the conversion table is found:
936 if ($charset && t3lib_div
::validPathStr($charsetConvTableFile) && @is_file
($charsetConvTableFile)) {
937 // Cache file for charsets:
938 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
939 $cacheFile = t3lib_div
::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
940 if ($cacheFile && @is_file
($cacheFile)) {
941 $this->parsedCharsets
[$charset]=unserialize(t3lib_div
::getUrl($cacheFile));
943 // Parse conversion table into lines:
944 $lines=t3lib_div
::trimExplode(chr(10),t3lib_div
::getUrl($charsetConvTableFile),1);
945 // Initialize the internal variable holding the conv. table:
946 $this->parsedCharsets
[$charset]=array('local'=>array(),'utf8'=>array());
947 // traverse the lines:
949 foreach($lines as $value) {
950 if (trim($value) && substr($value,0,1)!='#') { // Comment line or blanks are ignored.
952 // Detect type if not done yet: (Done on first real line)
953 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
954 if (!$detectedType) $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ?
'whitespaced' : 'ms-token';
956 if ($detectedType=='ms-token') {
957 list($hexbyte,$utf8) = split('=|:',$value,3);
958 } elseif ($detectedType=='whitespaced') {
960 ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
962 $utf8 = 'U+'.$regA[2];
964 $decval = hexdec(trim($hexbyte));
966 $utf8decval = hexdec(substr(trim($utf8),2));
967 $this->parsedCharsets
[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
968 $this->parsedCharsets
[$charset]['utf8'][$this->parsedCharsets
[$charset]['local'][$decval]]=$decval;
973 t3lib_div
::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets
[$charset]));
982 * This function initializes all UTF-8 character data tables.
984 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
986 * @param string Mode ("case", "ascii", ...)
987 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
990 function initUnicodeData($mode=null) {
992 $cacheFileCase = t3lib_div
::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
993 $cacheFileASCII = t3lib_div
::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
995 // Only process if the tables are not yet loaded
998 if (is_array($this->caseFolding
['utf-8'])) return 1;
1000 // Use cached version if possible
1001 if ($cacheFileCase && @is_file
($cacheFileCase)) {
1002 $this->caseFolding
['utf-8'] = unserialize(t3lib_div
::getUrl($cacheFileCase));
1008 if (is_array($this->toASCII
['utf-8'])) return 1;
1010 // Use cached version if possible
1011 if ($cacheFileASCII && @is_file
($cacheFileASCII)) {
1012 $this->toASCII
['utf-8'] = unserialize(t3lib_div
::getUrl($cacheFileASCII));
1018 // process main Unicode data file
1019 $unicodeDataFile = PATH_t3lib
.'unidata/UnicodeData.txt';
1020 if (!(t3lib_div
::validPathStr($unicodeDataFile) && @is_file
($unicodeDataFile))) return false;
1022 $fh = fopen($unicodeDataFile,'rb');
1023 if (!$fh) return false;
1025 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1026 // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1027 $this->caseFolding
['utf-8'] = array();
1028 $utf8CaseFolding =& $this->caseFolding
['utf-8']; // a shorthand
1029 $utf8CaseFolding['toUpper'] = array();
1030 $utf8CaseFolding['toLower'] = array();
1031 $utf8CaseFolding['toTitle'] = array();
1033 $decomposition = array(); // array of temp. decompositions
1034 $mark = array(); // array of chars that are marks (eg. composing accents)
1035 $number = array(); // array of chars that are numbers (eg. digits)
1036 $omit = array(); // array of chars to be omitted (eg. Russian hard sign)
1038 while (!feof($fh)) {
1039 $line = fgets($fh,4096);
1040 // has a lot of info
1041 list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
1043 $ord = hexdec($char);
1044 if ($ord > 0xFFFF) break; // only process the BMP
1046 $utf8_char = $this->UnumberToChar($ord);
1048 if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1049 if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1050 // store "title" only when different from "upper" (only a few)
1051 if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1054 case 'M': // mark (accent, umlaut, ...)
1055 $mark["U+$char"] = 1;
1058 case 'N': // numeric value
1059 if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
1062 // accented Latin letters without "official" decomposition
1064 if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp) {
1065 $c = ord($match[2]);
1066 if ($match[1] == 'SMALL') $c +
= 32;
1068 $decomposition["U+$char"] = array(dechex($c));
1073 if (ereg('(<.*>)? *(.+)',$decomp,$match)) {
1075 case '<circle>': // add parenthesis as circle replacement, eg (1)
1076 $match[2] = '0028 '.$match[2].' 0029';
1079 case '<square>': // add square brackets as square replacement, eg [1]
1080 $match[2] = '005B '.$match[2].' 005D';
1083 case '<compat>': // ignore multi char decompositions that start with a space
1084 if (ereg('^0020 ',$match[2])) continue 2;
1087 // ignore Arabic and vertical layout presentation decomposition
1095 $decomposition["U+$char"] = split(' ',$match[2]);
1100 // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1101 $specialCasingFile = PATH_t3lib
.'unidata/SpecialCasing.txt';
1102 if (t3lib_div
::validPathStr($specialCasingFile) && @is_file
($specialCasingFile)) {
1103 $fh = fopen($specialCasingFile,'rb');
1105 while (!feof($fh)) {
1106 $line = fgets($fh,4096);
1107 if ($line{0} != '#' && trim($line) != '') {
1109 list($char,$lower,$title,$upper,$cond) = t3lib_div
::trimExplode(';', $line);
1110 if ($cond == '' ||
$cond{0} == '#') {
1111 $utf8_char = $this->UnumberToChar(hexdec($char));
1112 if ($char != $lower) {
1113 $arr = split(' ',$lower);
1114 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1115 $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
1117 if ($char != $title && $title != $upper) {
1118 $arr = split(' ',$title);
1119 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1120 $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
1122 if ($char != $upper) {
1123 $arr = split(' ',$upper);
1124 for ($i=0; isset($arr[$i]); $i++
) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1125 $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
1134 // process custom decompositions
1135 $customTranslitFile = PATH_t3lib
.'unidata/Translit.txt';
1136 if (t3lib_div
::validPathStr($customTranslitFile) && @is_file
($customTranslitFile)) {
1137 $fh = fopen($customTranslitFile,'rb');
1139 while (!feof($fh)) {
1140 $line = fgets($fh,4096);
1141 if ($line{0} != '#' && trim($line) != '') {
1142 list($char,$translit) = t3lib_div
::trimExplode(';', $line);
1143 if (!$translit) $omit["U+$char"] = 1;
1144 $decomposition["U+$char"] = split(' ', $translit);
1152 // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1153 foreach($decomposition as $from => $to) {
1154 $code_decomp = array();
1156 while ($code_value = array_shift($to)) {
1157 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
1158 foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
1159 array_unshift($to, $cv);
1161 } elseif (!isset($mark["U+$code_value"])) { // remove mark
1162 array_push($code_decomp, $code_value);
1165 if (count($code_decomp) ||
isset($omit[$from])) {
1166 $decomposition[$from] = $code_decomp;
1168 unset($decomposition[$from]);
1172 // create ascii only mapping
1173 $this->toASCII
['utf-8'] = array();
1174 $ascii =& $this->toASCII
['utf-8'];
1176 foreach($decomposition as $from => $to) {
1177 $code_decomp = array();
1178 while ($code_value = array_shift($to)) {
1179 $ord = hexdec($code_value);
1181 continue 2; // skip decompositions containing non-ASCII chars
1183 array_push($code_decomp,chr($ord));
1185 $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
1188 // add numeric decompositions
1189 foreach($number as $from => $to) {
1190 $utf8_char = $this->UnumberToChar(hexdec($from));
1191 if (!isset($ascii[$utf8_char])) {
1192 $ascii[$utf8_char] = $to;
1196 if ($cacheFileCase) {
1197 t3lib_div
::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
1200 if ($cacheFileASCII) {
1201 t3lib_div
::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: each character of the
 * charset is looked up in its UTF-8 form and the folded result is converted back
 * to the charset. The plain ASCII a-z/A-Z mappings are always added.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initCaseFolding($charset) {
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
		return 1;
	}

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
			// Unreadable or corrupt cache file: fall through and rebuild the table
			// instead of storing a non-array and reporting success.
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$utf8Folding = isset($this->caseFolding['utf-8']) ? $this->caseFolding['utf-8'] : array();
	$table = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach (array('toUpper', 'toLower', 'toTitle') as $direction) {
				// no folding defined for this character: nothing to convert
			if (!isset($utf8Folding[$direction][$utf8])) {
				continue;
			}
			$cc = $this->utf8_decode($utf8Folding[$direction][$utf8], $charset);
				// keep only results that can be represented in the target charset
			if ($cc != '' && $cc != $nochar) {
				$table[$direction][$c] = $cc;
			}
		}
	}

		// add the ASCII case table
	for ($i = ord('a'); $i <= ord('z'); $i++) {
		$table['toUpper'][chr($i)] = chr($i - 32);
	}
	for ($i = ord('A'); $i <= ord('Z'); $i++) {
		$table['toLower'][chr($i)] = chr($i + 32);
	}

	$this->caseFolding[$charset] = $table;

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($table));
	}

	return 3;
}
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 to-ASCII table: each character of the
 * charset that has a UTF-8 transliteration gets the same ASCII replacement.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initToASCII($charset) {
		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
		return 1;
	}

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->toASCII[$charset] = $cached;
			return 2;
		}
			// Unreadable or corrupt cache file: fall through and rebuild the table.
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Always start from an array so that a charset without any mapping still
		// counts as "loaded" and an array (not NULL) ends up in the cache file.
	$table = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$table[$c] = $this->toASCII['utf-8'][$utf8];
		}
	}
	$this->toASCII[$charset] = $table;

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($table));
	}

	return 3;
}
1330 /********************************************
1332 * String operation functions
1334 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Dispatches to mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise to the internal
 * UTF-8 / EUC / fixed-width / single-byte implementations.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL means "up to the end of the string"
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null) {
	if ($len === 0) {
		return '';
	}

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
			// cannot omit $len, when specifying charset
		if ($len == null) {
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($utils == 'iconv') {
			// cannot omit $len, when specifying charset
		if ($len == null) {
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed-width charsets: scale character positions to byte positions.
		// An omitted length must stay omitted; NULL*2 would be 0 and yield ''.
	$width = 1;	// treat everything else as single-byte encoding
	if (!empty($this->twoByteSets[$charset])) {
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$width = 4;
	}

	return $len === null ? substr($string,$start*$width) : substr($string,$start*$width,$len*$width);
}
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string) {
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
		// !empty() avoids an undefined-index warning for charsets not listed in the tables
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string,$charset);
	}
	if (!empty($this->twoByteSets[$charset])) {
			// integer result even for a (malformed) odd number of bytes
		return (int)(strlen($string)/2);
	}
	if (!empty($this->fourByteSets[$charset])) {
		return (int)(strlen($string)/4);
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop; a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged when it is not longer than the requested length.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) {
		return $string;
	}

		// translate the character length into a byte position
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string)+$len;
		if ($i <= 0) {
			$i = false;
		}
	}

	if ($i === false) {	// $len outside actual string length
		return $string;
	}

		// Explicit bounds checks replace the former $string{$i} probes: the
		// curly-brace offset syntax no longer parses on PHP 8 and negative
		// offsets read from the end of the string since PHP 7.1.
	$byteLen = strlen($string);
	if ($len > 0) {
		if ($i < $byteLen) {
			return substr($string,0,$i).$crop;
		}
	} elseif ($i >= 1 && $i <= $byteLen) {
		return $crop.substr($string,$i);
	}

	return $string;
}
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never splits a multi-byte character: the result may be shorter than
 * $len bytes but not longer.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) {
		return '';
	}

	if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
			// euc_strtrunc() takes ($str,$len,$charset); $len used to be missing here
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) {
			$len--;	// don't cut at odd positions
		}
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4;	// realign to position divisible by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German sharp s becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case) {
		// mbstring is only used when configured and the PHP version is at least 4.3
	$useMbstring = ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3);
	if ($useMbstring) {
		return ($case == 'toLower') ? mb_strtolower($string,$charset) : mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
/**
 * Converts special chars (accented letters, ligatures, umlauts etc) to ascii
 * equivalents (usually two ASCII letters, like the "ae" ligature => "ae").
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1568 /********************************************
1570 * Internal string operation functions
1572 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte that has an entry in the selected table is replaced by that entry;
 * all other bytes are copied unchanged.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;	// do nothing
			}
				// plain read instead of a reference: a reference would create a
				// NULL entry in the table when $opt names an unknown conversion
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;	// do nothing
			}
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}
	if (!is_array($map)) {
		$map = array();
	}

	$out = '';
	$size = strlen($str);
	for ($i = 0; $i < $size; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1622 /********************************************
1624 * Internal UTF-8 string operation functions
1626 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring; FALSE when a positive $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null) {
		// a length of 0 / '0' yields an empty string; NULL is not handed to strcmp()
	if ($len !== null && !strcmp($len,'0')) {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// a negative (or zero) start reaching past the beginning starts at byte 0
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false) {	// $len outside actual string length
			// When length is less than zero and exceeds, then we return blank string.
		return $len < 0 ? '' : $str;
	}

	return substr($str,0,$byte_end);
}
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
	$size = strlen($str);
	for ($i = 0; $i < $size; $i++) {
			// Count single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx),
			// i.e. every byte that is not a continuation byte (10xxxxxx).
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * A multi-byte character that does not fit completely into $len bytes is
 * dropped, so the result never ends in a partial character.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) {
		return '';
	}
	if ($len >= strlen($str)) {
		return $str;	// nothing to cut
	}

	$i = $len-1;
	if (ord($str[$i]) & 0x80) {	// part of a multibyte sequence
			// find the first byte of the sequence
		while ($i > 0 && !(ord($str[$i]) & 0x40)) {
			$i--;
		}
			// Only give up when no starting byte was found at all (malformed input).
			// A character starting at byte 0 is valid and must not be rejected.
		if (!(ord($str[$i]) & 0x40)) {
			return '';
		}
			// calculate number of bytes from the leading 1-bits of the starting byte
		for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) {
			$bc++;
		}
		if ($bc+$i > $len) {
			return substr($str,0,$i);	// character does not fit: cut before it
		}
			// fallthru: multibyte char fits into length
	}

	return substr($str,0,$len);
}
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search
 * @return	integer		The character position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}
	if ($utils == 'iconv') {
		return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// Internal fallback: search on byte level, then map the byte position back
	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte === false) {
		return false;	// offset beyond string length
	}

	$foundByte = strpos($haystack,$needle,$startByte);
	if ($foundByte === false) {
		return false;	// needle not found
	}

	return $this->utf8_byte2char_pos($haystack,$foundByte);
}
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle) {
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] : '';

	if ($utils == 'mbstring') {
		return mb_strrpos($haystack,$needle,'utf-8');
	}
	if ($utils == 'iconv') {
			// iconv_strrpos() has no offset parameter: (haystack, needle, charset)
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) {
		return false;	// needle not found
	}

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * For a positive position the string is scanned forwards, for a negative one
 * backwards from its end. FALSE is returned when the scan runs off the string
 * before a byte position is found.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos) {
	$n = 0;				// number of characters found
	$p = abs($pos);		// number of characters wanted
	$size = strlen($str);

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// Explicit bounds instead of probing $str{$i}: negative offsets read from
		// the end of the string since PHP 7.1, which would let the backward scan wrap.
	for ( ; $i >= 0 && $i < $size && $n < $p; $i += $d) {
			// single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a character
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $size) {
		return false;	// offset beyond string length
	}

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i < $size && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
			// the backward scan stopped one byte before the character start
		$i++;
	}

	return $i;
}
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position; FALSE for an empty string or a byte position outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos) {
	$size = strlen($str);
	if ($size == 0 || $pos < 0 || $pos > $size) {
		return false;	// offset beyond string length
	}

	$n = 0;	// number of characters
	for ($i = $pos; $i > 0; $i--) {
			// The position directly behind the last byte counts as a character
			// boundary; otherwise count single bytes and multi-byte starting bytes.
		if ($i == $size || (ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
/**
 * Maps all characters of an UTF-8 string.
 *
 * Every character that has an entry in the selected table is replaced by that
 * entry; everything else is copied unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) {
		return $str;	// do nothing
	}

	switch ($mode) {
		case 'case':
			$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
			break;

		case 'ascii':
			$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
			break;

		default:
			return $str;
	}
	if (!is_array($map)) {
		$map = array();
	}

	$out = '';
	$size = strlen($str);
	for ($i = 0; $i < $size; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) {	// multi-byte starting byte (11xxxxxx)
			for ($bc = 0; $c & 0x80; $c = $c << 1) { $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// Single byte (0xxxxxxx), or a stray continuation byte (10xxxxxx) in
				// malformed input. The stray byte is passed through as it is; it used
				// to re-emit the previously processed character.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1880 /********************************************
1882 * Internal EUC string operation functions
1884 * Extended Unix Code:
1885 * ASCII compatible 7bit single bytes chars
1886 * 8bit two byte chars
1888 * Shift-JIS is treated as a special case.
1890 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * A double-byte character that would straddle the limit is dropped, so the
 * result is never longer than $len bytes.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) {
		return '';
	}
	if (strlen($str) <= $len) {
		return $str;	// string shorter than supplied length
	}

	$sjis = ($charset == 'shift_jis');
		// walk character by character up to the limit; $i < $len < strlen($str) inside the loop
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
				$i++;	// advance a double-byte char
			}
		} elseif ($c >= 0x80) {
			$i++;	// advance a double-byte char
		}
	}

		// $i > $len: the last character started on byte $len-1 and its second byte
		// lies beyond the limit, so cut in front of it. This is checked even when
		// the string ends right there, otherwise $len+1 bytes would be returned.
	if ($i > $len) {
		return substr($str,0,$len-1);	// we ended on a first byte
	}

	return substr($str,0,$len);
}
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Handles the edge cases the same way as utf8_substr(): a length of 0 gives an
 * empty string, a negative start reaching to (or past) the beginning starts at
 * the first byte, and a negative length exceeding the string gives an empty string.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters)
 * @return	string		the substring; FALSE when a positive $start lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null) {
	if ($len !== null && !strcmp($len,'0')) {
		return '';
	}

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) {
		if ($start > 0) {
			return false;	// $start outside string length
		}
			// euc_char2byte_pos() also reports FALSE when a negative start covers the
			// whole string; that is a valid request for "everything from byte 0"
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) {	// $len outside actual string length
		return $len < 0 ? '' : $str;
	}

	return substr($str,0,$byte_end);
}
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	$size = strlen($str);
	for ($i = 0; $i < $size; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
				$i++;	// advance a double-byte char
			}
		} elseif ($c >= 0x80) {
			$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For a positive position the string is scanned forwards, for a negative one
 * backwards from its end. FALSE is returned when the scan runs off the string
 * before a byte position is found.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$n = 0;	// number of characters seen
	$p = abs($pos);	// number of characters wanted
	$size = strlen($str);

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size-1;
		$d = -1;
	}

		// Explicit bounds instead of probing $str{$i}: negative offsets read from
		// the end of the string since PHP 7.1, which would let the backward scan wrap.
	for ( ; $i >= 0 && $i < $size && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
				$i += $d;	// advance a double-byte char
			}
		} elseif ($c >= 0x80) {
			$i += $d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i < 0 || $i >= $size) {
		return false;	// offset beyond string length
	}

	if ($pos < 0) {
		$i++;	// correct offset
	}

	return $i;
}
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Every character that has an entry in the selected table is replaced by that
 * entry; everything else is copied unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;	// do nothing
			}
				// plain read instead of a reference: a reference would create a
				// NULL entry in the table when $opt names an unknown conversion
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;	// do nothing
			}
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}
	if (!is_array($map)) {
		$map = array();
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$size = strlen($str);
	for ($i = 0; $i < $size; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
// Load an extension class for this file if one is registered for the current
// TYPO3_MODE. !empty() keeps the check silent when no XCLASS is registered.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}