2 /***************************************************************
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
25 * Class for conversion between charsets.
27 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
28 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
35 * Functions working on UTF-8 strings:
40 * - implode/explode/join
42 * Functions nearly working on UTF-8 strings:
44 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
45 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
46 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
47 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
48 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
50 * Functions NOT working on UTF-8 strings:
62 * Class for conversion between charsets
64 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
65 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
72 * @var t3lib_l10n_Locales
76 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
78 // This is the array where parsed conversion tables are stored (cached)
79 var $parsedCharsets = array();
81 // An array where case folding data will be stored (cached)
82 var $caseFolding = array();
84 // An array where charset-to-ASCII mappings are stored (cached)
85 var $toASCII = array();
87 // This tells the converter which charsets have two bytes per char:
88 var $twoByteSets = array(
89 'ucs-2' => 1, // 2-byte Unicode
92 // This tells the converter which charsets have four bytes per char:
93 var $fourByteSets = array(
94 'ucs-4' => 1, // 4-byte Unicode
95 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
98 // This tells the converter which charsets use a scheme like the Extended Unix Code:
99 var $eucBasedSets = array(
100 'gb2312' => 1, // Chinese, simplified.
101 'big5' => 1, // Chinese, traditional.
102 'euc-kr' => 1, // Korean
103 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
106 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
107 // http://czyborra.com/charsets/iso8859.html
108 var $synonyms = array(
110 'us-ascii' => 'ascii',
111 'cp819' => 'iso-8859-1',
112 'ibm819' => 'iso-8859-1',
113 'iso-ir-100' => 'iso-8859-1',
114 'iso-ir-101' => 'iso-8859-2',
115 'iso-ir-109' => 'iso-8859-3',
116 'iso-ir-110' => 'iso-8859-4',
117 'iso-ir-144' => 'iso-8859-5',
118 'iso-ir-127' => 'iso-8859-6',
119 'iso-ir-126' => 'iso-8859-7',
120 'iso-ir-138' => 'iso-8859-8',
121 'iso-ir-148' => 'iso-8859-9',
122 'iso-ir-157' => 'iso-8859-10',
123 'iso-ir-179' => 'iso-8859-13',
124 'iso-ir-199' => 'iso-8859-14',
125 'iso-ir-203' => 'iso-8859-15',
126 'csisolatin1' => 'iso-8859-1',
127 'csisolatin2' => 'iso-8859-2',
128 'csisolatin3' => 'iso-8859-3',
129 'csisolatin5' => 'iso-8859-9',
130 'csisolatin8' => 'iso-8859-14',
131 'csisolatin9' => 'iso-8859-15',
132 'csisolatingreek' => 'iso-8859-7',
133 'iso-celtic' => 'iso-8859-14',
134 'latin1' => 'iso-8859-1',
135 'latin2' => 'iso-8859-2',
136 'latin3' => 'iso-8859-3',
137 'latin5' => 'iso-8859-9',
138 'latin6' => 'iso-8859-10',
139 'latin8' => 'iso-8859-14',
140 'latin9' => 'iso-8859-15',
141 'l1' => 'iso-8859-1',
142 'l2' => 'iso-8859-2',
143 'l3' => 'iso-8859-3',
144 'l5' => 'iso-8859-9',
145 'l6' => 'iso-8859-10',
146 'l8' => 'iso-8859-14',
147 'l9' => 'iso-8859-15',
148 'cyrillic' => 'iso-8859-5',
149 'arabic' => 'iso-8859-6',
150 'tis-620' => 'iso-8859-11',
151 'win874' => 'windows-874',
152 'win1250' => 'windows-1250',
153 'win1251' => 'windows-1251',
154 'win1252' => 'windows-1252',
155 'win1253' => 'windows-1253',
156 'win1254' => 'windows-1254',
157 'win1255' => 'windows-1255',
158 'win1256' => 'windows-1256',
159 'win1257' => 'windows-1257',
160 'win1258' => 'windows-1258',
161 'cp1250' => 'windows-1250',
162 'cp1251' => 'windows-1251',
163 'cp1252' => 'windows-1252',
164 'ms-ee' => 'windows-1250',
165 'ms-ansi' => 'windows-1252',
166 'ms-greek' => 'windows-1253',
167 'ms-turk' => 'windows-1254',
168 'winbaltrim' => 'windows-1257',
169 'koi-8ru' => 'koi-8r',
173 'macintosh' => 'macroman',
174 'euc-cn' => 'gb2312',
175 'x-euc-cn' => 'gb2312',
181 'sjis' => 'shift_jis',
182 'shift-jis' => 'shift_jis',
183 'cp932' => 'shift_jis',
// Mapping of language identifiers to script (language family) names.
// The script name is the key used to look up a charset in
// $script_to_charset_unix / $script_to_charset_windows, so every value here
// must be spelled exactly like a key of those tables.
var $lang_to_script = array(
		// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
		// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
		// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
		// Was misspelt 'west_euorpean', which matches no script-to-charset key.
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'khm' => 'unicode', // Khmer
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'ukr' => 'cyrillic', // Ukrainian
		// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
		// Aligned with 'bg' and 'bgr' above, which both map to 'cyrillic'.
	'bulgarian' => 'cyrillic',
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
		// 'svedish' is a historical misspelling; kept so existing callers keep working.
	'svedish' => 'west_european',
	'swedish' => 'west_european',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
358 // mapping of language (family) names to charsets on Unix
359 var $script_to_charset_unix = array(
360 'west_european' => 'iso-8859-1',
361 'estonian' => 'iso-8859-1',
362 'east_european' => 'iso-8859-2',
363 'baltic' => 'iso-8859-4',
364 'cyrillic' => 'iso-8859-5',
365 'arabic' => 'iso-8859-6',
366 'greek' => 'iso-8859-7',
367 'hebrew' => 'iso-8859-8',
368 'turkish' => 'iso-8859-9',
369 'thai' => 'iso-8859-11', // = TIS-620
370 'lithuanian' => 'iso-8859-13',
371 'chinese' => 'gb2312', // = euc-cn
372 'japanese' => 'euc-jp',
373 'korean' => 'euc-kr',
374 'simpl_chinese' => 'gb2312',
375 'trad_chinese' => 'big5',
377 'unicode' => 'utf-8',
378 'albanian' => 'utf-8'
381 // mapping of language (family) names to charsets on Windows
382 var $script_to_charset_windows = array(
383 'east_european' => 'windows-1250',
384 'cyrillic' => 'windows-1251',
385 'west_european' => 'windows-1252',
386 'greek' => 'windows-1253',
387 'turkish' => 'windows-1254',
388 'hebrew' => 'windows-1255',
389 'arabic' => 'windows-1256',
390 'baltic' => 'windows-1257',
391 'estonian' => 'windows-1257',
392 'lithuanian' => 'windows-1257',
393 'vietnamese' => 'windows-1258',
396 'chinese' => 'gb2312',
397 'japanese' => 'shift_jis',
398 'simpl_chinese' => 'gb2312',
399 'trad_chinese' => 'big5',
400 'albanian' => 'windows-1250',
404 // mapping of locale names to charsets
// Exact locale string => charset overrides, consulted by get_locale_charset()
// before any language/script based resolution takes place.
// NOTE(review): get_locale_charset() lowercases the incoming locale before it
// indexes this table; 'sr@Latn' is the only mixed-case key here - verify that
// it is matched as intended.
405 var $locale_to_charset = array(
406 'japanese.euc' => 'euc-jp',
407 'ja_jp.ujis' => 'euc-jp',
408 'korean.euc' => 'euc-kr',
409 'sr@Latn' => 'iso-8859-2',
415 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
416 // Empty values mean "iso-8859-1"
417 var $charSetArray = array(
419 'ar' => 'iso-8859-6',
420 'ba' => 'iso-8859-2',
421 'bg' => 'windows-1251',
423 'ca' => 'iso-8859-15',
425 'cs' => 'windows-1250',
426 'cz' => 'windows-1250',
430 'el' => 'iso-8859-7',
433 'et' => 'iso-8859-4',
443 'gr' => 'iso-8859-7',
447 'hr' => 'windows-1250',
448 'hu' => 'iso-8859-2',
458 'lt' => 'windows-1257',
464 'pl' => 'iso-8859-2',
468 'ro' => 'iso-8859-2',
469 'ru' => 'windows-1251',
471 'si' => 'windows-1250',
472 'sk' => 'windows-1250',
473 'sl' => 'windows-1250',
477 'th' => 'iso-8859-11',
478 'tr' => 'iso-8859-9',
479 'ua' => 'windows-1251',
480 'uk' => 'windows-1251',
486 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
487 // Missing keys mean: same as TYPO3
488 // @deprecated since TYPO3 4.6, will be removed in TYPO3 6.0 - use t3lib_l10n_Locales::getIsoMapping()
489 var $isoArray = array(
510 * Default constructor.
// Obtains a 't3lib_l10n_Locales' object through t3lib_div::makeInstance() and
// keeps it in $this->locales.
512 public function __construct() {
513 $this->locales
= t3lib_div
::makeInstance('t3lib_l10n_Locales');
517 * Normalize - changes input character set to lowercase letters.
519 * @param string Input charset
520 * @return string Normalized charset
521 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Normalizes a charset name: lowercases it, trims surrounding whitespace and
 * replaces a known alias by its canonical name from $this->synonyms.
 *
 * @param string Input charset
 * @return string Normalized charset
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

		// Known alias? Hand back the canonical name, otherwise the cleaned input.
	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
533 * Get the charset of a locale.
536 * ln_CN language / country
537 * ln_CN.cs language / country / charset
538 * ln_CN.cs@mod language / country / charset / modifier
540 * @param string Locale string
541 * @return string Charset resolved for locale string
542 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
/**
 * Get the charset of a locale.
 *
 * Accepted shapes are "ln", "ln_CN", "ln_CN.cs" and "ln_CN.cs@mod"
 * (language / country / charset / modifier). Resolution order:
 *  1. exact (case-insensitive) match in $this->locale_to_charset
 *  2. explicit charset part of the locale, normalized by parse_charset()
 *  3. modifier "euro" => iso-8859-15
 *  4. language => script => charset for the current OS, falling back to
 *     'windows-1252' on Windows and 'utf-8' elsewhere
 *
 * @param string Locale string
 * @return string Charset resolved for locale string
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// Exact locale specific charset? The locale has just been lowercased, so
		// fold the table keys as well - otherwise a mixed-case key such as
		// 'sr@Latn' could never match.
	$exactMatches = array_change_key_case($this->locale_to_charset, CASE_LOWER);
	if (isset($exactMatches[$locale])) {
		return $exactMatches[$locale];
	}

		// Split off the modifier; array_pad() keeps list() from touching a
		// missing offset when the locale has no "@" part.
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

		// Locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// Language is the part in front of the country; an unknown language
		// yields an empty script, which selects the OS default below.
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script])
			? $this->script_to_charset_windows[$script]
			: 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script])
			? $this->script_to_charset_unix[$script]
			: 'utf-8';
	}

	return $cs;
}
582 /********************************************
584 * Charset Conversion functions
586 ********************************************/
589 * Convert from one charset to another charset.
591 * @param string Input string
592 * @param string From charset (the current charset of the string)
593 * @param string To charset (the output charset wanted)
594 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
595 * @return string Converted string
// Converts $str from charset $fromCS to charset $toCS.
// When the target is 'utf-8' or no entity fallback is requested, the method
// selected by $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] is
// consulted first; the visible candidates are mb_convert_encoding(), iconv()
// (with '//TRANSLIT') and recode_string().
// Otherwise - or when that attempt yields FALSE - the string is routed through
// UTF-8 using $this->utf8_encode() and $this->utf8_decode().
// NOTE(review): the case labels and return statements of this method are not
// visible here; presumably each successful $conv_str is returned directly.
598 function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
599 if ($fromCS == $toCS) {
603 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
604 if ($toCS == 'utf-8' ||
!$useEntityForNoChar) {
605 switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
607 $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
608 if (FALSE !== $conv_str) {
610 } // returns FALSE for unsupported charsets
// '//TRANSLIT' is appended to the target charset handed to iconv().
614 $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
615 if (FALSE !== $conv_str) {
// recode_string() takes its request in the form "from..to".
621 $conv_str = recode_string($fromCS . '..' . $toCS, $str);
622 if (FALSE !== $conv_str) {
627 // fallback to TYPO3 conversion
// Two-step conversion with UTF-8 as the pivot; a step is skipped when that
// side already is 'utf-8'.
630 if ($fromCS != 'utf-8') {
631 $str = $this->utf8_encode($str, $fromCS);
633 if ($toCS != 'utf-8') {
634 $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
640 * Convert all elements in ARRAY with type string from one charset to another charset.
641 * NOTICE: Array is passed by reference!
643 * @param string Input array, possibly multidimensional
644 * @param string From charset (the current charset of the string)
645 * @param string To charset (the output charset wanted)
646 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
/**
 * Converts every string found in $array - at any nesting depth - from
 * $fromCS to $toCS. The array is modified in place (passed by reference);
 * values that are neither arrays nor strings are left untouched.
 *
 * @param array Input array, possibly multidimensional
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $key => $unused) {
			// Nested array: recurse on the live element so that changes stick.
		if (is_array($array[$key])) {
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
			continue;
		}

		if (is_string($array[$key])) {
			$array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
661 * Converts $str from $charset to UTF-8
663 * @param string String in local charset to convert to UTF-8
664 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
665 * @return string Output string, converted to UTF-8
// Converts $str from the local charset $charset to UTF-8.
// The conversion table is loaded through $this->initCharset($charset) and the
// input is walked byte by byte; local character numbers are looked up in
// $this->parsedCharsets[$charset]['local'], and numbers without an entry are
// replaced by chr($this->noCharByteVal).
// NOTE(review): the lines that initialise $outStr and $ord and that return the
// result are not visible here.
667 function utf8_encode($str, $charset) {
669 if ($charset === 'utf-8') {
673 // Charset is case-insensitive.
674 if ($this->initCharset($charset)) { // Parse conv. table if not already...
675 $strLen = strlen($str);
678 for ($a = 0; $a < $strLen; $a++
) { // Traverse each char in string.
679 $chr = substr($str, $a, 1);
// Fixed two-byte charsets: combine this byte and the following one into a
// single 16 bit number.
// NOTE(review): $str[$a + 1] is read without a bounds check - presumably the
// input always has an even number of bytes for these charsets; confirm.
681 if (isset($this->twoByteSets
[$charset])) { // If the charset has two bytes per char
682 $ord2 = ord($str[$a +
1]);
683 $ord = $ord << 8 |
$ord2; // assume big endian
685 if (isset($this->parsedCharsets
[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the $this->noCharByteVal placeholder
686 $outStr .= $this->parsedCharsets
[$charset]['local'][$ord];
688 $outStr .= chr($this->noCharByteVal
);
691 } elseif ($ord > 127) { // If char has value over 127 it's a multibyte char in UTF-8
692 if (isset($this->eucBasedSets
[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
693 if ($charset != 'shift_jis' ||
($ord < 0xA0 ||
$ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
695 $ord2 = ord(substr($str, $a, 1));
696 $ord = $ord * 256 +
$ord2;
700 if (isset($this->parsedCharsets
[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the $this->noCharByteVal placeholder
701 $outStr .= $this->parsedCharsets
[$charset]['local'][$ord];
703 $outStr .= chr($this->noCharByteVal
);
707 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
714 * Converts $str from UTF-8 to $charset
716 * @param string String in UTF-8 to convert to local charset
717 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
718 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
719 * @return string Output string, converted to local charset
// Converts the UTF-8 string $str to the local charset $charset.
// Each multibyte sequence is collected in $buf and looked up in
// $this->parsedCharsets[$charset]['utf8']. A sequence without an entry becomes
// '&#' . utf8CharToUnumber($buf, 1) . ';' when $useEntityForNoChar is set and
// chr($this->noCharByteVal) otherwise; a byte above 127 that cannot start a
// sequence is replaced by chr($this->noCharByteVal) as well.
// NOTE(review): the lines that initialise $outStr and $ord and that return the
// result are not visible here.
721 function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
723 if ($charset === 'utf-8') {
727 // Charset is case-insensitive.
728 if ($this->initCharset($charset)) { // Parse conv. table if not already...
729 $strLen = strlen($str);
// $a is the byte offset into $str. $i is incremented once per loop pass but is
// not read anywhere in the visible lines.
732 for ($a = 0, $i = 0; $a < $strLen; $a++
, $i++
) { // Traverse each char in UTF-8 string.
733 $chr = substr($str, $a, 1);
735 if ($ord > 127) { // This means multibyte! (first byte!)
736 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
738 $buf = $chr; // Add first byte
// Every further high bit set in the lead byte announces one more byte that
// belongs to the sequence.
739 for ($b = 0; $b < 8; $b++
) { // for each byte in multibyte string...
740 $ord = $ord << 1; // Shift it left and ...
741 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
742 $a++
; // Increase pointer...
743 $buf .= substr($str, $a, 1); // ... and add the next char.
749 if (isset($this->parsedCharsets
[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
750 $mByte = $this->parsedCharsets
[$charset]['utf8'][$buf]; // The local number
751 if ($mByte > 255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
752 $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
754 $outStr .= chr($mByte);
756 } elseif ($useEntityForNoChar) { // Create num entity:
757 $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
759 $outStr .= chr($this->noCharByteVal
);
762 $outStr .= chr($this->noCharByteVal
);
763 } // No char exists (MIDDLE of MB sequence!)
766 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
773 * Converts all chars > 127 to numeric entities.
775 * @param string Input string
776 * @return string Output string
// Rewrites every multibyte UTF-8 sequence in $str as a numeric entity of the
// form '&#' . utf8CharToUnumber($buf, 1) . ';'. Because the helper is called
// with its hex flag set, the number part is 'x' followed by hex digits.
// A byte above 127 that cannot start a sequence is replaced by
// chr($this->noCharByteVal).
// NOTE(review): the lines that initialise $outStr and $ord and that return the
// result are not visible here.
778 function utf8_to_entities($str) {
779 $strLen = strlen($str);
782 for ($a = 0; $a < $strLen; $a++
) { // Traverse each char in UTF-8 string.
783 $chr = substr($str, $a, 1);
785 if ($ord > 127) { // This means multibyte! (first byte!)
786 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
787 $buf = $chr; // Add first byte
// Every further high bit set in the lead byte announces one more byte that
// belongs to the sequence.
788 for ($b = 0; $b < 8; $b++
) { // for each byte in multibyte string...
789 $ord = $ord << 1; // Shift it left and ...
790 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
791 $a++
; // Increase pointer...
792 $buf .= substr($str, $a, 1); // ... and add the next char.
798 $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
800 $outStr .= chr($this->noCharByteVal
);
801 } // No char exists (MIDDLE of MB sequence!)
804 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
811 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
813 * @param string Input string, UTF-8
814 * @param boolean If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
815 * @return string Output string
// Replaces entities in $str by the UTF-8 characters they stand for.
// Numeric entities (decimal, or hexadecimal with a leading 'x') are converted
// with $this->UnumberToChar(). Named entities are converted only when
// $alsoStdHtmlEnt is set, using the flipped get_html_translation_table().
// Anything else that merely looks like an entity is written back unchanged as
// '&' . $v . ';'.
817 function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
818 // Workaround for #39287: 3rd parameter for get_html_translation_table() was only added in PHP 5.3.4 and later
819 // see http://php.net/manual/en/function.get-html-translation-table.php
820 $applyPhpCompatibilityFix = version_compare(phpversion(), '5.3.4', '<');
822 if ($alsoStdHtmlEnt) {
823 if ($applyPhpCompatibilityFix === TRUE) {
824 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES
, ENT_COMPAT
));
826 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES
, ENT_COMPAT
, 'UTF-8'));
// A random token is wrapped around the inner text of each entity, so that
// explode() yields plain text and entity names as separate parts.
830 $token = md5(microtime());
831 $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
832 foreach ($parts as $k => $v) {
833 // only take every second element
// NOTE(review): the lines that skip the plain-text parts and that set
// $position are not visible here.
839 if (substr($v, $position, 1) == '#') { // Dec or hex entities:
841 if (substr($v, $position, 1) == 'x') {
842 $v = hexdec(substr($v, ++
$position));
844 $v = substr($v, $position);
846 $parts[$k] = $this->UnumberToChar($v);
847 } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
848 $v = $trans_tbl['&' . $v . ';'];
// The table obtained without a charset argument is treated as iso-8859-1, so
// its characters are converted to UTF-8 here.
849 if ($applyPhpCompatibilityFix === TRUE) {
850 $v = $this->utf8_encode($v, 'iso-8859-1');
853 } else { // No conversion:
854 $parts[$k] = '&' . $v . ';';
858 return implode('', $parts);
862 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
864 * @param string Input string, UTF-8
865 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
866 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
867 * @return array Output array with the char numbers
// Splits the UTF-8 string $str into its characters and collects one entry per
// character in $outArr: the character itself when $retChar is set, otherwise
// its number (utf8CharToUnumber() for multibyte sequences, the byte value for
// single bytes). A byte above 127 that cannot start a sequence yields the
// $this->noCharByteVal placeholder.
// NOTE(review): the condition guarding the entities_to_utf8() call, the
// initialisation of $outArr and $ord, and the return statement are not visible
// here; presumably the entity conversion runs only when $convEntities is set.
869 function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
870 // If entities must be registered as well...:
872 $str = $this->entities_to_utf8($str, 1);
875 $strLen = strlen($str);
878 for ($a = 0; $a < $strLen; $a++
) { // Traverse each char in UTF-8 string.
879 $chr = substr($str, $a, 1);
881 if ($ord > 127) { // This means multibyte! (first byte!)
882 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
883 $buf = $chr; // Add first byte
// Every further high bit set in the lead byte announces one more byte that
// belongs to the sequence.
884 for ($b = 0; $b < 8; $b++
) { // for each byte in multibyte string...
885 $ord = $ord << 1; // Shift it left and ...
886 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
887 $a++
; // Increase pointer...
888 $buf .= substr($str, $a, 1); // ... and add the next char.
894 $outArr[] = $retChar ?
$buf : $this->utf8CharToUnumber($buf);
896 $outArr[] = $retChar ?
chr($this->noCharByteVal
) : $this->noCharByteVal
;
897 } // No char exists (MIDDLE of MB sequence!)
899 $outArr[] = $retChar ?
chr($ord) : $ord;
900 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
907 * Converts a UNICODE number to a UTF-8 multibyte character
908 * Algorithm based on script found at From: http://czyborra.com/utf/
909 * Unit-tested by Kasper
911 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
913 * bytes | bits | representation
915 * 2 | 11 | 110vvvvv 10vvvvvv
916 * 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
917 * 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
918 * 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
919 * 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
921 * @param integer UNICODE integer
922 * @return string UTF-8 multibyte character string
923 * @see utf8CharToUnumber()
/**
 * Converts a UNICODE number to a UTF-8 multibyte character.
 *
 * The bits of the number are spread over 1 to 6 bytes; the count of leading
 * 1-bits in the first byte tells how many bytes the sequence has, and every
 * following byte carries six payload bits behind the marker bits "10".
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
		// 1 byte, 7 bits: plain ASCII
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// 2 bytes, 11 bits
	if ($cbyte < 0x800) {
		return chr(0xC0 | ($cbyte >> 6))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// 3 bytes, 16 bits
	if ($cbyte < 0x10000) {
		return chr(0xE0 | ($cbyte >> 12))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// 4 bytes, 21 bits
	if ($cbyte < 0x200000) {
		return chr(0xF0 | ($cbyte >> 18))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// 5 bytes, 26 bits
	if ($cbyte < 0x4000000) {
		return chr(0xF8 | ($cbyte >> 24))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// 6 bytes, 31 bits
	if ($cbyte < 0x80000000) {
		return chr(0xFC | ($cbyte >> 30))
			. chr(0x80 | (($cbyte >> 24) & 0x3F))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

		// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
972 * Converts a UTF-8 Multibyte character to a UNICODE number
973 * Unit-tested by Kasper
975 * @param string UTF-8 multibyte character string
976 * @param boolean If set, then a hex. number is returned.
977 * @return integer UNICODE integer
978 * @see UnumberToChar()
// Converts one UTF-8 character (given as a byte string) to its number.
// For a multibyte sequence the payload bits are gathered as a string of binary
// digits in $binBuf - six bits from each following byte, the remaining low
// bits from the lead byte - and then turned into a number with bindec().
// Returns that number, or 'x' followed by its hex digits when $hex is set.
// NOTE(review): the initialisation of $binBuf and the branch taken for a
// single-byte character are not visible here.
980 function utf8CharToUnumber($str, $hex = 0) {
981 $ord = ord(substr($str, 0, 1)); // First char
983 if (($ord & 192) == 192) { // This verifies that it IS a multi byte string
985 for ($b = 0; $b < 8; $b++
) { // for each byte in multibyte string...
986 $ord = $ord << 1; // Shift it left and ...
987 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
988 $binBuf .= substr('00000000' . decbin(ord(substr($str, $b +
1, 1))), -6);
// $b now counts the following bytes; the lead byte contributes its lowest
// (6 - $b) bits, which are put in front of the collected payload.
993 $binBuf = substr('00000000' . decbin(ord(substr($str, 0, 1))), -(6 - $b)) . $binBuf;
995 $int = bindec($binBuf);
1000 return $hex ?
'x' . dechex($int) : $int;
1004 /********************************************
1008 ********************************************/
1011 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
1012 * This function is automatically called by the conversion functions
1014 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
1016 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
1017 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
// Makes sure the conversion table for $charset is present in
// $this->parsedCharsets[$charset].
// The table file is PATH_t3lib . 'csconvtbl/' . $charset . '.tbl'. A serialized
// copy under 'typo3temp/cs/charset_<charset>.tbl' is used when it exists;
// otherwise the table file is parsed line by line and the result is written to
// that cache file.
// The parsed table has two maps: 'local' (local char number => UTF-8 string)
// and 'utf8' (UTF-8 string => local char number). Only local numbers above 127
// are stored.
// NOTE(review): the return statements are not visible here.
1020 function initCharset($charset) {
1021 // Only process if the charset is not yet loaded:
1022 if (!is_array($this->parsedCharsets
[$charset])) {
1024 // Conversion table filename:
1025 $charsetConvTableFile = PATH_t3lib
. 'csconvtbl/' . $charset . '.tbl';
1027 // If the conversion table is found:
1028 if ($charset && t3lib_div
::validPathStr($charsetConvTableFile) && @is_file
($charsetConvTableFile)) {
1029 // Cache file for charsets:
1030 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
1031 $cacheFile = t3lib_div
::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
1032 if ($cacheFile && @is_file
($cacheFile)) {
// NOTE(review): the cache content is passed to unserialize() as read from
// disk - presumably only this class writes that file; confirm.
1033 $this->parsedCharsets
[$charset] = unserialize(t3lib_div
::getUrl($cacheFile));
1035 // Parse conversion table into lines:
1036 $lines = t3lib_div
::trimExplode(LF
, t3lib_div
::getUrl($charsetConvTableFile), 1);
1037 // Initialize the internal variable holding the conv. table:
1038 $this->parsedCharsets
[$charset] = array('local' => array(), 'utf8' => array());
1039 // traverse the lines:
1041 foreach ($lines as $value) {
1042 if (trim($value) && substr($value, 0, 1) != '#') { // Comment line or blanks are ignored.
1044 // Detect type if not done yet: (Done on first real line)
1045 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
1046 if (!$detectedType) {
1047 $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ?
'whitespaced' : 'ms-token';
// Both formats end up as $hexbyte (local number, hex digits) and $utf8
// ('U+' followed by the hex digits of the Unicode number).
1050 if ($detectedType == 'ms-token') {
1051 list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
1052 } elseif ($detectedType == 'whitespaced') {
1054 preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA);
1055 $hexbyte = $regA[1];
1056 $utf8 = 'U+' . $regA[2];
1058 $decval = hexdec(trim($hexbyte));
// Numbers up to 127 are not stored in the table.
1059 if ($decval > 127) {
// substr(..., 2) drops the 'U+' prefix before the hex conversion.
1060 $utf8decval = hexdec(substr(trim($utf8), 2));
1061 $this->parsedCharsets
[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
1062 $this->parsedCharsets
[$charset]['utf8'][$this->parsedCharsets
[$charset]['local'][$decval]] = $decval;
1067 t3lib_div
::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets
[$charset]));
/**
 * This function initializes all UTF-8 character data tables.
 *
 * Depending on $mode the case folding table ($this->caseFolding['utf-8']) or the
 * ASCII transliteration table ($this->toASCII['utf-8']) is taken from memory, from
 * the cache file in typo3temp/cs/, or both tables are rebuilt from the Unicode data
 * files below PATH_t3lib . 'unidata/' and written back to the cache.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initUnicodeData($mode = NULL) {
	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the requested table is not yet loaded
	switch ($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
				return 1;
			}

			// Use cached version if possible; an unreadable cache falls through to a rebuild
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cached)) {
					$this->caseFolding['utf-8'] = $cached;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
				return 1;
			}

			// Use cached version if possible; an unreadable cache falls through to a rebuild
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cached)) {
					$this->toASCII['utf-8'] = $cached;
					return 2;
				}
			}
			break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		// fgets() yields FALSE at end of file; blank lines carry no record
		if ($line === FALSE || trim($line) === '') {
			continue;
		}

		// has a lot of info; padded so that a short line cannot leave a field undefined
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = array_pad(explode(';', rtrim($line)), 15, '');

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			break; // only process the BMP
		}

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

		// the first letter of the general category decides
		switch (substr($cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') {
					$number["U+$char"] = $num;
				}
				break;
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				$c += 32; // the name always spells the capital letter
			}

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

				// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			$toUtf8 = array($this, 'UnumberToChar');
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE || trim($line) === '' || $line[0] == '#') {
					continue;
				}

				list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				// only unconditional mappings are used (the last field is empty or a comment)
				if ($cond == '' || $cond[0] == '#') {
					$utf8_char = $this->UnumberToChar(hexdec($char));
					if ($char != $lower) {
						$utf8CaseFolding['toLower'][$utf8_char] = implode('', array_map($toUtf8, array_map('hexdec', explode(' ', $lower))));
					}
					if ($char != $title && $title != $upper) {
						$utf8CaseFolding['toTitle'][$utf8_char] = implode('', array_map($toUtf8, array_map('hexdec', explode(' ', $title))));
					}
					if ($char != $upper) {
						$utf8CaseFolding['toUpper'][$utf8_char] = implode('', array_map($toUtf8, array_map('hexdec', explode(' ', $upper))));
					}
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE || trim($line) === '' || $line[0] == '#') {
					continue;
				}

				list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit) {
					$omit["U+$char"] = 1;
				}
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2; // skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp, chr($ord));
		}
		$ascii[$this->UnumberToChar(hexdec(str_replace('U+', '0x', $from)))] = join('', $code_decomp);
	}

	// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(str_replace('U+', '0x', $from)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the
 * charset's conversion table is folded in UTF-8 and the result converted back.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initCaseFolding($charset) {
	// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
		return 1;
	}

	// Use cached version if possible; an unreadable cache falls through to a rebuild
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}

	// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return FALSE;
	}

	$this->caseFolding[$charset] = array();
	$nochar = chr($this->noCharByteVal);
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach (array('toUpper', 'toLower', 'toTitle') as $direction) {
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
				continue; // character has no folding in this direction
			}
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			// only keep foldings that can be expressed in the charset
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

	// add the ASCII case table
	for ($i = ord('a'); $i <= ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
	}
	for ($i = ord('A'); $i <= ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 transliteration table via the charset's
 * conversion table.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initToASCII($charset) {
	// Only process if the transliteration table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
		return 1;
	}

	// Use cached version if possible; an unreadable cache falls through to a rebuild
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->toASCII[$charset] = $cached;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return FALSE;
	}

	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return FALSE;
	}

	// start from an empty table so that "nothing to map" still counts as loaded
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8])) {
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
/********************************************
 *
 * String operation functions
 *
 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Dispatches to mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise to the
 * charset specific implementation of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "up to the end"
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		if ($len == NULL) {
			// the charset can only be given together with a length: use the
			// full length instead of switching the global internal encoding
			$len = mb_strlen($string, $charset);
		}
		return mb_substr($string, $start, $len, $charset);
	} elseif ($utils == 'iconv') {
		if ($len == NULL) {
			// see above, same restriction for iconv_substr()
			$len = iconv_strlen($string, $charset);
		}
		return iconv_substr($string, $start, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string, $start, $charset, $len);
	} elseif (!empty($this->twoByteSets[$charset])) {
		// an omitted length must not be multiplied into a length of 0
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}

	// treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring') {
		return mb_strlen($string, $charset);
	} elseif ($utils == 'iconv') {
		return iconv_strlen($string, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string, $charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		return strlen($string) / 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		return strlen($string) / 4;
	}

	// treat everything else as single-byte encoding
	return strlen($string);
}
/**
 * Method to crop strings using the mb_substr function.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. Strings that are
 * not longer than abs($len) are returned unchanged.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	$characters = mb_strlen($string, $charset);
	if (intval($len) === 0 || $characters <= abs($len)) {
		return $string;
	}

	if ($len > 0) {
		$string = mb_substr($string, 0, $len, $charset) . $crop;
	} else {
		$string = $crop . mb_substr($string, $len, $characters, $charset);
	}

	return $string;
}
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged when it is not longer than abs($len) or when $len is 0.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

	// translate the character position into a byte position
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string) + $len;
		if ($i <= 0) {
			$i = FALSE;
		}
	}

	if ($i === FALSE) { // $len outside actual string length
		return $string;
	}

	$i = (int) $i;
	if ($len > 0) {
		// only crop when there is something behind the cut
		if (isset($string[$i])) {
			return substr($string, 0, $i) . $crop;
		}
	} elseif (isset($string[$i - 1])) {
		// only crop when there is something in front of the cut
		return $crop . substr($string, $i);
	}

	return $string;
}
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never splits a character: for multi-byte charsets the result may be
 * shorter than $len bytes.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strtrunc($string, $len, $charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) {
			$len--; // don't cut at odd positions
		}
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4; // realign to position dividable by four
	}

	// treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		if ($case == 'toLower') {
			return mb_strtolower($string, $charset);
		}
		return mb_strtoupper($string, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
/**
 * Equivalent of lcfirst/ucfirst but using character set.
 *
 * Only the first character is converted, the remainder is returned as it is.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword as accepted by conv_case()
 * @return string The string with a converted first character
 * @see t3lib_cs::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
	$firstChar = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
	$remainder = $this->substr($charset, $string, 1);

	return $firstChar . $remainder;
}
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * Entries are tried by descending quality; entries of equal quality are tried in
 * the order given by the client. For each entry the full code is tried first,
 * then the code without its country part.
 *
 * @param string $languageCodesList List of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *			 see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

	// get all languages where TYPO3 code differs from ISO code
	// or needs the country part
	// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = join('-', explode('_', $isoLang));
	}

	// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

	// collect the quality of every requested language, default quality is 1.0
	$qualityByLanguage = array();
	foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$qualityByLanguage[$preferredLanguage] = (float) $quality;
	}

	// highest quality first; the position in the list breaks ties, which a
	// plain arsort() does not guarantee
	$languages = array_keys($qualityByLanguage);
	$qualities = array_values($qualityByLanguage);
	$positions = array_keys($languages);
	array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);

	foreach ($languages as $preferredLanguage) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

		// strip the country code from the end
		$codeParts = explode('-', $preferredLanguage);
		$preferredLanguage = $codeParts[0];
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}
	}

	// English is the default language of TYPO3
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}

	return $selectedLanguage;
}
/********************************************
 *
 * Internal string operation functions
 *
 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Bytes without an entry in the selected table are copied unchanged. The string
 * is returned as it is when the table cannot be initialized or $mode is unknown.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str; // do nothing
			}
			// read by value: a reference would create a missing $opt entry in the table
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str; // do nothing
			}
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
/********************************************
 *
 * Internal UTF-8 string operation functions
 *
 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters)
 * @return string The substring, FALSE when a positive $start lies outside the string
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
	if ($len !== NULL && (string) $len === '0') {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
		// a negative $start that reaches before the string starts at its beginning
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end === FALSE) { // $len outside actual string length
		// When length is less than zero and exceeds, then we return blank string.
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a continuation byte (10xxxxxx) counts as one character.
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * A multi-byte character that does not fit completely into $len bytes is dropped,
 * so the result may be shorter than $len bytes.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
	if ($len <= 0) {
		return '';
	}
	if ($len >= strlen($str)) {
		return $str; // nothing to cut
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
		// find the first byte of the sequence (11xxxxxx)
		while ($i > 0 && !(ord($str[$i]) & 0x40)) {
			$i--;
		}

		// the number of leading 1-bits is the number of bytes of the sequence
		for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) {
			$bc++;
		}

		if ($bc + $i > $len) {
			return substr($str, 0, $i); // the character does not fit: cut in front of it
		}
		// fallthru: multibyte char fits into length
	}

	return substr($str, 0, $len);
}
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search
 * @return integer The character position, FALSE when the needle is not found
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
	if ($byte_offset === FALSE) {
		return FALSE; // offset beyond string length
	}

	$byte_pos = strpos($haystack, $needle, $byte_offset);
	if ($byte_pos === FALSE) {
		return FALSE; // needle not found
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, FALSE when the needle is not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// the third argument of mb_strrpos() is the offset, the encoding comes fourth
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		return FALSE; // needle not found
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * Positive positions are counted from the start of the string, negative ones from
 * its end. A position that does not address an existing character (this includes
 * a position equal to the number of characters) yields FALSE.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position, FALSE if $pos lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
	$length = strlen($str);
	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}

	// explicit bounds: a negative string offset must never be read
	while ($i >= 0 && $i < $length && $n < $p) {
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
		$i += $d;
	}

	if ($i < 0 || $i >= $length) {
		return FALSE; // offset beyond string length
	}

	if ($pos >= 0) {
		// skip trailing multi-byte data bytes
		while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
		// correct offset: the loop stepped one byte in front of the wanted character
		$i++;
	}

	return $i;
}
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * The byte position is expected to address the first byte of a character.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer Character position, FALSE for an empty string or a position outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
	$length = strlen($str);
	if ($length === 0 || $pos < 0 || $pos > $length) {
		return FALSE; // offset beyond string length
	}

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
		// the position directly behind the string counts like a single-byte character
		$c = $i < $length ? ord($str[$i]) : 0;
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if (($c & 0xC0) != 0x80) {
			$n++;
		}
	}

	return $n;
}
/**
 * Maps all characters of an UTF-8 string.
 *
 * Characters without an entry in the selected table are copied unchanged. The
 * string is returned as it is when the tables cannot be initialized or $mode is
 * unknown.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		return $str; // do nothing
	}

	switch ($mode) {
		case 'case':
			// read by value: a reference would create a missing $opt entry in the table
			$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
			break;

		case 'ascii':
			$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
			break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
			// calculate number of bytes from the number of leading 1-bits
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// single-byte (0xxxxxxx); a stray data byte (10xxxxxx) is passed
			// through too instead of repeating the previous character
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
/********************************************
 *
 * Internal EUC string operation functions
 *
 * Extended Unix Code:
 *  ASCII compatible 7bit single bytes chars
 *  8bit two byte chars
 *
 * Shift-JIS is treated as a special case.
 *
 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * A double-byte character that would be split by the cut is dropped, so the
 * result may be one byte shorter than $len.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len The byte length
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
	if (strlen($str) <= $len) {
		return $str; // string shorter than supplied length
	}

	$sjis = ($charset == 'shift_jis');
	$i = 0;
	while ($i < $len) {
		$c = ord($str[$i]);
		if ($sjis) {
			$doubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$doubleByte = ($c >= 0x80);
		}
		$i += $doubleByte ? 2 : 1; // advance one character
	}

	if ($i > $len) {
		return substr($str, 0, $len - 1); // we ended on a first byte
	}

	return substr($str, 0, $len);
}
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param integer $start Start position (character position)
 * @param string $charset The charset
 * @param integer $len Length (in characters)
 * @return string The substring, FALSE when $start lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
	// a length of zero is an empty result, not an omitted length
	if ($len !== NULL && (string) $len === '0') {
		return '';
	}

	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		return FALSE; // $start outside string length
	}

	$str = substr($str, $byte_start);

	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) { // $len outside actual string length
		// a negative length that exceeds the string leaves nothing
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);
	$n = 0;
	for ($i = 0; $i < $length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$doubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$doubleByte = ($c >= 0x80);
		}
		if ($doubleByte) {
			$i++; // advance a double-byte char
		}

		$n++;
	}

	return $n;
}
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * Positive positions are counted from the start of the string, negative ones from
 * its end. A position that does not address an existing character yields FALSE.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return integer Byte position, FALSE if $pos lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str, $pos, $charset) {
	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);
	$n = 0; // number of characters seen
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length - 1;
		$d = -1;
	}

	// explicit bounds: a negative string offset must never be read
	while ($i >= 0 && $i < $length && $n < $p) {
		$c = ord($str[$i]);
		if ($sjis) {
			$doubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$doubleByte = ($c >= 0x80);
		}
		if ($doubleByte) {
			$i += $d; // advance a double-byte char
		}

		$n++;
		$i += $d;
	}

	if ($i < 0 || $i >= $length) {
		return FALSE; // offset beyond string length
	}

	if ($pos < 0) {
		$i++; // correct offset
	}

	return $i;
}
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Characters without an entry in the selected table are copied unchanged. The
 * string is returned as it is when the table cannot be initialized or $mode is
 * unknown.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str; // do nothing
			}
			// read by value: a reference would create a missing $opt entry in the table
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str; // do nothing
			}
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$length = strlen($str);
	for ($i = 0; $i < $length; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$doubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$doubleByte = ($c >= 0x80);
		}
		if ($doubleByte) { // a double-byte char
			$mbc = substr($str, $i, 2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
// Load the extension class (XCLASS) registered for this file, if any.
if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}