2 /***************************************************************
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
25 * Class for conversion between charsets.
29 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
33 * [CLASS/FUNCTION INDEX of SCRIPT]
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
87 * (This index is automatically created/updated by the extension "extdeveval")
95 * Functions working on UTF-8 strings:
100 * - implode/explode/join
102 * Functions nearly working on UTF-8 strings:
104 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
105 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
106 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
107 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 * - preg_*: UTF-8 support is compiled into PHP's PCRE by default nowadays, but could be unavailable; patterns working on UTF-8 strings need to use the "u" modifier
110 * Functions NOT working on UTF-8 strings:
122 * Class for conversion between charsets
124 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
125 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
130 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
132 // This is the array where parsed conversion tables are stored (cached)
133 var $parsedCharsets = array();
135 // An array where case folding data will be stored (cached)
136 var $caseFolding = array();
138 // An array where charset-to-ASCII mappings are stored (cached)
139 var $toASCII = array();
141 // This tells the converter which charsets has two bytes per char:
142 var $twoByteSets = array(
143 'ucs-2' => 1, // 2-byte Unicode
146 // This tells the converter which charsets has four bytes per char:
147 var $fourByteSets = array(
148 'ucs-4' => 1, // 4-byte Unicode
149 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
152 // This tells the converter which charsets use a scheme like the Extended Unix Code:
153 var $eucBasedSets = array(
154 'gb2312' => 1, // Chinese, simplified.
155 'big5' => 1, // Chinese, traditional.
156 'euc-kr' => 1, // Korean
157 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
160 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
161 // http://czyborra.com/charsets/iso8859.html
162 var $synonyms = array(
164 'us-ascii' => 'ascii',
165 'cp819' => 'iso-8859-1',
166 'ibm819' => 'iso-8859-1',
167 'iso-ir-100' => 'iso-8859-1',
168 'iso-ir-101' => 'iso-8859-2',
169 'iso-ir-109' => 'iso-8859-3',
170 'iso-ir-110' => 'iso-8859-4',
171 'iso-ir-144' => 'iso-8859-5',
172 'iso-ir-127' => 'iso-8859-6',
173 'iso-ir-126' => 'iso-8859-7',
174 'iso-ir-138' => 'iso-8859-8',
175 'iso-ir-148' => 'iso-8859-9',
176 'iso-ir-157' => 'iso-8859-10',
177 'iso-ir-179' => 'iso-8859-13',
178 'iso-ir-199' => 'iso-8859-14',
179 'iso-ir-203' => 'iso-8859-15',
180 'csisolatin1' => 'iso-8859-1',
181 'csisolatin2' => 'iso-8859-2',
182 'csisolatin3' => 'iso-8859-3',
183 'csisolatin5' => 'iso-8859-9',
184 'csisolatin8' => 'iso-8859-14',
185 'csisolatin9' => 'iso-8859-15',
186 'csisolatingreek' => 'iso-8859-7',
187 'iso-celtic' => 'iso-8859-14',
188 'latin1' => 'iso-8859-1',
189 'latin2' => 'iso-8859-2',
190 'latin3' => 'iso-8859-3',
191 'latin5' => 'iso-8859-9',
192 'latin6' => 'iso-8859-10',
193 'latin8' => 'iso-8859-14',
194 'latin9' => 'iso-8859-15',
195 'l1' => 'iso-8859-1',
196 'l2' => 'iso-8859-2',
197 'l3' => 'iso-8859-3',
198 'l5' => 'iso-8859-9',
199 'l6' => 'iso-8859-10',
200 'l8' => 'iso-8859-14',
201 'l9' => 'iso-8859-15',
202 'cyrillic' => 'iso-8859-5',
203 'arabic' => 'iso-8859-6',
204 'tis-620' => 'iso-8859-11',
205 'win874' => 'windows-874',
206 'win1250' => 'windows-1250',
207 'win1251' => 'windows-1251',
208 'win1252' => 'windows-1252',
209 'win1253' => 'windows-1253',
210 'win1254' => 'windows-1254',
211 'win1255' => 'windows-1255',
212 'win1256' => 'windows-1256',
213 'win1257' => 'windows-1257',
214 'win1258' => 'windows-1258',
215 'cp1250' => 'windows-1250',
216 'cp1251' => 'windows-1251',
217 'cp1252' => 'windows-1252',
218 'ms-ee' => 'windows-1250',
219 'ms-ansi' => 'windows-1252',
220 'ms-greek' => 'windows-1253',
221 'ms-turk' => 'windows-1254',
222 'winbaltrim' => 'windows-1257',
223 'koi-8ru' => 'koi-8r',
227 'macintosh' => 'macroman',
228 'euc-cn' => 'gb2312',
229 'x-euc-cn' => 'gb2312',
235 'sjis' => 'shift_jis',
236 'shift-jis' => 'shift_jis',
237 'cp932' => 'shift_jis',
// Mapping of language identifiers (ISO 639-1 codes, MS language codes and
// English language names) to script names. The script names are the keys of
// $script_to_charset_unix / $script_to_charset_windows.
var $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (until 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',

	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	// The value used to be misspelt 'west_euorpean', which is not a key of the
	// script-to-charset tables.
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'khm' => 'unicode', // Khmer
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'ukr' => 'cyrillic', // Ukrainian

	// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	// Same script as the 'bg' and 'bgr' codes above (was 'east_european').
	'bulgarian' => 'cyrillic',
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	// Historic misspelling of "swedish"; kept so existing lookups keep working.
	'svedish' => 'west_european',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
409 // mapping of language (family) names to charsets on Unix
410 var $script_to_charset_unix = array(
411 'west_european' => 'iso-8859-1',
412 'estonian' => 'iso-8859-1',
413 'east_european' => 'iso-8859-2',
414 'baltic' => 'iso-8859-4',
415 'cyrillic' => 'iso-8859-5',
416 'arabic' => 'iso-8859-6',
417 'greek' => 'iso-8859-7',
418 'hebrew' => 'iso-8859-8',
419 'turkish' => 'iso-8859-9',
420 'thai' => 'iso-8859-11', // = TIS-620
421 'lithuanian' => 'iso-8859-13',
422 'chinese' => 'gb2312', // = euc-cn
423 'japanese' => 'euc-jp',
424 'korean' => 'euc-kr',
425 'simpl_chinese' => 'gb2312',
426 'trad_chinese' => 'big5',
428 'unicode' => 'utf-8',
429 'albanian' => 'utf-8'
432 // mapping of language (family) names to charsets on Windows
433 var $script_to_charset_windows = array(
434 'east_european' => 'windows-1250',
435 'cyrillic' => 'windows-1251',
436 'west_european' => 'windows-1252',
437 'greek' => 'windows-1253',
438 'turkish' => 'windows-1254',
439 'hebrew' => 'windows-1255',
440 'arabic' => 'windows-1256',
441 'baltic' => 'windows-1257',
442 'estonian' => 'windows-1257',
443 'lithuanian' => 'windows-1257',
444 'vietnamese' => 'windows-1258',
447 'chinese' => 'gb2312',
448 'japanese' => 'shift_jis',
449 'simpl_chinese' => 'gb2312',
450 'trad_chinese' => 'big5',
451 'albanian' => 'windows-1250',
// Mapping of complete locale names to charsets.
// get_locale_charset() lowercases the locale before looking it up here, so
// keys have to be lowercase to be found.
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	// Lowercase form: the one a lowercased locale can actually match.
	'sr@latn' => 'iso-8859-2',
	// Original mixed-case key, kept for code that reads this table directly.
	'sr@Latn' => 'iso-8859-2',
);
466 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
467 // Empty values means "iso-8859-1"
468 var $charSetArray = array(
476 'cz' => 'windows-1250',
477 'pl' => 'iso-8859-2',
478 'si' => 'windows-1250',
480 'tr' => 'iso-8859-9',
483 'ru' => 'windows-1251',
484 'ro' => 'iso-8859-2',
486 'sk' => 'windows-1250',
487 'lt' => 'windows-1257',
489 'hr' => 'windows-1250',
490 'hu' => 'iso-8859-2',
492 'th' => 'iso-8859-11',
493 'gr' => 'iso-8859-7',
496 'bg' => 'windows-1251',
498 'et' => 'iso-8859-4',
499 'ar' => 'iso-8859-6',
501 'ua' => 'windows-1251',
505 'ca' => 'iso-8859-15',
506 'ba' => 'iso-8859-2',
521 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
522 // Missing keys means: same as Typo3
523 var $isoArray = array(
/**
 * Normalizes a charset name: trims and lowercases it and replaces it with
 * its canonical name if it is listed as a key in $this->synonyms.
 *
 * @param string Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$charset = trim(strtolower($charset));
	if (isset($this->synonyms[$charset])) {
		$charset = $this->synonyms[$charset];
	}

	return $charset;
}
/**
 * Get the charset of a locale.
 *
 * Handled locale forms:
 * ln_CN          language / country
 * ln_CN.cs       language / country / charset
 * ln_CN.cs@mod   language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, charset given
 * in the locale itself, the "euro" modifier, and finally the script of the
 * language mapped to the platform's default charset for that script.
 *
 * @param string Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	// Exact locale specific charset? Try the locale as given first: the table
	// may contain mixed-case keys which a lowercased locale can never match.
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

	$locale = strtolower($locale);
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

	// Split off the modifier. array_pad() guarantees both list() targets
	// exist when the locale contains no "@".
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

	// locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

	// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier === 'euro') {
		return 'iso-8859-15';
	}

	// The language is the part before the first "_"; the country is not used.
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	// Pick the platform specific table; unknown scripts fall back to the
	// west european default of that platform.
	if (TYPO3_OS == 'WIN') {
		$scriptToCharset = $this->script_to_charset_windows;
		$defaultCharset = 'windows-1252';
	} else {
		$scriptToCharset = $this->script_to_charset_unix;
		$defaultCharset = 'iso-8859-1';
	}

	return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $defaultCharset;
}
609 /********************************************
611 * Charset Conversion functions
613 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * If the method configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] is available and
 * succeeds, its result is returned. Otherwise the string is converted via
 * UTF-8 using utf8_encode() / utf8_decode() of this class.
 *
 * @param string Input string
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	if ($fromCS == $toCS) {
		return $str;
	}

	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS == 'utf-8' || !$useEntityForNoChar) {
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
			: '';

		// NOTE(review): the case labels below name the PHP extension each
		// branch calls; confirm them against the values allowed for the
		// t3lib_cs_convMethod setting.
		$conv_str = FALSE;
		switch ($convMethod) {
			case 'mbstring':
				if (function_exists('mb_convert_encoding')) {
					// returns false for unsupported charsets
					$conv_str = mb_convert_encoding($str, $toCS, $fromCS);
				}
				break;

			case 'iconv':
				if (function_exists('iconv')) {
					$conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
				}
				break;

			case 'recode':
				if (function_exists('recode_string')) {
					$conv_str = recode_string($fromCS . '..' . $toCS, $str);
				}
				break;
		}
		if (FALSE !== $conv_str) {
			return $conv_str;
		}
		// fallback to TYPO3 conversion
	}

	if ($fromCS != 'utf-8') {
		$str = $this->utf8_encode($str, $fromCS);
	}
	if ($toCS != 'utf-8') {
		$str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
	}

	return $str;
}
/**
 * Convert all elements in ARRAY with type string from one charset to another charset.
 * Nested arrays are converted recursively; values of any other type are left untouched.
 * NOTICE: Array is passed by reference!
 *
 * @param array Input array, possibly multidimensional
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	// The elements are addressed through $array[$key] so that the changes
	// end up in the referenced array, not in the loop copy.
	foreach ($array as $key => $value) {
		if (is_array($array[$key])) {
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		} elseif (is_string($array[$key])) {
			$array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes 0-127 are passed through unchanged (except in two-byte charsets).
 * Every other character is looked up in the parsed conversion table;
 * characters without an entry become chr($this->noCharByteVal).
 *
 * @param string String in local charset to convert to UTF-8
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8. NULL if the conversion table for $charset could not be initialized.
 */
function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$strLen = strlen($str);
	$outStr = '';
	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBasedSet = isset($this->eucBasedSets[$charset]);

	// Traverse each char in string.
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($isTwoByteSet) {
			// Two bytes per char, big endian assumed. A missing second byte
			// (odd string length) counts as 0 instead of reading past the end.
			$ord2 = ($a + 1 < $strLen) ? ord($str[$a + 1]) : 0;
			$ord = $ord << 8 | $ord2;
			$a++;
		} elseif ($ord > 127) {
			// EUC uses two bytes above 127; we get both, advance the pointer
			// and make $ord a 16bit int.
			// Shift-JIS: chars between 160 and 223 are single byte
			if ($isEucBasedSet && ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))) {
				$a++;
				$ord2 = ord(substr($str, $a, 1));
				$ord = $ord * 256 + $ord2;
			}
		} else {
			// ... otherwise it's just ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}

		// If the local char-number was found in the parsed conv. table then
		// we use that, otherwise the "no char" replacement.
		if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
			$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
		} else {
			$outStr .= chr($this->noCharByteVal);
		}
	}

	return $outStr;
}
/**
 * Converts $str from UTF-8 to $charset
 *
 * Bytes 0-127 are passed through unchanged. Each multibyte sequence is
 * looked up in the parsed conversion table. Sequences without an entry
 * become a numeric entity (if $useEntityForNoChar is set) or
 * chr($this->noCharByteVal).
 *
 * @param string String in UTF-8 to convert to local charset
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset. NULL if the conversion table for $charset could not be initialized.
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$strLen = strlen($str);
	$outStr = '';

	// Traverse each char in UTF-8 string.
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($ord <= 127) {
			// Just ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}

		// A first byte must have the 7th bit set. Otherwise we are in the
		// middle of a byte sequence and no char exists for it.
		if (!($ord & 64)) {
			$outStr .= chr($this->noCharByteVal);
			continue;
		}

		// Collect the sequence: every further high bit set in the first byte
		// announces one more byte.
		$buf = $chr;
		for ($b = 0; $b < 8; $b++) {
			$ord = $ord << 1;
			if (!($ord & 128)) {
				break;
			}
			$a++;
			$buf .= substr($str, $a, 1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) {
			// The local number
			$mByte = $this->parsedCharsets[$charset]['utf8'][$buf];
			if ($mByte > 255) {
				// Greater than 255: split the number (16bit word assumed) in two chars.
				$outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
			} else {
				$outStr .= chr($mByte);
			}
		} elseif ($useEntityForNoChar) {
			// Create num entity:
			$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
		} else {
			// No char exists
			$outStr .= chr($this->noCharByteVal);
		}
	}

	return $outStr;
}
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each UTF-8 multibyte sequence becomes a hexadecimal entity ("&#x...;").
 * A continuation byte found outside of a sequence becomes
 * chr($this->noCharByteVal).
 *
 * @param string Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$strLen = strlen($str);
	$outStr = '';

	// Traverse each char in UTF-8 string.
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($ord <= 127) {
			// Just ASCII 0-127 and one byte. Transparent
			$outStr .= $chr;
			continue;
		}

		// A first byte must have the 7th bit set. Otherwise we are in the
		// middle of a byte sequence and no char exists for it.
		if (!($ord & 64)) {
			$outStr .= chr($this->noCharByteVal);
			continue;
		}

		// Collect the sequence: every further high bit set in the first byte
		// announces one more byte.
		$buf = $chr;
		for ($b = 0; $b < 8; $b++) {
			$ord = $ord << 1;
			if (!($ord & 128)) {
				break;
			}
			$a++;
			$buf .= substr($str, $a, 1);
		}

		$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
	}

	return $outStr;
}
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Entities that cannot be converted (unknown names, malformed numbers) are
 * left in the string unchanged.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
	// With PREG_SPLIT_DELIM_CAPTURE the result alternates between literal
	// text (even keys) and the entity body without "&" and ";" (odd keys).
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

	foreach ($parts as $k => $v) {
		if ($k % 2 === 0) {
			continue;
		}

		$entity = '&' . $v . ';';
		// No conversion, unless one of the cases below applies:
		$parts[$k] = $entity;

		if (substr($v, 0, 1) === '#') {
			// Dec or hex entities:
			$number = substr($v, 1);
			if (preg_match('/^[xX][0-9a-fA-F]+$/', $number)) {
				$parts[$k] = $this->UnumberToChar(hexdec(substr($number, 1)));
			} elseif (preg_match('/^[0-9]+$/', $number)) {
				$parts[$k] = $this->UnumberToChar((int) $number);
			}
		} elseif ($alsoStdHtmlEnt) {
			// Other entities: decode straight to UTF-8. An unknown name comes
			// back unchanged and is therefore kept as it was.
			$parts[$k] = html_entity_decode($entity, ENT_COMPAT, 'UTF-8');
		}
	}

	return implode('', $parts);
}
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * A continuation byte found outside of a sequence is reported as
 * $this->noCharByteVal (or the corresponding char if $retChar is set).
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

	$strLen = strlen($str);
	$outArr = array();

	// Traverse each char in UTF-8 string.
	for ($a = 0; $a < $strLen; $a++) {
		$chr = substr($str, $a, 1);
		$ord = ord($chr);

		if ($ord <= 127) {
			// Just ASCII 0-127 and one byte. Transparent
			$outArr[] = $retChar ? chr($ord) : $ord;
			continue;
		}

		// A first byte must have the 7th bit set. Otherwise we are in the
		// middle of a byte sequence and no char exists for it.
		if (!($ord & 64)) {
			$outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

		// Collect the sequence: every further high bit set in the first byte
		// announces one more byte.
		$buf = $chr;
		for ($b = 0; $b < 8; $b++) {
			$ord = $ord << 1;
			if (!($ord & 128)) {
				break;
			}
			$a++;
			$buf .= substr($str, $a, 1);
		}

		$outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
	}

	return $outArr;
}
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers outside of 0 .. 0x7FFFFFFF yield chr($this->noCharByteVal).
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	// The number may arrive as a numeric string (e.g. taken from an entity);
	// the shift operations below need a real integer.
	$cbyte = (int) $cbyte;

	if ($cbyte < 0 || $cbyte >= 0x80000000) {
		// Cannot express a negative or 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

	if ($cbyte < 0x800) {
		return chr(0xC0 | ($cbyte >> 6))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x10000) {
		return chr(0xE0 | ($cbyte >> 12))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x200000) {
		return chr(0xF0 | ($cbyte >> 18))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	if ($cbyte < 0x4000000) {
		return chr(0xF8 | ($cbyte >> 24))
			. chr(0x80 | (($cbyte >> 18) & 0x3F))
			. chr(0x80 | (($cbyte >> 12) & 0x3F))
			. chr(0x80 | (($cbyte >> 6) & 0x3F))
			. chr(0x80 | ($cbyte & 0x3F));
	}

	return chr(0xFC | ($cbyte >> 30))
		. chr(0x80 | (($cbyte >> 24) & 0x3F))
		. chr(0x80 | (($cbyte >> 18) & 0x3F))
		. chr(0x80 | (($cbyte >> 12) & 0x3F))
		. chr(0x80 | (($cbyte >> 6) & 0x3F))
		. chr(0x80 | ($cbyte & 0x3F));
}
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 *
 * Only the first character of $str is evaluated. A single byte that is not
 * the first byte of a multibyte sequence is returned as its byte value.
 *
 * @param string UTF-8 multibyte character string
 * @param boolean If set, then a hex. number is returned (as string, prefixed with "x").
 * @return integer UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
	// First char
	$lead = ord(substr($str, 0, 1));

	if (($lead & 192) == 192) {
		// This verifies that it IS a multi byte string. Every high bit set
		// after the first one announces one more byte in the sequence.
		$trailBytes = 0;
		while ($trailBytes < 8 && (($lead << ($trailBytes + 1)) & 128)) {
			$trailBytes++;
		}

		// The first byte carries (6 - number of following bytes) value bits;
		// the invalid first bytes 0xFE / 0xFF carry none.
		$int = $lead & ((1 << max(0, 6 - $trailBytes)) - 1);

		// Every following byte contributes its lower 6 bits. A missing byte
		// (truncated sequence) counts as 0.
		for ($b = 1; $b <= $trailBytes; $b++) {
			$int = ($int << 6) | (ord(substr($str, $b, 1)) & 0x3F);
		}
	} else {
		$int = $lead;
	}

	return $hex ? 'x' . dechex($int) : $int;
}
1013 /********************************************
1017 ********************************************/
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two
 * sub-arrays: 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8
 * char => local char number). Only local char numbers above 127 are stored.
 * The parsed table is also written to a cache file below typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 */
function initCharset($charset) {
	// Only process if the charset is not yet loaded:
	if (!empty($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';

	// If the conversion table is not found:
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return FALSE;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');

	$table = FALSE;
	if ($cacheFile && @is_file($cacheFile)) {
		// NOTE(review): unserialize() must only ever see files written by
		// this method; make sure the cache folder is not writable by others.
		$table = unserialize(t3lib_div::getUrl($cacheFile));
	}

	// No cache file, or one that does not contain a table (e.g. truncated):
	// parse the conversion table and write the cache file again.
	if (!is_array($table)) {
		$table = array('local' => array(), 'utf8' => array());
		$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
		$detectedType = '';

		// Parse conversion table into lines and traverse them:
		$lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
		foreach ($lines as $value) {
			// Comment lines or blanks are ignored.
			if (!trim($value) || substr($value, 0, 1) == '#') {
				continue;
			}

			// Detect type if not done yet: (Done on first real line)
			// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
			if (!$detectedType) {
				$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
			}

			if ($detectedType == 'ms-token') {
				// array_pad() keeps list() from reading a missing offset on
				// lines without "=" or ":".
				list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
			} else {
				$regA = array();
				if (!preg_match($whitespacedPattern, $value, $regA)) {
					// Line does not follow the detected format: nothing to store.
					continue;
				}
				$hexbyte = $regA[1];
				$utf8 = 'U+' . $regA[2];
			}

			$decval = hexdec(trim($hexbyte));
			if ($decval > 127) {
				// Strip the leading "U+" and convert the code point to its UTF-8 char.
				$utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
				$table['local'][$decval] = $utf8Char;
				$table['utf8'][$utf8Char] = $decval;
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($table));
		}
	}

	$this->parsedCharsets[$charset] = $table;

	return 2;
}
/**
 * This function initializes all UTF-8 character data tables.
 *
 * Builds $this->caseFolding['utf-8'] (toUpper / toLower / toTitle maps) and
 * $this->toASCII['utf-8'] (ASCII transliteration map) from the files in
 * PATH_t3lib . 'unidata/' and caches both in typo3temp/cs/. Whenever the
 * requested table is neither loaded nor cached, both tables are rebuilt.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 */
function initUnicodeData($mode = NULL) {
    // cache files
    $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
                return 2;
            }
            break;

        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }

            // Use cached version if possible
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
                return 2;
            }
            break;
    }

    // process main Unicode data file
    $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
    if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return FALSE;
    }

    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return FALSE;
    }

    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();

    $decomposition = array(); // array of temp. decompositions
    $mark = array(); // array of chars that are marks (eg. composing accents)
    $number = array(); // array of chars that are numbers (eg. digits)
    $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

    while (!feof($fh)) {
        $line = fgets($fh, 4096);
        if ($line === FALSE) {
            break;
        }

        // Each record has 15 ";"-separated fields; only some are used here.
        $fields = explode(';', rtrim($line));
        if (count($fields) < 15) {
            // blank or malformed line
            continue;
        }
        $char = $fields[0];
        $name = $fields[1];
        $cat = $fields[2];
        $decomp = $fields[5];
        $num = $fields[8];
        $upper = $fields[12];
        $lower = $fields[13];
        $title = $fields[14];

        $ord = hexdec($char);
        if ($ord > 0xFFFF) {
            break;
        } // only process the BMP

        $utf8_char = $this->UnumberToChar($ord);

        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // store "title" only when different from "upper" (only a few)
        if ($title && $title !== $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }

        // First letter of the general category
        $categoryClass = ($cat !== '') ? $cat[0] : '';
        if ($categoryClass === 'M') {
            // mark (accent, umlaut, ...)
            $mark["U+$char"] = 1;
        } elseif ($categoryClass === 'N') {
            // numeric value
            if ($ord > 0x80 && $num != '') {
                $number["U+$char"] = $num;
            }
        }

        // accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') {
                $c += 32;
            }

            $decomposition["U+$char"] = array(dechex($c));
            continue;
        }

        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>': // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;

                case '<square>': // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;

                case '<compat>': // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2;
                    }
                    break;

                // ignore Arabic and vertical layout presentation decomposition
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    continue 2;
            }
            $decomposition["U+$char"] = explode(' ', $match[2]);
        }
    }
    fclose($fh);

    // process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
    if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                if ($line === FALSE) {
                    break;
                }
                if (trim($line) === '' || $line[0] == '#') {
                    continue;
                }

                $fields = t3lib_div::trimExplode(';', $line);
                if (count($fields) < 5) {
                    continue;
                }
                list($char, $lower, $title, $upper, $cond) = $fields;

                // Only unconditional mappings are used (condition empty or a comment).
                if ($cond == '' || $cond[0] == '#') {
                    $utf8_char = $this->UnumberToChar(hexdec($char));
                    if ($char !== $lower) {
                        $sequence = '';
                        foreach (explode(' ', $lower) as $hex) {
                            $sequence .= $this->UnumberToChar(hexdec($hex));
                        }
                        $utf8CaseFolding['toLower'][$utf8_char] = $sequence;
                    }
                    if ($char !== $title && $title !== $upper) {
                        $sequence = '';
                        foreach (explode(' ', $title) as $hex) {
                            $sequence .= $this->UnumberToChar(hexdec($hex));
                        }
                        $utf8CaseFolding['toTitle'][$utf8_char] = $sequence;
                    }
                    if ($char !== $upper) {
                        $sequence = '';
                        foreach (explode(' ', $upper) as $hex) {
                            $sequence .= $this->UnumberToChar(hexdec($hex));
                        }
                        $utf8CaseFolding['toUpper'][$utf8_char] = $sequence;
                    }
                }
            }
            fclose($fh);
        }
    }

    // process custom decompositions
    $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
    if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = fgets($fh, 4096);
                if ($line === FALSE) {
                    break;
                }
                if (trim($line) === '' || $line[0] == '#') {
                    continue;
                }

                $fields = t3lib_div::trimExplode(';', $line);
                $char = $fields[0];
                $translit = isset($fields[1]) ? $fields[1] : '';
                // An empty transliteration means "drop this character".
                if (!$translit) {
                    $omit["U+$char"] = 1;
                }
                $decomposition["U+$char"] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }

    // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();

        while ($code_value = array_shift($to)) {
            if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark["U+$code_value"])) { // remove mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }

    // create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii =& $this->toASCII['utf-8'];

    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                continue 2;
            } // skip decompositions containing non-ASCII chars
            array_push($code_decomp, chr($ord));
        }
        // Keys look like "U+00E4"; strip the "U+" prefix so hexdec() only
        // receives hexadecimal digits.
        $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
    }

    // add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }

    if ($cacheFileCase) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }

    if ($cacheFileASCII) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }

    return 3;
}
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of
 * the charset's conversion table is looked up there and the result is
 * converted back to the charset.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 */
function initCaseFolding($charset) {
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }

    // Use cached version if possible
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return FALSE;
    }

    $nochar = chr($this->noCharByteVal);
    $directions = array('toUpper', 'toLower', 'toTitle');
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);

        foreach ($directions as $direction) {
            // Most characters have no mapping in a given direction; skip them
            // instead of reading an undefined index.
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // Keep only results that are non-empty and differ from the "no character" byte.
            if ($cc != '' && $cc != $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }

    // add the ASCII case table
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }

    return 3;
}
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is derived from the UTF-8 transliteration table: every character
 * of the charset's conversion table that has a UTF-8 transliteration gets the
 * same ASCII replacement, keyed by its representation in the charset.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 */
function initToASCII($charset) {
    // Only process if the table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }

    // Use cached version if possible
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return FALSE;
    }

    // Start from an empty table so that a charset without any
    // transliterable character is still stored (and cached) as an array.
    $this->toASCII[$charset] = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }

    return 3;
}
1460 /********************************************
1462 * String operation functions
1464 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Uses mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise the
 * charset specific implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }

    $csUtils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
        : '';

    if ($csUtils == 'mbstring') {
        // cannot omit $len, when specifying charset
        if ($len == NULL) {
            $enc = mb_internal_encoding(); // save internal encoding
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            mb_internal_encoding($enc); // restore internal encoding

            return $str;
        }
        return mb_substr($string, $start, $len, $charset);
    }

    if ($csUtils == 'iconv') {
        // cannot omit $len, when specifying charset
        if ($len == NULL) {
            $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }

    // Fixed-width charsets: scale character positions to byte positions.
    // Everything else is treated as a single-byte encoding.
    $bytesPerChar = 1;
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerChar = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerChar = 4;
    }

    // An omitted length must stay omitted: NULL * 2 would be 0 and yield an
    // empty result instead of the rest of the string.
    if ($len === NULL) {
        return substr($string, $start * $bytesPerChar);
    }
    return substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
    // Delegate to a PHP extension when one is configured.
    switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
        case 'mbstring':
            return mb_strlen($string, $charset);
        case 'iconv':
            return iconv_strlen($string, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strlen($string, $charset);
    }
    if ($this->twoByteSets[$charset]) {
        return strlen($string) / 2;
    }
    if ($this->fourByteSets[$charset]) {
        return strlen($string) / 4;
    }

    // treat everything else as single-byte encoding
    return strlen($string);
}
/**
 * Method to crop strings using the mb_substr function.
 *
 * A positive $len keeps the first $len characters and appends $crop; a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * Strings that are not longer than abs($len) are returned unchanged.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    $nothingToCrop = (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len));
    if ($nothingToCrop) {
        return $string;
    }

    if ($len > 0) {
        return mb_substr($string, 0, $len, $charset) . $crop;
    }

    return $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
}
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop; a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * The string is returned unchanged when $len is 0 or reaches beyond it.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
    if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }

    if (intval($len) == 0) {
        return $string;
    }

    // Translate the character position into a byte position ($i).
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = (int) $len;
    } else {
        $i = strlen($string) + (int) $len;
        if ($i <= 0) {
            $i = FALSE;
        }
    }

    if ($i === FALSE) { // $len outside actual string length
        return $string;
    }

    $i = (int) $i;
    if ($len > 0) {
        // Only crop when there is something left after the cut position.
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i >= 1 && isset($string[$i - 1])) {
        // Only crop when there is something in front of the cut position.
        return $crop . substr($string, $i);
    }

    return $string;
}
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never splits a character: for multi-byte charsets the result may
 * be shorter than $len bytes.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strtrunc($string, $len, $charset);
    }

    if ($this->twoByteSets[$charset]) {
        // don't cut at odd positions
        $len -= $len % 2;
    } elseif ($this->fourByteSets[$charset]) {
        // realign to a position divisible by four
        $len -= $len % 4;
    }

    // treat everything else as single-byte encoding
    return substr($string, 0, $len);
}
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper")
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return ($case == 'toLower')
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * Languages are tried in order of descending quality ("q" value); languages
 * with the same quality keep the order in which the client listed them.
 *
 * @param string $languageCodesList List of language codes, something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *        see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
    $allLanguageCodes = array();
    $selectedLanguage = 'default';

    // get all languages where TYPO3 code is the same as the ISO code
    foreach ($this->charSetArray as $typo3Lang => $charSet) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }

    // get all languages where TYPO3 code differs from ISO code
    // or needs the country part
    // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->isoArray as $typo3Lang => $isoLang) {
        $allLanguageCodes[$typo3Lang] = join('-', explode('_', $isoLang));
    }

    // move the iso codes to the keys (because we're comparing the keys with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);

    // Collect the preferred languages with their quality. A language that is
    // listed twice keeps its first position but takes the last quality.
    $languages = array();
    $qualities = array();
    $positions = array();
    $indexByLanguage = array();
    foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== FALSE) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        if (isset($indexByLanguage[$preferredLanguage])) {
            $qualities[$indexByLanguage[$preferredLanguage]] = $quality;
        } else {
            $index = count($languages);
            $indexByLanguage[$preferredLanguage] = $index;
            $languages[] = $preferredLanguage;
            $qualities[] = $quality;
            $positions[] = $index;
        }
    }

    // Order by quality (highest first). The original position is the
    // tie-breaker, so the result does not depend on the stability of PHP's
    // sort implementation.
    if (count($languages) > 1) {
        array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);
    }

    // loop through the languages, with the highest priority first
    foreach ($languages as $preferredLanguage) {
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }

        // strip the country code from the end
        $languageParts = explode('-', $preferredLanguage);
        $preferredLanguage = $languageParts[0];
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }
    }

    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1788 /********************************************
1790 * Internal string operation functions
1792 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string; the input is returned unchanged when the mode is unknown or the table cannot be initialized
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            } // do nothing
            $map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
            break;

        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            } // do nothing
            $map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
            break;

        default:
            return $str;
    }

    $str = (string) $str;
    $out = '';
    $length = strlen($str);
    for ($i = 0; $i < $length; $i++) {
        $c = $str[$i];
        // characters without a mapping are copied unchanged
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }

    return $out;
}
1838 /********************************************
1840 * Internal UTF-8 string operation functions
1842 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters)
 * @return string The substring
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
    if (!strcmp($len, '0')) {
        return '';
    }

    $byte_start = $this->utf8_char2byte_pos($str, $start);
    if ($byte_start === FALSE && $start > 0) {
        return FALSE; // $start outside string length
    }

    // When $start is zero or negative and cannot be resolved, $byte_start is
    // FALSE here and the string is taken from its beginning.
    $remainder = substr($str, $byte_start);

    if ($len == NULL) {
        return $remainder;
    }

    $byte_end = $this->utf8_char2byte_pos($remainder, $len);
    if ($byte_end !== FALSE) {
        return substr($remainder, 0, $byte_end);
    }

    // $len outside actual string length:
    // when length is less than zero and exceeds, then we return blank string.
    return $len < 0 ? '' : $remainder;
}
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
    $str = (string) $str;
    $n = 0;
    $length = strlen($str);
    for ($i = 0; $i < $length; $i++) {
        // Count single bytes (0xxxxxxx) and multi-byte starting bytes
        // (11xxxxxx), i.e. every byte that is not a continuation byte (10xxxxxx).
        if ((ord($str[$i]) & 0xC0) != 0x80) {
            $n++;
        }
    }
    return $n;
}
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The cut never splits a multi-byte character: a character that does not
 * fit completely into $len bytes is left out.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
    $str = (string) $str;
    $i = (int) $len - 1;

    // Last byte inside the limit is missing or plain ASCII: cut right there.
    if ($i < 0 || !isset($str[$i]) || !(ord($str[$i]) & 0x80)) {
        return substr($str, 0, $len);
    }

    // Part of a multi-byte sequence: walk back to its first byte (11xxxxxx).
    while ($i > 0 && !(ord($str[$i]) & 0x40)) {
        $i--;
    }
    if (!(ord($str[$i]) & 0x40)) {
        // sanity check: only continuation bytes up to the start of the string
        return '';
    }

    // calculate number of bytes of the sequence from its first byte
    for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) {
        $bc++;
    }
    if ($bc + $i > $len) {
        return substr($str, 0, $i);
    }

    // multi-byte char fits into length
    return substr($str, 0, $len);
}
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search
 * @return integer The character position, or FALSE if the needle was not found or the offset lies beyond the string
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
    switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
        case 'mbstring':
            return mb_strpos($haystack, $needle, $offset, 'utf-8');
        case 'iconv':
            return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }

    // Search on bytes, then translate the byte position back to characters.
    $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
    if ($byte_offset === FALSE) {
        // offset beyond string length
        return FALSE;
    }

    $byte_pos = strpos($haystack, $needle, $byte_offset);
    if ($byte_pos === FALSE) {
        // needle not found
        return FALSE;
    }

    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, or FALSE if the needle was not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
    $csUtils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
        : '';

    if ($csUtils == 'mbstring') {
        // The third argument of mb_strrpos() is the offset; the encoding
        // has to be passed as the fourth one.
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    } elseif ($csUtils == 'iconv') {
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }

    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === FALSE) {
        return FALSE;
    } // needle not found

    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position, or FALSE if the position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
    $str = (string) $str;
    $length = strlen($str);
    $n = 0; // number of characters found
    $p = abs($pos); // number of characters wanted

    // scan forwards from the start or backwards from the end
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }

    // Explicit bounds: a negative string offset must end the scan (it does
    // not read as "empty" on PHP 7.1+, where it addresses bytes from the end).
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        // single bytes (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
        // count as a character, continuation bytes (10xxxxxx) do not
        if ((ord($str[$i]) & 0xC0) != 0x80) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $length) {
        return FALSE;
    } // offset beyond string length

    if ($pos >= 0) {
        // skip trailing multi-byte data bytes
        while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
            $i++;
        }
    } else {
        // correct offset
        $i++;
    }

    return $i;
}
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer Character position, or FALSE if the string is empty or the position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
    $str = (string) $str;
    $length = strlen($str);
    $pos = (int) $pos;

    // offset beyond string length (the position directly behind the last
    // byte is still accepted and yields the number of characters)
    if ($length === 0 || $pos < 0 || $pos > $length) {
        return FALSE;
    }

    $n = 0; // number of characters
    for ($i = $pos; $i > 0; $i--) {
        // Count every byte that starts a character: single bytes (0xxxxxxx)
        // and multi-byte starting bytes (11xxxxxx). The position behind the
        // last byte counts as the start of a (virtual) character.
        if ($i >= $length || (ord($str[$i]) & 0xC0) != 0x80) {
            $n++;
        }
    }

    return $n;
}
/**
 * Maps all characters of an UTF-8 string.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string; the input is returned unchanged when the mode is unknown or the data tables cannot be initialized
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
    if (!$this->initUnicodeData($mode)) {
        return $str;
    } // do nothing

    switch ($mode) {
        case 'case':
            $map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
            break;

        case 'ascii':
            $map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
            break;

        default:
            return $str;
    }

    $str = (string) $str;
    $out = '';
    $length = strlen($str);
    for ($i = 0; $i < $length; $i++) {
        $c = ord($str[$i]);
        if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
            // calculate number of bytes from the leading 1-bits
            for ($bc = 0; $c & 0x80; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single byte (0xxxxxxx). A stray continuation byte (10xxxxxx)
            // is passed through as it is.
            $mbc = $str[$i];
        }

        // characters without a mapping are copied unchanged
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }

    return $out;
}
2122 /********************************************
2124 * Internal EUC string operation functions
2126 * Extended Unix Code:
2127 * ASCII compatible 7bit single bytes chars
2128 * 8bit two byte chars
2130 * Shift-JIS is treated as a special case.
2132 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * A double-byte character that does not fit completely into $len bytes is
 * left out, so the result may be one byte shorter than $len.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len The byte length
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
    $str = (string) $str;
    $length = strlen($str);
    $sjis = ($charset == 'shift_jis');

    // Step through the string character by character up to the limit.
    for ($i = 0; $i < $length && $i < $len; $i++) {
        $c = ord($str[$i]);
        if ($sjis) {
            if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
                $i++;
            } // advance a double-byte char
        } elseif ($c >= 0x80) {
            $i++;
        } // advance a double-byte char
    }

    // Check the overshoot first: it also happens when the character that
    // straddles the limit is the last one of the string.
    if ($i > $len) {
        return substr($str, 0, $len - 1); // we ended on a first byte
    }
    if ($i >= $length) {
        return $str;
    } // string shorter than supplied length

    return substr($str, 0, $len);
}
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param integer $start Start position (character position)
 * @param string $charset The charset
 * @param integer $len Length (in characters)
 * @return string The substring, or FALSE if $start lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === FALSE) {
        // $start outside string length
        return FALSE;
    }

    $remainder = substr($str, $byte_start);
    if ($len == NULL) {
        return $remainder;
    }

    $byte_end = $this->euc_char2byte_pos($remainder, $len, $charset);
    // $len outside actual string length: return everything from $start on
    return ($byte_end === FALSE) ? $remainder : substr($remainder, 0, $byte_end);
}
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
    $str = (string) $str;
    $length = strlen($str);
    $sjis = ($charset == 'shift_jis');
    $n = 0;
    for ($i = 0; $i < $length; $i++) {
        $c = ord($str[$i]);
        if ($sjis) {
            if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
                $i++;
            } // advance a double-byte char
        } elseif ($c >= 0x80) {
            $i++;
        } // advance a double-byte char

        $n++;
    }

    return $n;
}
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * NOTE(review): the backward scan (negative $pos) classifies each byte by
 * its value alone; for charsets whose second bytes can fall into the
 * single-byte range (presumably Shift-JIS) this may be inexact - confirm.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return integer Byte position, or FALSE if the position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str, $pos, $charset) {
    $str = (string) $str;
    $length = strlen($str);
    $sjis = ($charset == 'shift_jis');
    $n = 0; // number of characters seen
    $p = abs($pos); // number of characters wanted

    // scan forwards from the start or backwards from the end
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }

    // Explicit bounds: a negative string offset must end the scan (it does
    // not read as "empty" on PHP 7.1+, where it addresses bytes from the end).
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        if ($sjis) {
            if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
                $i += $d;
            } // advance a double-byte char
        } elseif ($c >= 0x80) {
            $i += $d;
        } // advance a double-byte char

        $n++;
    }
    if ($i < 0 || $i >= $length) {
        return FALSE;
    } // offset beyond string length

    if ($pos < 0) {
        $i++;
    } // correct offset

    return $i;
}
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string; the input is returned unchanged when the mode is unknown or the table cannot be initialized
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            } // do nothing
            $map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
            break;

        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            } // do nothing
            $map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
            break;

        default:
            return $str;
    }

    $str = (string) $str;
    $length = strlen($str);
    $sjis = ($charset == 'shift_jis');
    $out = '';
    for ($i = 0; $i < $length; $i++) {
        $mbc = $str[$i];
        $c = ord($mbc);

        if ($sjis) {
            $isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
        } else {
            $isDoubleByte = ($c >= 0x80);
        }
        if ($isDoubleByte) { // a double-byte char
            $mbc = substr($str, $i, 2);
            $i++;
        }

        // characters without a mapping are copied unchanged
        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }

    return $out;
}
// XCLASS hook: when an extension file is registered for this script under
// the current TYPO3_MODE, include it (once).
if (defined('TYPO3_MODE')) {
    if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
        include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
    }
}