Merge branch 'm22_MDL-33053_AICC_flattened_TOC' of git://github.com/scara/moodle...
[moodle.git] / lib / typo3 / class.t3lib_cs.php
blob0bcbfe2f8d8e7eca32bf31d8e1c71de90b79b854
1 <?php
2 /***************************************************************
3 * Copyright notice
5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
6 * All rights reserved
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
24 /**
25 * Class for conversion between charsets.
27 * $Id$
29 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
30 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
32 /**
33 * [CLASS/FUNCTION INDEX of SCRIPT]
37 * 136: class t3lib_cs
38 * 488: function parse_charset($charset)
39 * 507: function get_locale_charset($locale)
41 * SECTION: Charset Conversion functions
42 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
43 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 617: function utf8_encode($str,$charset)
45 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
46 * 706: function utf8_to_entities($str)
47 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
48 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
49 * 823: function UnumberToChar($cbyte)
50 * 868: function utf8CharToUnumber($str,$hex=0)
52 * SECTION: Init functions
53 * 911: function initCharset($charset)
54 * 973: function initUnicodeData($mode=null)
55 * 1198: function initCaseFolding($charset)
56 * 1260: function initToASCII($charset)
58 * SECTION: String operation functions
59 * 1331: function substr($charset,$string,$start,$len=null)
60 * 1384: function strlen($charset,$string)
61 * 1414: function crop($charset,$string,$len,$crop='')
62 * 1467: function strtrunc($charset,$string,$len)
63 * 1501: function conv_case($charset,$string,$case)
64 * 1527: function specCharsToASCII($charset,$string)
66 * SECTION: Internal string operation functions
67 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
69 * SECTION: Internal UTF-8 string operation functions
70 * 1622: function utf8_substr($str,$start,$len=null)
71 * 1655: function utf8_strlen($str)
72 * 1676: function utf8_strtrunc($str,$len)
73 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
74 * 1723: function utf8_strrpos($haystack,$needle)
75 * 1745: function utf8_char2byte_pos($str,$pos)
76 * 1786: function utf8_byte2char_pos($str,$pos)
77 * 1809: function utf8_char_mapping($str,$mode,$opt='')
79 * SECTION: Internal EUC string operation functions
80 * 1885: function euc_strtrunc($str,$len,$charset)
81 * 1914: function euc_substr($str,$start,$charset,$len=null)
82 * 1939: function euc_strlen($str,$charset)
83 * 1966: function euc_char2byte_pos($str,$pos,$charset)
84 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
86 * TOTAL FUNCTIONS: 35
87 * (This index is automatically created/updated by the extension "extdeveval")
92 /**
93 * Notes on UTF-8
95 * Functions working on UTF-8 strings:
97 * - strchr/strstr
98 * - strrchr
99 * - substr_count
100 * - implode/explode/join
102 * Functions nearly working on UTF-8 strings:
104 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
105 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
106 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
107 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
108 * - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; patterns need the "u" modifier
110 * Functions NOT working on UTF-8 strings:
112 * - str*cmp
113 * - stristr
114 * - stripos
115 * - substr
116 * - strrev
117 * - split/spliti
118 * - ...
122 * Class for conversion between charsets
124 * @author Kasper Skårhøj <kasperYYYY@typo3.com>
125 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
126 * @package TYPO3
127 * @subpackage t3lib
129 class t3lib_cs {
130 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
132 // This is the array where parsed conversion tables are stored (cached)
133 var $parsedCharsets = array();
135 // An array where case folding data will be stored (cached)
136 var $caseFolding = array();
138 // An array where charset-to-ASCII mappings are stored (cached)
139 var $toASCII = array();
141 // This tells the converter which charsets have two bytes per char:
142 var $twoByteSets = array(
143 'ucs-2' => 1, // 2-byte Unicode
146 // This tells the converter which charsets have four bytes per char:
147 var $fourByteSets = array(
148 'ucs-4' => 1, // 4-byte Unicode
149 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
152 // This tells the converter which charsets use a scheme like the Extended Unix Code:
153 var $eucBasedSets = array(
154 'gb2312' => 1, // Chinese, simplified.
155 'big5' => 1, // Chinese, traditional.
156 'euc-kr' => 1, // Korean
157 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
160 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
161 // http://czyborra.com/charsets/iso8859.html
162 var $synonyms = array(
163 'us' => 'ascii',
164 'us-ascii' => 'ascii',
165 'cp819' => 'iso-8859-1',
166 'ibm819' => 'iso-8859-1',
167 'iso-ir-100' => 'iso-8859-1',
168 'iso-ir-101' => 'iso-8859-2',
169 'iso-ir-109' => 'iso-8859-3',
170 'iso-ir-110' => 'iso-8859-4',
171 'iso-ir-144' => 'iso-8859-5',
172 'iso-ir-127' => 'iso-8859-6',
173 'iso-ir-126' => 'iso-8859-7',
174 'iso-ir-138' => 'iso-8859-8',
175 'iso-ir-148' => 'iso-8859-9',
176 'iso-ir-157' => 'iso-8859-10',
177 'iso-ir-179' => 'iso-8859-13',
178 'iso-ir-199' => 'iso-8859-14',
179 'iso-ir-203' => 'iso-8859-15',
180 'csisolatin1' => 'iso-8859-1',
181 'csisolatin2' => 'iso-8859-2',
182 'csisolatin3' => 'iso-8859-3',
183 'csisolatin5' => 'iso-8859-9',
184 'csisolatin8' => 'iso-8859-14',
185 'csisolatin9' => 'iso-8859-15',
186 'csisolatingreek' => 'iso-8859-7',
187 'iso-celtic' => 'iso-8859-14',
188 'latin1' => 'iso-8859-1',
189 'latin2' => 'iso-8859-2',
190 'latin3' => 'iso-8859-3',
191 'latin5' => 'iso-8859-9',
192 'latin6' => 'iso-8859-10',
193 'latin8' => 'iso-8859-14',
194 'latin9' => 'iso-8859-15',
195 'l1' => 'iso-8859-1',
196 'l2' => 'iso-8859-2',
197 'l3' => 'iso-8859-3',
198 'l5' => 'iso-8859-9',
199 'l6' => 'iso-8859-10',
200 'l8' => 'iso-8859-14',
201 'l9' => 'iso-8859-15',
202 'cyrillic' => 'iso-8859-5',
203 'arabic' => 'iso-8859-6',
204 'tis-620' => 'iso-8859-11',
205 'win874' => 'windows-874',
206 'win1250' => 'windows-1250',
207 'win1251' => 'windows-1251',
208 'win1252' => 'windows-1252',
209 'win1253' => 'windows-1253',
210 'win1254' => 'windows-1254',
211 'win1255' => 'windows-1255',
212 'win1256' => 'windows-1256',
213 'win1257' => 'windows-1257',
214 'win1258' => 'windows-1258',
215 'cp1250' => 'windows-1250',
216 'cp1251' => 'windows-1251',
217 'cp1252' => 'windows-1252',
218 'ms-ee' => 'windows-1250',
219 'ms-ansi' => 'windows-1252',
220 'ms-greek' => 'windows-1253',
221 'ms-turk' => 'windows-1254',
222 'winbaltrim' => 'windows-1257',
223 'koi-8ru' => 'koi-8r',
224 'koi8r' => 'koi-8r',
225 'cp878' => 'koi-8r',
226 'mac' => 'macroman',
227 'macintosh' => 'macroman',
228 'euc-cn' => 'gb2312',
229 'x-euc-cn' => 'gb2312',
230 'euccn' => 'gb2312',
231 'cp936' => 'gb2312',
232 'big-5' => 'big5',
233 'cp950' => 'big5',
234 'eucjp' => 'euc-jp',
235 'sjis' => 'shift_jis',
236 'shift-jis' => 'shift_jis',
237 'cp932' => 'shift_jis',
238 'cp949' => 'euc-kr',
239 'utf7' => 'utf-7',
240 'utf8' => 'utf-8',
241 'utf16' => 'utf-16',
242 'utf32' => 'utf-32',
243 'utf8' => 'utf-8',
244 'ucs2' => 'ucs-2',
245 'ucs4' => 'ucs-4',
/**
 * Mapping of language identifiers to script (language family) names.
 *
 * The keys are lowercase language identifiers in three flavours (ISO 639-1
 * codes, MS Windows language codes, English language names). The values are
 * script names which get_locale_charset() uses as keys into
 * $script_to_charset_unix / $script_to_charset_windows.
 */
var $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'el' => 'greek', // Greek (the actual ISO 639-1 code)
	'gr' => 'greek', // Greek (not an ISO 639-1 code; kept for backwards compatibility)
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'tr' => 'turkish', // Turkish
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	'bulgarian' => 'cyrillic', // consistent with 'bg' and 'bgr' above
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'thai' => 'thai',
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
409 // mapping of language (family) names to charsets on Unix
410 var $script_to_charset_unix = array(
411 'west_european' => 'iso-8859-1',
412 'estonian' => 'iso-8859-1',
413 'east_european' => 'iso-8859-2',
414 'baltic' => 'iso-8859-4',
415 'cyrillic' => 'iso-8859-5',
416 'arabic' => 'iso-8859-6',
417 'greek' => 'iso-8859-7',
418 'hebrew' => 'iso-8859-8',
419 'turkish' => 'iso-8859-9',
420 'thai' => 'iso-8859-11', // = TIS-620
421 'lithuanian' => 'iso-8859-13',
422 'chinese' => 'gb2312', // = euc-cn
423 'japanese' => 'euc-jp',
424 'korean' => 'euc-kr',
425 'simpl_chinese' => 'gb2312',
426 'trad_chinese' => 'big5',
427 'vietnamese' => '',
428 'unicode' => 'utf-8',
429 'albanian' => 'utf-8'
432 // mapping of language (family) names to charsets on Windows
433 var $script_to_charset_windows = array(
434 'east_european' => 'windows-1250',
435 'cyrillic' => 'windows-1251',
436 'west_european' => 'windows-1252',
437 'greek' => 'windows-1253',
438 'turkish' => 'windows-1254',
439 'hebrew' => 'windows-1255',
440 'arabic' => 'windows-1256',
441 'baltic' => 'windows-1257',
442 'estonian' => 'windows-1257',
443 'lithuanian' => 'windows-1257',
444 'vietnamese' => 'windows-1258',
445 'thai' => 'cp874',
446 'korean' => 'cp949',
447 'chinese' => 'gb2312',
448 'japanese' => 'shift_jis',
449 'simpl_chinese' => 'gb2312',
450 'trad_chinese' => 'big5',
451 'albanian' => 'windows-1250',
452 'unicode' => 'utf-8'
/**
 * Mapping of complete locale names to charsets.
 *
 * get_locale_charset() lowercases the locale before looking it up here, so
 * every key must be written in lowercase to be reachable.
 */
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2', // Serbian in Latin script
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
466 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
467 // Empty values means "iso-8859-1"
468 var $charSetArray = array(
469 'dk' => '',
470 'de' => '',
471 'no' => '',
472 'it' => '',
473 'fr' => '',
474 'es' => '',
475 'nl' => '',
476 'cz' => 'windows-1250',
477 'pl' => 'iso-8859-2',
478 'si' => 'windows-1250',
479 'fi' => '',
480 'tr' => 'iso-8859-9',
481 'se' => '',
482 'pt' => '',
483 'ru' => 'windows-1251',
484 'ro' => 'iso-8859-2',
485 'ch' => 'gb2312',
486 'sk' => 'windows-1250',
487 'lt' => 'windows-1257',
488 'is' => 'utf-8',
489 'hr' => 'windows-1250',
490 'hu' => 'iso-8859-2',
491 'gl' => '',
492 'th' => 'iso-8859-11',
493 'gr' => 'iso-8859-7',
494 'hk' => 'big5',
495 'eu' => '',
496 'bg' => 'windows-1251',
497 'br' => '',
498 'et' => 'iso-8859-4',
499 'ar' => 'iso-8859-6',
500 'he' => 'utf-8',
501 'ua' => 'windows-1251',
502 'jp' => 'shift_jis',
503 'lv' => 'utf-8',
504 'vn' => 'utf-8',
505 'ca' => 'iso-8859-15',
506 'ba' => 'iso-8859-2',
507 'kr' => 'euc-kr',
508 'eo' => 'utf-8',
509 'my' => '',
510 'hi' => 'utf-8',
511 'fo' => 'utf-8',
512 'fa' => 'utf-8',
513 'sr' => 'utf-8',
514 'sq' => 'utf-8',
515 'ge' => 'utf-8',
516 'ga' => '',
517 'km' => 'utf-8',
518 'qc' => '',
521 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
522 // Missing keys means: same as Typo3
523 var $isoArray = array(
524 'ba' => 'bs',
525 'br' => 'pt_BR',
526 'ch' => 'zh_CN',
527 'cz' => 'cs',
528 'dk' => 'da',
529 'si' => 'sl',
530 'se' => 'sv',
531 'gl' => 'kl',
532 'gr' => 'el',
533 'hk' => 'zh_HK',
534 'kr' => 'ko',
535 'ua' => 'uk',
536 'jp' => 'ja',
537 'qc' => 'fr_CA',
538 'vn' => 'vi',
539 'ge' => 'ka',
540 'ga' => 'gl',
/**
 * Normalizes a charset name: lowercases it, trims surrounding whitespace and
 * resolves it through the $synonyms table when an alias is registered.
 *
 * @param string Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
/**
 * Get the charset of a locale.
 *
 * ln            language
 * ln_CN         language / country
 * ln_CN.cs      language / country / charset
 * ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order: exact match in $locale_to_charset, explicit charset part,
 * the "euro" modifier, then the language's script looked up in the charset
 * table of the current operating system. Unknown languages fall back to
 * 'windows-1252' (Windows) or 'iso-8859-1' (everything else).
 *
 * @param string Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// get modifier; array_pad() guarantees the second element exists when there is no '@'
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

		// locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// get language (the country part is not needed for the lookup)
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

		// Empty table values (e.g. 'vietnamese' on Unix) deliberately fall through to the default
	if (defined('TYPO3_OS') && TYPO3_OS == 'WIN') {
		return !empty($this->script_to_charset_windows[$script])
			? $this->script_to_charset_windows[$script]
			: 'windows-1252';
	}

	return !empty($this->script_to_charset_unix[$script])
		? $this->script_to_charset_unix[$script]
		: 'iso-8859-1';
}
609 /********************************************
611 * Charset Conversion functions
613 ********************************************/
/**
 * Convert from one charset to another charset.
 *
 * When $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] names one of
 * 'mbstring', 'iconv' or 'recode' and that extension is available, it is tried
 * first. If it is not configured, not installed, or fails, the conversion
 * falls back to the table based implementation (utf8_encode()/utf8_decode()).
 *
 * @param string Input string
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	if ($fromCS == $toCS) {
		return $str;
	}

		// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS == 'utf-8' || !$useEntityForNoChar) {
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
			: '';
		$conv_str = FALSE;

		switch ($convMethod) {
			case 'mbstring':
				if (function_exists('mb_convert_encoding')) {
					try {
						$conv_str = mb_convert_encoding($str, $toCS, $fromCS);
					} catch (ValueError $e) {
							// PHP 8 throws for unsupported charsets instead of returning FALSE
						$conv_str = FALSE;
					}
				}
				break;

			case 'iconv':
				if (function_exists('iconv')) {
					$conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
				}
				break;

			case 'recode':
				if (function_exists('recode_string')) {
					$conv_str = recode_string($fromCS . '..' . $toCS, $str);
				}
				break;
		}

		if (FALSE !== $conv_str) {
			return $conv_str;
		}
			// fallback to TYPO3 conversion
	}

	if ($fromCS != 'utf-8') {
		$str = $this->utf8_encode($str, $fromCS);
	}
	if ($toCS != 'utf-8') {
		$str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
	}
	return $str;
}
/**
 * Converts every string element of an array from one charset to another.
 * Nested arrays are processed recursively; values of any other type are left untouched.
 * NOTICE: Array is passed by reference!
 *
 * @param array Input array, possibly multidimensional
 * @param string From charset (the current charset of the string)
 * @param string To charset (the output charset wanted)
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $index => $entry) {
		if (is_string($entry)) {
			$array[$index] = $this->conv($entry, $fromCS, $toCS, $useEntityForNoChar);
		} elseif (is_array($entry)) {
				// Recurse on the original slot (not the copy) so the changes stick
			$this->convArray($array[$index], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
/**
 * Converts $str from $charset to UTF-8
 *
 * Bytes are mapped through the parsed conversion table of $charset. Characters
 * without a table entry are replaced by chr($this->noCharByteVal).
 * Nothing is returned (NULL) when initCharset() reports failure for $charset.
 *
 * @param string String in local charset to convert to UTF-8
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8
 */
function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}

		// Charset is case-insensitive.
	if ($this->initCharset($charset)) { // Parse conv. table if not already...
		$strLen = strlen($str);
		$outStr = '';
			// The charset class does not change while looping, so look it up once
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);

		for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
			$chr = substr($str, $a, 1);
			$ord = ord($chr);
			if ($isTwoByteSet) { // If the charset has two bytes per char
					// substr() instead of the $str{...} offset syntax, which PHP 8 no longer parses;
					// a missing second byte (odd length input) counts as 0
				$ord2 = ord(substr($str, $a + 1, 1));
				$ord = $ord << 8 | $ord2; // assume big endian

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the replacement char
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal); // No char exists
				}
				$a++;
			} elseif ($ord > 127) { // A byte above 127 needs translation through the conversion table
				if ($isEucBasedSet) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
					if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
						$a++;
						$ord2 = ord(substr($str, $a, 1));
						$ord = $ord * 256 + $ord2;
					}
				}

				if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the replacement char
					$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
				} else {
					$outStr .= chr($this->noCharByteVal); // No char exists
				}
			} else {
				$outStr .= $chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
		}
		return $outStr;
	}
}
/**
 * Converts $str from UTF-8 to $charset
 *
 * ASCII bytes are copied verbatim. Each multibyte sequence is looked up in the
 * parsed conversion table of $charset. Sequences without an entry become either
 * a numeric entity or chr($this->noCharByteVal), depending on $useEntityForNoChar.
 * Nothing is returned (NULL) when initCharset() reports failure for $charset.
 *
 * @param string String in UTF-8 to convert to local charset
 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}

		// Charset is case-insensitive. Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$length = strlen($str);
	$result = '';

	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

			// Plain ASCII: one byte, passed through unchanged
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A lead byte has bit 7 set (value 64); without it we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further high bit of the lead byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code <<= 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
			$local = $this->parsedCharsets[$charset]['utf8'][$sequence]; // The local number
				// Numbers above 255 are written as two bytes (16bit word assumed), high byte first
			$result .= ($local > 255)
				? chr(($local >> 8) & 255) . chr($local & 255)
				: chr($local);
		} elseif ($useEntityForNoChar) {
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		} else {
			$result .= chr($this->noCharByteVal); // No char exists
		}
	}

	return $result;
}
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each multibyte sequence becomes a hexadecimal numeric entity. A byte above
 * 127 that cannot start a sequence is replaced by chr($this->noCharByteVal).
 *
 * @param string Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$length = strlen($str);
	$result = '';

	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

			// Plain ASCII: one byte, passed through unchanged
		if ($code <= 127) {
			$result .= $byte;
			continue;
		}

			// A lead byte has bit 7 set (value 64); without it we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$result .= chr($this->noCharByteVal);
			continue;
		}

			// Collect the sequence: every further high bit of the lead byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code <<= 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
	}

	return $result;
}
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 *
 * Anything that looks like an entity but is neither a well-formed numeric
 * entity nor (with $alsoStdHtmlEnt) a known named entity is left unchanged.
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// Ask for the table in UTF-8 explicitly: the default encoding depends on the PHP
			// version, and UTF-8 values can be inserted into the result without re-encoding.
		$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
	}

		// With the delimiter captured, even indexes hold plain text and odd indexes
		// hold the entity body (the text between "&" and ";").
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

	foreach ($parts as $k => $v) {
		if (!($k % 2)) {
			continue;
		}

		$match = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) { // Hex entity
			$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) { // Dec entity
			$parts[$k] = $this->UnumberToChar((int) $match[1]);
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities
			$parts[$k] = $trans_tbl['&' . $v . ';'];
		} else { // No conversion: restore the original text
			$parts[$k] = '&' . $v . ';';
		}
	}

	return implode('', $parts);
}
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * A byte above 127 that cannot start a multibyte sequence is reported as
 * $this->noCharByteVal (or the corresponding character when $retChar is set).
 *
 * @param string Input string, UTF-8
 * @param boolean If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
		// If entities must be registered as well, resolve them first
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

	$length = strlen($str);
	$numbers = array();

	for ($pos = 0; $pos < $length; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

			// Plain ASCII: one byte
		if ($code <= 127) {
			$numbers[] = $retChar ? chr($code) : $code;
			continue;
		}

			// A lead byte has bit 7 set (value 64); without it we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Collect the sequence: every further high bit of the lead byte announces one more byte
		$sequence = $byte;
		for ($n = 0; $n < 8; $n++) {
			$code <<= 1;
			if (!($code & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}

		$numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $numbers;
}
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that do not fit into 31 bits yield chr($this->noCharByteVal).
 *
 * @param integer UNICODE integer
 * @return string UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Pick the lead byte marker and the number of continuation bytes from the table above
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$trailing = 5;
	} else { // Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// Lead byte carries the topmost bits, each continuation byte the next 6 bits
	$char = chr($leadMarker | ($cbyte >> (6 * $trailing)));
	for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
		$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $char;
}
/**
 * Decodes a single UTF-8 multibyte character into its UNICODE number.
 *
 * The payload bits of the lead byte and of each continuation byte are
 * collected as a string of binary digits and converted with bindec().
 *
 * @param	string		UTF-8 multibyte character string (only the first character is evaluated)
 * @param	boolean		If set, the number is returned in hexadecimal notation, prefixed with "x".
 * @return	mixed		UNICODE integer, or a string like "x20ac" if $hex is set
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
	$leadByte = ord(substr($str, 0, 1));

	if (($leadByte & 192) != 192) {
			// Not the start of a multibyte sequence: the byte value is the number
		$codePoint = $leadByte;
	} else {
		$payloadBits = '';
		$shifted = $leadByte;
		$trailing = 0;
			// Every additional high bit of the lead byte announces one continuation byte
		while ($trailing < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
				// Keep the lower six bits of the continuation byte
			$payloadBits .= substr('00000000' . decbin(ord(substr($str, $trailing + 1, 1))), -6);
			$trailing++;
		}
			// The lead byte contributes whatever is left below its length marker
		$payloadBits = substr('00000000' . decbin($leadByte), -(6 - $trailing)) . $payloadBits;
		$codePoint = bindec($payloadBits);
	}

	if ($hex) {
		return 'x' . dechex($codePoint);
	}
	return $codePoint;
}
1013 /********************************************
1015 * Init functions
1017 ********************************************/
/**
 * Initializes a charset for use if a conversion table for it exists in the PATH_t3lib.'csconvtbl/' folder.
 * This function is automatically called by the conversion functions.
 *
 * The parsed table is kept in $this->parsedCharsets[$charset] with two maps:
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value).
 * Only byte values above 127 are stored.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param	string		The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return	integer		Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the table was loaded (from the cache file or by parsing).
 * @access private
 */
function initCharset($charset) {
		// Only process if the charset is not yet loaded:
	if (!empty($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
		return FALSE;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
			// NOTE(review): unserialize() is only acceptable here because the cache file is written by this class
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached) && isset($cached['local']) && isset($cached['utf8'])) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// Unreadable or corrupt cache: fall through and rebuild it from the table
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';
	foreach ($lines as $value) {
			// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value, 0, 1) == '#') {
			continue;
		}

			// Detect type if not done yet: (Done on first real line)
			// The "whitespaced" type is on the syntax 	"0x0A	0x000A	#LINE FEED" 	while 	"ms-token" is like 		"B9 = U+00B9 : SUPERSCRIPT ONE"
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType == 'ms-token') {
				// Pad so that a line without separator does not leave $utf8 undefined
			list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern, $value, $regA)) {
					// Line does not follow the detected syntax
				continue;
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval > 127) {
				// Strip the leading "U+" before converting the code point
			$utf8decval = hexdec(substr(trim($utf8), 2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
/**
 * This function initializes all UTF-8 character data tables.
 *
 * It reads unidata/UnicodeData.txt (BMP only) and, if present,
 * unidata/SpecialCasing.txt and unidata/Translit.txt below PATH_t3lib, fills
 * $this->caseFolding['utf-8'] ('toUpper', 'toLower', 'toTitle') and
 * $this->toASCII['utf-8'], and writes both tables to cache files in typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...)
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
				return 1;
			}

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
				return 1;
			}

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === FALSE) {
				// nothing more to read
			break;
		}
		if (trim($line) === '') {
				// a blank line carries no record
			continue;
		}

			// has a lot of info; pad so that short records do not leave fields undefined
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title) = $fields;

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			break;
		} // only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

			// first letter of the general category; substr() is safe on an empty field
		switch (substr($cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') {
					$number["U+$char"] = $num;
				}
				break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				$c += 32;
			}

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
					// skip blank lines and comment lines
				if (trim($line) == '' || $line[0] == '#') {
					continue;
				}

				list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// only unconditional mappings (condition field empty or just a trailing comment)
				if ($cond == '' || $cond[0] == '#') {
					$utf8_char = $this->UnumberToChar(hexdec($char));
					if ($char != $lower) {
						$arr = explode(' ', $lower);
						foreach ($arr as $k => $codePoint) {
							$arr[$k] = $this->UnumberToChar(hexdec($codePoint));
						}
						$utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
					}
					if ($char != $title && $title != $upper) {
						$arr = explode(' ', $title);
						foreach ($arr as $k => $codePoint) {
							$arr[$k] = $this->UnumberToChar(hexdec($codePoint));
						}
						$utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
					}
					if ($char != $upper) {
						$arr = explode(' ', $upper);
						foreach ($arr as $k => $codePoint) {
							$arr[$k] = $this->UnumberToChar(hexdec($codePoint));
						}
						$utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
					}
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				if (trim($line) == '' || $line[0] == '#') {
					continue;
				}

				list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit) {
					$omit["U+$char"] = 1;
				}
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;
			} // skip decompositions containing non-ASCII chars
			array_push($code_decomp, chr($ord));
		}
			// keys look like "U+00C0": strip the prefix before converting
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
	}

		// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
/**
 * Builds the case folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 folding table: every character of the
 * charset's conversion table is looked up there and the result is converted
 * back into the charset. The ASCII letters a-z / A-Z are added afterwards.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
		// Table already in memory
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

		// Table available from the cache file
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's conversion table and the UTF-8 folding table are required
	if (!$this->initCharset($charset)) {
		return FALSE;
	}
	if (!$this->initUnicodeData('case')) {
		return FALSE;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
				// keep only mappings that can be expressed in the charset
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

		// add the ASCII case table
	$first = ord('a');
	$last = ord('z');
	for ($code = $first; $code <= $last; $code++) {
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code - 32);
	}
	$first = ord('A');
	$last = ord('Z');
	for ($code = $first; $code <= $last; $code++) {
		$this->caseFolding[$charset]['toLower'][chr($code)] = chr($code + 32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
/**
 * Builds the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * Every character of the charset's conversion table that has an entry in the
 * UTF-8 to-ASCII table gets that entry, keyed by the character in the charset.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
		// Table already in memory
	if (is_array($this->toASCII[$charset])) {
		return 1;
	}

		// Table available from the cache file
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's conversion table and the UTF-8/ASCII transliteration table are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii')) {
		return FALSE;
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (!isset($this->toASCII['utf-8'][$utf8])) {
			continue;
		}
		$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
	}

	return 3;
}
1460 /********************************************
1462 * String operation functions
1464 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is
 * delegated to mbstring or iconv; otherwise the charset decides between the
 * UTF-8, EUC, fixed two-/four-byte and single-byte implementations.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL (default) means "up to the end of the string"
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		if ($len == NULL) {
				// No length given: pass the full character count instead of
				// temporarily switching the global internal encoding
			return mb_substr($string, $start, mb_strlen($string, $charset), $charset);
		}
		return mb_substr($string, $start, $len, $charset);
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		if ($len == NULL) {
			$total = iconv_strlen($string, $charset);
			if ($total === FALSE) {
					// string is not valid in $charset
				return FALSE;
			}
			return iconv_substr($string, $start, $total, $charset);
		}
		return iconv_substr($string, $start, $len, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string, $start, $charset, $len);
	} elseif (!empty($this->twoByteSets[$charset])) {
			// NULL * 2 would be 0 and yield an empty string, so the open-ended case is handled separately
		return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
	}

		// treat everything else as single-byte encoding
	return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string, $charset);
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strlen($string, $charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string, $charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
			// count complete characters only, so the result stays an integer for odd byte lengths
		return (int) (strlen($string) / 2);
	} elseif (!empty($this->fourByteSets[$charset])) {
		return (int) (strlen($string) / 4);
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
/**
 * Crops a string with the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * The string is returned unchanged if $len is zero or the string is not
 * longer than abs($len) characters.
 *
 * @param	string		The character set
 * @param	string		String to be cropped
 * @param	integer		Crop length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	if (intval($len) === 0) {
		return $string;
	}

	$charCount = mb_strlen($string, $charset);
	if ($charCount <= abs($len)) {
			// nothing to cut off
		return $string;
	}

	return $len > 0
		? mb_substr($string, 0, $len, $charset) . $crop
		: $crop . mb_substr($string, $len, $charCount, $charset);
}
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the beginning of the string and appends $crop, a
 * negative $len keeps the end of the string and prepends $crop. The string
 * is returned unchanged if $len is zero or nothing has to be cut off.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

		// Translate the character position into a byte position
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string) + $len;
		if ($i <= 0) {
			$i = FALSE;
		}
	}

	if ($i === FALSE) { // $len outside actual string length
		return $string;
	}

	$i = (int) $i;
	if ($len > 0) {
			// only crop if there is something behind the cut position
		if (isset($string[$i])) {
			return substr($string, 0, $i) . $crop;
		}
	} elseif ($i > 0 && isset($string[$i - 1])) {
			// only crop if there is something in front of the cut position;
			// the $i > 0 guard keeps offset -1 from addressing the last byte
		return $crop . substr($string, $i);
	}

	return $string;
}
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string (empty if the length is zero or negative)
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
	if ($len <= 0) {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strtrunc($string, $len, $charset);
	}

	if ($this->twoByteSets[$charset]) {
			// don't cut at odd positions
		$len -= $len % 2;
	} elseif ($this->fourByteSets[$charset]) {
			// realign to a position divisible by four
		$len -= $len % 4;
	}

		// everything else is treated as single-byte encoding
	return substr($string, 0, $len);
}
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $case == 'toLower'
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

		// everything else is treated as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset, $string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}

	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

		// everything else is treated as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * The candidates are tried in descending order of their quality value; for each
 * one the full code is looked up first, then the code without its country part.
 *
 * @param	string		$languageCodesList	List of language codes. Something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *						see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return	string		A preferred language that TYPO3 supports, or "default" if none found
 * @author	Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

		// get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

		// get all languages where TYPO3 code differs from ISO code
		// or needs the country part
		// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->isoArray as $typo3Lang => $isoLang) {
		$isoLang = join('-', explode('_', $isoLang));
		$allLanguageCodes[$typo3Lang] = $isoLang;
	}

		// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

	$preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
		// map each preferred language to its quality value (1.0 if none is given)
	$sortedPreferredLanguages = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$sortedPreferredLanguages[$preferredLanguage] = $quality;
	}

		// loop through the languages, with the highest priority first
	arsort($sortedPreferredLanguages, SORT_NUMERIC);
	foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

			// strip the country code from the end; codes without a country part
			// simply yield themselves, so no second element is expected
		$codeParts = explode('-', (string) $preferredLanguage);
		$languageOnly = $codeParts[0];
		if (isset($allLanguageCodes[$languageOnly])) {
			$selectedLanguage = $allLanguageCodes[$languageOnly];
			break;
		}
	}

	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1788 /********************************************
1790 * Internal string operation functions
1792 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte that has an entry in the selected mapping table is replaced by
 * that entry; all other bytes are copied unchanged. The string is returned
 * untouched if the table cannot be initialized or the mode is unknown.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			} // do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			} // do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$out = '';
		// walk the string byte by byte, bounded by its length
	$byteCount = strlen($str);
	for ($i = 0; $i < $byteCount; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1838 /********************************************
1840 * Internal UTF-8 string operation functions
1842 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL (default) means "up to the end of the string"
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
		// A length of zero (integer or string) always yields an empty string
	if ($len !== NULL && !strcmp((string) $len, '0')) {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
			// a negative start beyond the beginning means "from the beginning"
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len != NULL) {
		$byte_end = $this->utf8_char2byte_pos($str, $len);
		if ($byte_end === FALSE) { // $len outside actual string length
				// When length is less than zero and exceeds, then we return blank string.
			return $len < 0 ? '' : $str;
		}
		return substr($str, 0, $byte_end);
	}

	return $str;
}
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Single bytes (0xxxxxxx) and multibyte start bytes (11xxxxxx) are counted;
 * continuation bytes (10xxxxxx) are skipped.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$n = 0;
	$byteCount = strlen($str);
	for ($i = 0; $i < $byteCount; $i++) {
		$c = ord($str[$i]);
		if (!($c & 0x80)) { // single-byte (0xxxxxx)
			$n++;
		} elseif (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}
	return $n;
}
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * If the cut position falls inside a multibyte character, the string is cut
 * in front of that character, so the result never ends with a partial sequence
 * that was started inside the given length.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
	if ($len <= 0) {
		return '';
	}
	if ($len > strlen($str)) {
			// nothing to cut off
		return $str;
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
			// walk back to the first byte of the sequence
		while ($i > 0 && !(ord($str[$i]) & 0x40)) {
			$i--;
		}
		$lead = ord($str[$i]);
			// sanity check: reached the start of the string without finding a lead byte
		if ($i <= 0 && ($lead & 0xC0) != 0xC0) {
			return '';
		}
			// the number of leading one-bits of the lead byte is the sequence length
		for ($bc = 0; $lead & 0x80; $lead = $lead << 1) {
			$bc++;
		}
		if ($bc + $i > $len) {
				// character does not fit completely: cut in front of it
			return substr($str, 0, $i);
		}
			// fallthru: multibyte char fits into length
	}
	return substr($str, 0, $len);
}
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position (in characters) to start the search
 * @return	integer		The character position; FALSE if the offset lies beyond the string or the needle is not found
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']) {
		case 'mbstring':
			return mb_strpos($haystack, $needle, $offset, 'utf-8');
		case 'iconv':
			return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

		// Search on byte level, starting at the byte position of the character offset
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte === FALSE) {
			// offset beyond string length
		return FALSE;
	}

	$foundByte = strpos($haystack, $needle, $startByte);
	if ($foundByte === FALSE) {
			// needle not found
		return FALSE;
	}

		// Translate the byte position back into a character position
	return $this->utf8_byte2char_pos($haystack, $foundByte);
}
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 * Delegates to mb_strrpos() or iconv_strrpos() when that backend is configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']; otherwise the search is
 * done on bytes and the result is translated back to a character position.
 *
 * @param string UTF-8 string to search in
 * @param string UTF-8 character to search for (single character)
 * @return integer The character position (FALSE if the needle is not found)
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// The third argument of mb_strrpos() is the offset; passing the encoding there
		// is rejected by PHP 8, so the offset is given explicitly.
		return mb_strrpos($haystack, $needle, 0, 'utf-8');
	} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	$byte_pos = strrpos($haystack, $needle);
	if ($byte_pos === FALSE) {
		return FALSE; // needle not found
	}

	return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * For $pos >= 0 the string is scanned forwards from its first byte, for
 * $pos < 0 backwards from its last byte, counting every byte that starts a
 * character until abs($pos) characters have been seen. FALSE is returned when
 * the scan index has left the string by then; callers use this as the
 * "position outside string length" signal.
 *
 * @param string UTF-8 string
 * @param integer Character position (negative values start from the end)
 * @return integer Byte position (FALSE if the position is outside the string)
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
	$str = (string) $str;
	$size = strlen($str);
	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $size - 1;
		$d = -1;
	}

	// Explicit bounds instead of probing $str[$i]: negative offsets read from the
	// end of the string since PHP 7.1 and would let the backward scan wrap around.
	for (; $i >= 0 && $i < $size && $n < $p; $i += $d) {
		// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $size) {
		return FALSE; // offset beyond string length
	}

	if ($pos >= 0) {
		// skip trailing multi-byte data bytes
		while ($i < $size && (ord($str[$i]) & 0xC0) === 0x80) {
			$i++;
		}
	} else {
		// correct offset: the loop stepped one byte in front of the character start
		$i++;
	}

	return $i;
}
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character starts found at byte indexes 1..$pos. A position equal
 * to the byte length of the string is accepted and treated as the boundary
 * behind the last character.
 *
 * @param string UTF-8 string
 * @param integer byte position
 * @return integer character position (FALSE for an empty string or a position outside the string)
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;
	$size = strlen($str);
	if ($size === 0 || $pos < 0 || $pos > $size) {
		return FALSE; // offset beyond string length
	}

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
		// The end-of-string boundary counts like a character start; otherwise count
		// single-byte chars (0xxxxxxx) and multi-byte starting bytes (11xxxxxx).
		if ($i >= $size || (ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}

	return $n;
}
/**
 * Maps all characters of an UTF-8 string.
 *
 * Each character (single byte or complete multi-byte sequence) is looked up in
 * the table selected by $mode; characters without an entry are copied
 * unchanged. The string is returned untouched if initUnicodeData() fails or
 * the mode is unknown.
 *
 * @param string UTF-8 string
 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		return $str; // do nothing
	}

	// The table is taken by value: a reference to a missing key would create
	// that key in the shared table as a side effect.
	switch ($mode) {
		case 'case':
			$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
			break;

		case 'ascii':
			$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$size = strlen($str);
	$out = '';
	for ($i = 0; $i < $size; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++; // calculate number of bytes from the leading 1-bits
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// single-byte (0xxxxxxx); a stray continuation byte is passed through
			// as-is instead of repeating the previously mapped character
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2122 /********************************************
2124 * Internal EUC string operation functions
2126 * Extended Unix Code:
2127 * ASCII compatible 7bit single bytes chars
2128 * 8bit two byte chars
2130 * Shift-JIS is treated as a special case.
2132 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would cross the byte limit is dropped, so the
 * result never ends on the first byte of a double-byte character.
 *
 * @param string EUC multibyte character string
 * @param integer the byte length; values <= 0 yield an empty string
 * @param string the charset ('shift_jis' selects the Shift-JIS lead byte ranges)
 * @return string the shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
	$str = (string) $str;
	$len = (int) $len;
	if ($len <= 0) {
		return '';
	}
	if (strlen($str) <= $len) {
		return $str; // string not longer than supplied length
	}

	$sjis = ($charset == 'shift_jis');
	$i = 0;
	while ($i < $len) {
		$c = ord($str[$i]);
		if ($sjis) {
			$width = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) ? 2 : 1;
		} else {
			$width = ($c >= 0x80) ? 2 : 1;
		}
		if ($i + $width > $len) {
			break; // we ended on a first byte: the character does not fit
		}
		$i += $width;
	}

	return substr($str, 0, $i);
}
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param string EUC multibyte character string
 * @param integer start position (character position)
 * @param string the charset
 * @param integer length (in characters); NULL means "up to the end of the string"
 * @return string the substring (FALSE if $start is outside the string)
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		return FALSE; // $start outside string length
	}

	$str = substr($str, $byte_start);

	// An explicit zero length is loosely equal to NULL and would otherwise be
	// mistaken for "no length given".
	if ($len === 0 || $len === 0.0) {
		return '';
	}

	if ($len != NULL) {
		$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
		if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length that exceeds the
			// string leaves nothing, a positive one keeps the rest
			return $len < 0 ? '' : $str;
		}
		return substr($str, 0, $byte_end);
	}

	return $str;
}
/**
 * Counts the number of characters of a string in the EUC charset family.
 * Bytes >= 0x80 (for 'shift_jis': 0x80-0x9F and >= 0xE0) are taken as the
 * first byte of a double-byte character; every other byte is one character.
 *
 * @param string EUC multibyte character string
 * @param string the charset
 * @return integer the number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
	$str = (string) $str;
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);
	$n = 0;
	for ($i = 0; $i < $size; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
				$i++; // advance a double-byte char
			}
		} elseif ($c >= 0x80) {
			$i++; // advance a double-byte char
		}
		$n++;
	}

	return $n;
}
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned forwards, because the second byte of a
 * Shift-JIS character may lie in the ASCII range and cannot be recognised when
 * walking backwards. For negative positions the character start offsets are
 * collected and the requested one is picked from the end.
 *
 * FALSE is returned when abs($pos) is not smaller than the number of
 * characters in the string; callers use this as the "position outside string
 * length" signal.
 *
 * @param string EUC multibyte character string
 * @param integer character position (negative values start from the end)
 * @param string the charset
 * @return integer byte position (FALSE if the position is outside the string)
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string) $str;
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);
	$p = (int) abs($pos); // number of characters wanted
	$n = 0; // number of characters seen
	$starts = array(); // byte offsets of all character starts (negative positions only)

	for ($i = 0; $i < $size; $i += $width) {
		if ($pos >= 0) {
			if ($n === $p) {
				return $i; // start of the wanted character
			}
		} else {
			$starts[] = $i;
		}

		$c = ord($str[$i]);
		if ($sjis) {
			$width = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) ? 2 : 1;
		} else {
			$width = ($c >= 0x80) ? 2 : 1;
		}
		$n++;
	}

	if ($pos >= 0 || $p >= $n) {
		return FALSE; // offset beyond string length
	}

	return $starts[$n - $p];
}
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Each character (single byte or double-byte pair) is looked up in the table
 * selected by $mode; characters without an entry are copied unchanged. The
 * string is returned untouched if the table cannot be initialised or the mode
 * is unknown.
 *
 * @param string EUC multibyte character string
 * @param string the charset
 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
	// The table is taken by value: a reference to a missing key would create
	// that key in the shared table as a side effect.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str; // do nothing
			}
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str; // do nothing
			}
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$sjis = ($charset == 'shift_jis');
	$size = strlen($str);
	$out = '';
	for ($i = 0; $i < $size; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$double = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$double = ($c >= 0x80);
		}
		if ($double) { // a double-byte char
			$mbc = substr($str, $i, 2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2344 if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
2345 include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);