lib/typo3/class.t3lib_cs.php

   1 <?php
   2 /***************************************************************
   3  *  Copyright notice
   4  *
   5  *  (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
   6  *  All rights reserved
   7  *
   8  *  This script is part of the Typo3 project. The Typo3 project is
   9  *  free software; you can redistribute it and/or modify
  10  *  it under the terms of the GNU General Public License as published by
  11  *  the Free Software Foundation; either version 2 of the License, or
  12  *  (at your option) any later version.
  13  *
  14  *  The GNU General Public License can be found at
  15  *  http://www.gnu.org/copyleft/gpl.html.
  16  *
  17  *  This script is distributed in the hope that it will be useful,
  18  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  *  GNU General Public License for more details.
  21  *
  22  *  This copyright notice MUST APPEAR in all copies of the script!
  23  ***************************************************************/
  24 /**
  25  * Class for conversion between charsets.
  26  *
  27  * $Id$
  28  *
  29  * @author      Kasper Skårhøj <kasperYYYY@typo3.com>
  30  * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
  31  */
  32 /**
  33  * [CLASS/FUNCTION INDEX of SCRIPT]
  34  *
  35  *
  36  *
  37  *  136: class t3lib_cs
  38  *  488:         function parse_charset($charset)
  39  *  507:         function get_locale_charset($locale)
  40  *
  41  *                        SECTION: Charset Conversion functions
  42  *  560:         function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
  43  *  600:         function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
  44  *  617:         function utf8_encode($str,$charset)
  45  *  663:         function utf8_decode($str,$charset,$useEntityForNoChar=0)
  46  *  706:         function utf8_to_entities($str)
  47  *  739:         function entities_to_utf8($str,$alsoStdHtmlEnt=0)
  48  *  773:         function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
  49  *  823:         function UnumberToChar($cbyte)
  50  *  868:         function utf8CharToUnumber($str,$hex=0)
  51  *
  52  *                        SECTION: Init functions
  53  *  911:         function initCharset($charset)
  54  *  973:         function initUnicodeData($mode=null)
  55  * 1198:         function initCaseFolding($charset)
  56  * 1260:         function initToASCII($charset)
  57  *
  58  *                        SECTION: String operation functions
  59  * 1331:         function substr($charset,$string,$start,$len=null)
  60  * 1384:         function strlen($charset,$string)
  61  * 1414:         function crop($charset,$string,$len,$crop='')
  62  * 1467:         function strtrunc($charset,$string,$len)
  63  * 1501:         function conv_case($charset,$string,$case)
  64  * 1527:         function specCharsToASCII($charset,$string)
  65  *
  66  *                        SECTION: Internal string operation functions
  67  * 1567:         function sb_char_mapping($str,$charset,$mode,$opt='')
  68  *
  69  *                        SECTION: Internal UTF-8 string operation functions
  70  * 1622:         function utf8_substr($str,$start,$len=null)
  71  * 1655:         function utf8_strlen($str)
  72  * 1676:         function utf8_strtrunc($str,$len)
  73  * 1698:         function utf8_strpos($haystack,$needle,$offset=0)
  74  * 1723:         function utf8_strrpos($haystack,$needle)
  75  * 1745:         function utf8_char2byte_pos($str,$pos)
  76  * 1786:         function utf8_byte2char_pos($str,$pos)
  77  * 1809:         function utf8_char_mapping($str,$mode,$opt='')
  78  *
  79  *                        SECTION: Internal EUC string operation functions
  80  * 1885:         function euc_strtrunc($str,$len,$charset)
  81  * 1914:         function euc_substr($str,$start,$charset,$len=null)
  82  * 1939:         function euc_strlen($str,$charset)
  83  * 1966:         function euc_char2byte_pos($str,$pos,$charset)
  84  * 2007:         function euc_char_mapping($str,$charset,$mode,$opt='')
  85  *
  86  * TOTAL FUNCTIONS: 35
  87  * (This index is automatically created/updated by the extension "extdeveval")
  88  *
  89  */
  90
  91
  92 /**
  93  * Notes on UTF-8
  94  *
  95  * Functions working on UTF-8 strings:
  96  *
  97  * - strchr/strstr
  98  * - strrchr
  99  * - substr_count
 100  * - implode/explode/join
 101  *
 102  * Functions nearly working on UTF-8 strings:
 103  *
 104  * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
 105  * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
 106  * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
 107  * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 108  * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
 109  *
 110  * Functions NOT working on UTF-8 strings:
 111  *
 112  * - str*cmp
 113  * - stristr
 114  * - stripos
 115  * - substr
 116  * - strrev
 117  * - split/spliti
 118  * - ...
 119  *
 120  */
 121 /**
 122  * Class for conversion between charsets
 123  *
 124  * @author      Kasper Skårhøj <kasperYYYY@typo3.com>
 125  * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 126  * @package TYPO3
 127  * @subpackage t3lib
 128  */
 129 class t3lib_cs {
 130         var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
 131
 132                 // This is the array where parsed conversion tables are stored (cached)
 133         var $parsedCharsets = array();
 134
 135                 // An array where case folding data will be stored (cached)
 136         var $caseFolding = array();
 137
 138                 // An array where charset-to-ASCII mappings are stored (cached)
 139         var $toASCII = array();
 140
  141                 // This tells the converter which charsets have two bytes per char:
 142         var $twoByteSets = array(
 143                 'ucs-2' => 1, // 2-byte Unicode
 144         );
 145
  146                 // This tells the converter which charsets have four bytes per char:
 147         var $fourByteSets = array(
 148                 'ucs-4' => 1, // 4-byte Unicode
 149                 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
 150         );
 151
 152                 // This tells the converter which charsets use a scheme like the Extended Unix Code:
 153         var $eucBasedSets = array(
 154                 'gb2312' => 1, // Chinese, simplified.
 155                 'big5' => 1, // Chinese, traditional.
 156                 'euc-kr' => 1, // Korean
 157                 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
 158         );
 159
 160                 // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 161                 // http://czyborra.com/charsets/iso8859.html
 162         var $synonyms = array(
 163                 'us' => 'ascii',
 164                 'us-ascii' => 'ascii',
 165                 'cp819' => 'iso-8859-1',
 166                 'ibm819' => 'iso-8859-1',
 167                 'iso-ir-100' => 'iso-8859-1',
 168                 'iso-ir-101' => 'iso-8859-2',
 169                 'iso-ir-109' => 'iso-8859-3',
 170                 'iso-ir-110' => 'iso-8859-4',
 171                 'iso-ir-144' => 'iso-8859-5',
 172                 'iso-ir-127' => 'iso-8859-6',
 173                 'iso-ir-126' => 'iso-8859-7',
 174                 'iso-ir-138' => 'iso-8859-8',
 175                 'iso-ir-148' => 'iso-8859-9',
 176                 'iso-ir-157' => 'iso-8859-10',
 177                 'iso-ir-179' => 'iso-8859-13',
 178                 'iso-ir-199' => 'iso-8859-14',
 179                 'iso-ir-203' => 'iso-8859-15',
 180                 'csisolatin1' => 'iso-8859-1',
 181                 'csisolatin2' => 'iso-8859-2',
 182                 'csisolatin3' => 'iso-8859-3',
 183                 'csisolatin5' => 'iso-8859-9',
 184                 'csisolatin8' => 'iso-8859-14',
 185                 'csisolatin9' => 'iso-8859-15',
 186                 'csisolatingreek' => 'iso-8859-7',
 187                 'iso-celtic' => 'iso-8859-14',
 188                 'latin1' => 'iso-8859-1',
 189                 'latin2' => 'iso-8859-2',
 190                 'latin3' => 'iso-8859-3',
 191                 'latin5' => 'iso-8859-9',
 192                 'latin6' => 'iso-8859-10',
 193                 'latin8' => 'iso-8859-14',
 194                 'latin9' => 'iso-8859-15',
 195                 'l1' => 'iso-8859-1',
 196                 'l2' => 'iso-8859-2',
 197                 'l3' => 'iso-8859-3',
 198                 'l5' => 'iso-8859-9',
 199                 'l6' => 'iso-8859-10',
 200                 'l8' => 'iso-8859-14',
 201                 'l9' => 'iso-8859-15',
 202                 'cyrillic' => 'iso-8859-5',
 203                 'arabic' => 'iso-8859-6',
 204                 'tis-620' => 'iso-8859-11',
 205                 'win874' => 'windows-874',
 206                 'win1250' => 'windows-1250',
 207                 'win1251' => 'windows-1251',
 208                 'win1252' => 'windows-1252',
 209                 'win1253' => 'windows-1253',
 210                 'win1254' => 'windows-1254',
 211                 'win1255' => 'windows-1255',
 212                 'win1256' => 'windows-1256',
 213                 'win1257' => 'windows-1257',
 214                 'win1258' => 'windows-1258',
 215                 'cp1250' => 'windows-1250',
 216                 'cp1251' => 'windows-1251',
 217                 'cp1252' => 'windows-1252',
 218                 'ms-ee' => 'windows-1250',
 219                 'ms-ansi' => 'windows-1252',
 220                 'ms-greek' => 'windows-1253',
 221                 'ms-turk' => 'windows-1254',
 222                 'winbaltrim' => 'windows-1257',
 223                 'koi-8ru' => 'koi-8r',
 224                 'koi8r' => 'koi-8r',
 225                 'cp878' => 'koi-8r',
 226                 'mac' => 'macroman',
 227                 'macintosh' => 'macroman',
 228                 'euc-cn' => 'gb2312',
 229                 'x-euc-cn' => 'gb2312',
 230                 'euccn' => 'gb2312',
 231                 'cp936' => 'gb2312',
 232                 'big-5' => 'big5',
 233                 'cp950' => 'big5',
 234                 'eucjp' => 'euc-jp',
 235                 'sjis' => 'shift_jis',
 236                 'shift-jis' => 'shift_jis',
 237                 'cp932' => 'shift_jis',
 238                 'cp949' => 'euc-kr',
 239                 'utf7' => 'utf-7',
 240                 'utf8' => 'utf-8',
 241                 'utf16' => 'utf-16',
 242                 'utf32' => 'utf-32',
 243                 'utf8' => 'utf-8',
 244                 'ucs2' => 'ucs-2',
 245                 'ucs4' => 'ucs-4',
 246         );
 247
 248                 // mapping of iso-639-1 language codes to script names
 249         var $lang_to_script = array(
 250                         // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
 251                 'ar' => 'arabic',
 252                 'bg' => 'cyrillic', // Bulgarian
 253                 'bs' => 'east_european', // Bosnian
 254                 'cs' => 'east_european', // Czech
 255                 'da' => 'west_european', // Danish
 256                 'de' => 'west_european', // German
 257                 'es' => 'west_european', // Spanish
 258                 'et' => 'estonian',
 259                 'eo' => 'unicode', // Esperanto
 260                 'eu' => 'west_european', // Basque
 261                 'fa' => 'arabic', // Persian
 262                 'fi' => 'west_european', // Finish
 263                 'fo' => 'west_european', // Faroese
 264                 'fr' => 'west_european', // French
 265                 'ga' => 'west_european', // Irish
 266                 'gl' => 'west_european', // Galician
 267                 'gr' => 'greek',
 268                 'he' => 'hebrew', // Hebrew (since 1998)
 269                 'hi' => 'unicode', // Hindi
 270                 'hr' => 'east_european', // Croatian
 271                 'hu' => 'east_european', // Hungarian
 272                 'iw' => 'hebrew', // Hebrew (til 1998)
 273                 'is' => 'west_european', // Icelandic
 274                 'it' => 'west_european', // Italian
 275                 'ja' => 'japanese',
 276                 'ka' => 'unicode', // Georgian
 277                 'kl' => 'west_european', // Greenlandic
 278                 'km' => 'unicode', // Khmer
 279                 'ko' => 'korean',
 280                 'lt' => 'lithuanian',
 281                 'lv' => 'west_european', // Latvian/Lettish
 282                 'nl' => 'west_european', // Dutch
 283                 'no' => 'west_european', // Norwegian
 284                 'nb' => 'west_european', // Norwegian Bokmal
 285                 'nn' => 'west_european', // Norwegian Nynorsk
 286                 'pl' => 'east_european', // Polish
 287                 'pt' => 'west_european', // Portuguese
 288                 'ro' => 'east_european', // Romanian
 289                 'ru' => 'cyrillic', // Russian
 290                 'sk' => 'east_european', // Slovak
 291                 'sl' => 'east_european', // Slovenian
 292                 'sr' => 'cyrillic', // Serbian
 293                 'sv' => 'west_european', // Swedish
 294                 'sq' => 'albanian', // Albanian
 295                 'th' => 'thai',
 296                 'uk' => 'cyrillic', // Ukranian
 297                 'vi' => 'vietnamese',
 298                 'zh' => 'chinese',
 299                         // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
 300                         // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
 301                 'ara' => 'arabic',
 302                 'bgr' => 'cyrillic', // Bulgarian
 303                 'cat' => 'west_european', // Catalan
 304                 'chs' => 'simpl_chinese',
 305                 'cht' => 'trad_chinese',
 306                 'csy' => 'east_european', // Czech
 307                 'dan' => 'west_european', // Danisch
 308                 'deu' => 'west_european', // German
 309                 'dea' => 'west_european', // German (Austrian)
 310                 'des' => 'west_european', // German (Swiss)
 311                 'ena' => 'west_european', // English (Australian)
 312                 'enc' => 'west_european', // English (Canadian)
 313                 'eng' => 'west_european', // English
 314                 'enz' => 'west_european', // English (New Zealand)
 315                 'enu' => 'west_european', // English (United States)
 316                 'euq' => 'west_european', // Basque
 317                 'fos' => 'west_european', // Faroese
 318                 'far' => 'arabic', // Persian
 319                 'fin' => 'west_european', // Finish
 320                 'fra' => 'west_european', // French
 321                 'frb' => 'west_european', // French (Belgian)
 322                 'frc' => 'west_european', // French (Canadian)
 323                 'frs' => 'west_european', // French (Swiss)
 324                 'geo' => 'unicode', // Georgian
 325                 'glg' => 'west_european', // Galician
 326                 'ell' => 'greek',
 327                 'heb' => 'hebrew',
 328                 'hin' => 'unicode', // Hindi
 329                 'hun' => 'east_european', // Hungarian
 330                 'isl' => 'west_euorpean', // Icelandic
 331                 'ita' => 'west_european', // Italian
 332                 'its' => 'west_european', // Italian (Swiss)
 333                 'jpn' => 'japanese',
 334                 'khm' => 'unicode', // Khmer
 335                 'kor' => 'korean',
 336                 'lth' => 'lithuanian',
 337                 'lvi' => 'west_european', // Latvian/Lettish
 338                 'msl' => 'west_european', // Malay
 339                 'nlb' => 'west_european', // Dutch (Belgian)
 340                 'nld' => 'west_european', // Dutch
 341                 'nor' => 'west_european', // Norwegian (bokmal)
 342                 'non' => 'west_european', // Norwegian (nynorsk)
 343                 'plk' => 'east_european', // Polish
 344                 'ptg' => 'west_european', // Portuguese
 345                 'ptb' => 'west_european', // Portuguese (Brazil)
 346                 'rom' => 'east_european', // Romanian
 347                 'rus' => 'cyrillic', // Russian
 348                 'slv' => 'east_european', // Slovenian
 349                 'sky' => 'east_european', // Slovak
 350                 'srl' => 'east_european', // Serbian (Latin)
 351                 'srb' => 'cyrillic', // Serbian (Cyrillic)
 352                 'esp' => 'west_european', // Spanish (trad. sort)
 353                 'esm' => 'west_european', // Spanish (Mexican)
 354                 'esn' => 'west_european', // Spanish (internat. sort)
 355                 'sve' => 'west_european', // Swedish
 356                 'sqi' => 'albanian', // Albanian
 357                 'tha' => 'thai',
 358                 'trk' => 'turkish',
 359                 'ukr' => 'cyrillic', // Ukrainian
 360                         // English language names
 361                 'albanian' => 'albanian',
 362                 'arabic' => 'arabic',
 363                 'basque' => 'west_european',
 364                 'bosnian' => 'east_european',
 365                 'bulgarian' => 'east_european',
 366                 'catalan' => 'west_european',
 367                 'croatian' => 'east_european',
 368                 'czech' => 'east_european',
 369                 'danish' => 'west_european',
 370                 'dutch' => 'west_european',
 371                 'english' => 'west_european',
 372                 'esperanto' => 'unicode',
 373                 'estonian' => 'estonian',
 374                 'faroese' => 'west_european',
 375                 'farsi' => 'arabic',
 376                 'finnish' => 'west_european',
 377                 'french' => 'west_european',
 378                 'galician' => 'west_european',
 379                 'georgian' => 'unicode',
 380                 'german' => 'west_european',
 381                 'greek' => 'greek',
 382                 'greenlandic' => 'west_european',
 383                 'hebrew' => 'hebrew',
 384                 'hindi' => 'unicode',
 385                 'hungarian' => 'east_european',
 386                 'icelandic' => 'west_european',
 387                 'italian' => 'west_european',
 388                 'khmer' => 'unicode',
 389                 'latvian' => 'west_european',
 390                 'lettish' => 'west_european',
 391                 'lithuanian' => 'lithuanian',
 392                 'malay' => 'west_european',
 393                 'norwegian' => 'west_european',
 394                 'persian' => 'arabic',
 395                 'polish' => 'east_european',
 396                 'portuguese' => 'west_european',
 397                 'russian' => 'cyrillic',
 398                 'romanian' => 'east_european',
 399                 'serbian' => 'cyrillic',
 400                 'slovak' => 'east_european',
 401                 'slovenian' => 'east_european',
 402                 'spanish' => 'west_european',
 403                 'svedish' => 'west_european',
 404                 'that' => 'thai',
 405                 'turkish' => 'turkish',
 406                 'ukrainian' => 'cyrillic',
 407         );
 408
 409                 // mapping of language (family) names to charsets on Unix
 410         var $script_to_charset_unix = array(
 411                 'west_european' => 'iso-8859-1',
 412                 'estonian' => 'iso-8859-1',
 413                 'east_european' => 'iso-8859-2',
 414                 'baltic' => 'iso-8859-4',
 415                 'cyrillic' => 'iso-8859-5',
 416                 'arabic' => 'iso-8859-6',
 417                 'greek' => 'iso-8859-7',
 418                 'hebrew' => 'iso-8859-8',
 419                 'turkish' => 'iso-8859-9',
 420                 'thai' => 'iso-8859-11', // = TIS-620
 421                 'lithuanian' => 'iso-8859-13',
 422                 'chinese' => 'gb2312', // = euc-cn
 423                 'japanese' => 'euc-jp',
 424                 'korean' => 'euc-kr',
 425                 'simpl_chinese' => 'gb2312',
 426                 'trad_chinese' => 'big5',
 427                 'vietnamese' => '',
 428                 'unicode' => 'utf-8',
 429                 'albanian' => 'utf-8'
 430         );
 431
 432                 // mapping of language (family) names to charsets on Windows
 433         var $script_to_charset_windows = array(
 434                 'east_european' => 'windows-1250',
 435                 'cyrillic' => 'windows-1251',
 436                 'west_european' => 'windows-1252',
 437                 'greek' => 'windows-1253',
 438                 'turkish' => 'windows-1254',
 439                 'hebrew' => 'windows-1255',
 440                 'arabic' => 'windows-1256',
 441                 'baltic' => 'windows-1257',
 442                 'estonian' => 'windows-1257',
 443                 'lithuanian' => 'windows-1257',
 444                 'vietnamese' => 'windows-1258',
 445                 'thai' => 'cp874',
 446                 'korean' => 'cp949',
 447                 'chinese' => 'gb2312',
 448                 'japanese' => 'shift_jis',
 449                 'simpl_chinese' => 'gb2312',
 450                 'trad_chinese' => 'big5',
 451                 'albanian' => 'windows-1250',
 452                 'unicode' => 'utf-8'
 453         );
 454
 455                 // mapping of locale names to charsets
 456         var $locale_to_charset = array(
 457                 'japanese.euc' => 'euc-jp',
 458                 'ja_jp.ujis' => 'euc-jp',
 459                 'korean.euc' => 'euc-kr',
 460                 'sr@Latn' => 'iso-8859-2',
 461                 'zh_cn' => 'gb2312',
 462                 'zh_hk' => 'big5',
 463                 'zh_tw' => 'big5',
 464         );
 465
 466                 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
 467                 // Empty values means "iso-8859-1"
 468         var $charSetArray = array(
 469                 'dk' => '',
 470                 'de' => '',
 471                 'no' => '',
 472                 'it' => '',
 473                 'fr' => '',
 474                 'es' => '',
 475                 'nl' => '',
 476                 'cz' => 'windows-1250',
 477                 'pl' => 'iso-8859-2',
 478                 'si' => 'windows-1250',
 479                 'fi' => '',
 480                 'tr' => 'iso-8859-9',
 481                 'se' => '',
 482                 'pt' => '',
 483                 'ru' => 'windows-1251',
 484                 'ro' => 'iso-8859-2',
 485                 'ch' => 'gb2312',
 486                 'sk' => 'windows-1250',
 487                 'lt' => 'windows-1257',
 488                 'is' => 'utf-8',
 489                 'hr' => 'windows-1250',
 490                 'hu' => 'iso-8859-2',
 491                 'gl' => '',
 492                 'th' => 'iso-8859-11',
 493                 'gr' => 'iso-8859-7',
 494                 'hk' => 'big5',
 495                 'eu' => '',
 496                 'bg' => 'windows-1251',
 497                 'br' => '',
 498                 'et' => 'iso-8859-4',
 499                 'ar' => 'iso-8859-6',
 500                 'he' => 'utf-8',
 501                 'ua' => 'windows-1251',
 502                 'jp' => 'shift_jis',
 503                 'lv' => 'utf-8',
 504                 'vn' => 'utf-8',
 505                 'ca' => 'iso-8859-15',
 506                 'ba' => 'iso-8859-2',
 507                 'kr' => 'euc-kr',
 508                 'eo' => 'utf-8',
 509                 'my' => '',
 510                 'hi' => 'utf-8',
 511                 'fo' => 'utf-8',
 512                 'fa' => 'utf-8',
 513                 'sr' => 'utf-8',
 514                 'sq' => 'utf-8',
 515                 'ge' => 'utf-8',
 516                 'ga' => '',
 517                 'km' => 'utf-8',
 518                 'qc' => '',
 519         );
 520
 521                 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
 522                 // Missing keys means: same as Typo3
 523         var $isoArray = array(
 524                 'ba' => 'bs',
 525                 'br' => 'pt_BR',
 526                 'ch' => 'zh_CN',
 527                 'cz' => 'cs',
 528                 'dk' => 'da',
 529                 'si' => 'sl',
 530                 'se' => 'sv',
 531                 'gl' => 'kl',
 532                 'gr' => 'el',
 533                 'hk' => 'zh_HK',
 534                 'kr' => 'ko',
 535                 'ua' => 'uk',
 536                 'jp' => 'ja',
 537                 'qc' => 'fr_CA',
 538                 'vn' => 'vi',
 539                 'ge' => 'ka',
 540                 'ga' => 'gl',
 541         );
 542
 543         /**
 544          * Normalize - changes input character set to lowercase letters.
 545          *
 546          * @param       string          Input charset
 547          * @return      string          Normalized charset
 548          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 549          */
 550         function parse_charset($charset) {
 551                 $charset = trim(strtolower($charset));
 552                 if (isset($this->synonyms[$charset])) {
 553                         $charset = $this->synonyms[$charset];
 554                 }
 555
 556                 return $charset;
 557         }
 558
 559         /**
 560          * Get the charset of a locale.
 561          *
 562          * ln                   language
 563          * ln_CN                 language / country
 564          * ln_CN.cs       language / country / charset
 565          * ln_CN.cs@mod  language / country / charset / modifier
 566          *
 567          * @param       string          Locale string
 568          * @return      string          Charset resolved for locale string
 569          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 570          */
 571         function get_locale_charset($locale) {
 572                 $locale = strtolower($locale);
 573
 574                         // exact locale specific charset?
 575                 if (isset($this->locale_to_charset[$locale])) {
 576                         return $this->locale_to_charset[$locale];
 577                 }
 578
 579                         // get modifier
 580                 list($locale, $modifier) = explode('@', $locale);
 581
 582                         // locale contains charset: use it
 583                 list($locale, $charset) = explode('.', $locale);
 584                 if ($charset) {
 585                         return $this->parse_charset($charset);
 586                 }
 587
 588                         // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
 589                 if ($modifier == 'euro') {
 590                         return 'iso-8859-15';
 591                 }
 592
 593                         // get language
 594                 list($language, $country) = explode('_', $locale);
 595                 if (isset($this->lang_to_script[$language])) {
 596                         $script = $this->lang_to_script[$language];
 597                 }
 598
 599                 if (TYPO3_OS == 'WIN') {
 600                         $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
 601                 } else {
 602                         $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
 603                 }
 604
 605                 return $cs;
 606         }
 607
 608
 609         /********************************************
 610          *
 611          * Charset Conversion functions
 612          *
 613          ********************************************/
 614
 615         /**
 616          * Convert from one charset to another charset.
 617          *
 618          * @param       string          Input string
 619          * @param       string          From charset (the current charset of the string)
 620          * @param       string          To charset (the output charset wanted)
 621          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 622          * @return      string          Converted string
 623          * @see convArray()
 624          */
 625         function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
 626                 if ($fromCS == $toCS) {
 627                         return $str;
 628                 }
 629
 630                         // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
 631                 if ($toCS == 'utf-8' || !$useEntityForNoChar) {
 632                         switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
 633                                 case 'mbstring':
 634                                         $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
 635                                         if (FALSE !== $conv_str) {
 636                                                 return $conv_str;
 637                                         } // returns false for unsupported charsets
 638                                         break;
 639
 640                                 case 'iconv':
 641                                         $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
 642                                         if (FALSE !== $conv_str) {
 643                                                 return $conv_str;
 644                                         }
 645                                         break;
 646
 647                                 case 'recode':
 648                                         $conv_str = recode_string($fromCS . '..' . $toCS, $str);
 649                                         if (FALSE !== $conv_str) {
 650                                                 return $conv_str;
 651                                         }
 652                                         break;
 653                         }
 654                         // fallback to TYPO3 conversion
 655                 }
 656
 657                 if ($fromCS != 'utf-8') {
 658                         $str = $this->utf8_encode($str, $fromCS);
 659                 }
 660                 if ($toCS != 'utf-8') {
 661                         $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
 662                 }
 663                 return $str;
 664         }
 665
 /**
  * Recursively converts every string found in an array from one charset to another.
  * NOTICE: The array is passed by reference and modified in place! Values that are
  * neither strings nor arrays are left untouched.
  *
  * @param       array           Input array, possibly multidimensional
  * @param       string          From charset (the current charset of the strings)
  * @param       string          To charset (the output charset wanted)
  * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
  * @return      void
  * @see conv()
  */
 function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
     foreach ($array as $idx => $unused) {
         // Always look at the live element, not the iteration copy.
         if (is_string($array[$idx])) {
             $array[$idx] = $this->conv($array[$idx], $fromCS, $toCS, $useEntityForNoChar);
             continue;
         }
         if (is_array($array[$idx])) {
             // Descend into nested arrays; they are converted in place as well.
             $this->convArray($array[$idx], $fromCS, $toCS, $useEntityForNoChar);
         }
     }
 }
 686
 /**
  * Converts $str from $charset to UTF-8
  *
  * Bytes 0-127 are passed through unchanged (except for charsets listed in
  * $this->twoByteSets, where every pair of bytes forms one character). All other
  * characters are looked up in the parsed conversion table of the charset; characters
  * without a mapping are replaced by the byte value in $this->noCharByteVal.
  *
  * @param       string          String in local charset to convert to UTF-8
  * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
  * @return      string          Output string, converted to UTF-8. Empty string if no conversion table could be loaded for $charset.
  */
 function utf8_encode($str, $charset) {
     if ($charset === 'utf-8') {
         return $str;
     }

     // Parse conv. table if not already done. Without a table nothing can be converted;
     // return an empty string explicitly instead of falling off the end (implicit NULL).
     if (!$this->initCharset($charset)) {
         return '';
     }

     // Loop-invariant lookups, hoisted out of the byte loop.
     $localToUtf8 = isset($this->parsedCharsets[$charset]['local']) ? $this->parsedCharsets[$charset]['local'] : array();
     $isTwoByteSet = isset($this->twoByteSets[$charset]);
     $isEucBasedSet = isset($this->eucBasedSets[$charset]);
     $noChar = chr($this->noCharByteVal);

     $strLen = strlen($str);
     $outStr = '';

     for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
         $chr = substr($str, $a, 1);
         $ord = ord($chr);

         if ($isTwoByteSet) {
             // The charset has two bytes per char; big endian is assumed.
             // substr() is used instead of a string offset so that a missing trailing
             // byte (odd-length input) simply reads as 0 instead of raising an error.
             $a++;
             $ord = ($ord << 8) | ord(substr($str, $a, 1));
             $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
         } elseif ($ord > 127) {
             // EUC based sets use two bytes above 127: fetch the second byte, advance the
             // pointer and make $ord a 16bit int.
             // Shift-JIS exception: chars between 160 and 223 are single byte.
             if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
                 $a++;
                 $ord = $ord * 256 + ord(substr($str, $a, 1));
             }
             $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
         } else {
             // It's just ASCII 0-127 and one byte. Transparent.
             $outStr .= $chr;
         }
     }

     return $outStr;
 }
 739
 /**
  * Converts $str from UTF-8 to $charset
  *
  * ASCII bytes are passed through unchanged. Each multibyte UTF-8 sequence is looked
  * up in the parsed conversion table of the charset. Sequences without a mapping become
  * a numeric entity (if $useEntityForNoChar is set) or the byte in $this->noCharByteVal.
  * A continuation byte found where a lead byte is expected also yields that byte.
  *
  * @param       string          String in UTF-8 to convert to local charset
  * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
  * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
  * @return      string          Output string, converted to local charset. Empty string if no conversion table could be loaded for $charset.
  */
 function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
     if ($charset === 'utf-8') {
         return $str;
     }

     // Parse conv. table if not already done. Without a table nothing can be converted;
     // return an empty string explicitly instead of falling off the end (implicit NULL).
     if (!$this->initCharset($charset)) {
         return '';
     }

     $utf8ToLocal = isset($this->parsedCharsets[$charset]['utf8']) ? $this->parsedCharsets[$charset]['utf8'] : array();
     $noChar = chr($this->noCharByteVal);

     $strLen = strlen($str);
     $outStr = '';

     for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
         $chr = substr($str, $a, 1);
         $ord = ord($chr);

         if ($ord <= 127) {
             // ASCII 0-127 and one byte. Transparent.
             $outStr .= $chr;
             continue;
         }
         if (!($ord & 64)) {
             // A lead byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence.
             $outStr .= $noChar;
             continue;
         }

         // Collect the whole sequence: every further high bit set in the lead byte
         // announces one more byte.
         $buf = $chr;
         for ($b = 0; $b < 8; $b++) {
             $ord = $ord << 1;
             if (!($ord & 128)) {
                 break;
             }
             $a++;
             $buf .= substr($str, $a, 1);
         }

         if (isset($utf8ToLocal[$buf])) {
             $mByte = $utf8ToLocal[$buf]; // The local number
             if ($mByte > 255) {
                 // Local numbers above 255 are split into two chars (16bit word assumed).
                 $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
             } else {
                 $outStr .= chr($mByte);
             }
         } elseif ($useEntityForNoChar) {
             // Create num entity:
             $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
         } else {
             // No char exists
             $outStr .= $noChar;
         }
     }

     return $outStr;
 }
 798
 /**
  * Replaces every multibyte UTF-8 sequence (all chars > 127) with a hexadecimal
  * numeric entity. ASCII is passed through unchanged. A byte above 127 that cannot
  * start a sequence is replaced by the byte in $this->noCharByteVal.
  *
  * @param       string          Input string
  * @return      string          Output string
  */
 function utf8_to_entities($str) {
     $length = strlen($str);
     $result = '';
     $pos = 0;

     while ($pos < $length) {
         $byte = substr($str, $pos, 1);
         $code = ord($byte);

         if ($code <= 127) {
             // Plain ASCII, one byte. Transparent.
             $result .= $byte;
         } elseif (!($code & 64)) {
             // Lead bytes have the 7th bit set; this one sits in the MIDDLE of a sequence.
             $result .= chr($this->noCharByteVal);
         } else {
             // Gather the sequence: each further high bit in the lead byte means one more byte follows.
             $sequence = $byte;
             for ($n = 0; $n < 8; $n++) {
                 $code = $code << 1;
                 if (!($code & 128)) {
                     break;
                 }
                 $pos++;
                 $sequence .= substr($str, $pos, 1);
             }
             $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
         }

         $pos++;
     }

     return $result;
 }
 836
 /**
  * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
  *
  * Entities that cannot be interpreted (unknown names, or a "#" that is not followed
  * by a valid decimal / hexadecimal number) are left in the string unchanged.
  *
  * @param       string          Input string, UTF-8
  * @param       boolean         If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
  * @return      string          Output string
  */
 function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
     $trans_tbl = array();
     $tableIsUtf8 = FALSE;
     if ($alsoStdHtmlEnt) {
         // The default encoding of get_html_translation_table() changed from ISO-8859-1
         // to UTF-8 in PHP 5.4, so relying on the default double-encodes the characters
         // on newer versions. Request UTF-8 explicitly where the encoding parameter
         // exists (PHP >= 5.3.4); older versions deliver ISO-8859-1, which is converted below.
         $tableIsUtf8 = version_compare(PHP_VERSION, '5.3.4', '>=');
         $table = $tableIsUtf8
             ? get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8')
             : get_html_translation_table(HTML_ENTITIES);
         $trans_tbl = array_flip($table);
     }

     // Split on entities while capturing their inner text: even indexes hold literal
     // text, odd indexes hold the part between "&" and ";".
     $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
     foreach ($parts as $k => $v) {
         if (!($k % 2)) {
             continue;
         }

         $entity = '&' . $v . ';';
         $match = array();
         if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) { // Hex entity:
             $parts[$k] = $this->UnumberToChar(hexdec($match[1]));
         } elseif (preg_match('/^#([0-9]+)$/', $v, $match)) { // Dec entity:
             // "+ 0" yields an int, or a float for values beyond the int range.
             $parts[$k] = $this->UnumberToChar($match[1] + 0);
         } elseif (isset($trans_tbl[$entity])) { // Other entities:
             $parts[$k] = $tableIsUtf8 ? $trans_tbl[$entity] : $this->utf8_encode($trans_tbl[$entity], 'iso-8859-1');
         } else { // No conversion:
             $parts[$k] = $entity;
         }
     }

     return implode('', $parts);
 }
 869
 /**
  * Splits a UTF-8 string into its characters and returns them as an array, either
  * as integer numbers or (with $retChar) as the UTF-8 characters themselves.
  * A byte above 127 that cannot start a sequence is reported as $this->noCharByteVal.
  *
  * @param       string          Input string, UTF-8
  * @param       boolean         If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
  * @param       boolean         If set, then instead of integer numbers the real UTF-8 char is returned.
  * @return      array           Output array with the char numbers
  */
 function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
     // Resolve entities first if they are to be registered as characters:
     if ($convEntities) {
         $str = $this->entities_to_utf8($str, 1);
     }

     $numbers = array();
     $length = strlen($str);

     for ($pos = 0; $pos < $length; $pos++) {
         $byte = substr($str, $pos, 1);
         $code = ord($byte);

         if ($code <= 127) {
             // Plain ASCII, one byte.
             $numbers[] = $retChar ? chr($code) : $code;
             continue;
         }
         if (!($code & 64)) {
             // Lead bytes have the 7th bit set; this one sits in the MIDDLE of a sequence.
             $numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
             continue;
         }

         // Gather the sequence: each further high bit in the lead byte means one more byte follows.
         $sequence = $byte;
         for ($n = 0; $n < 8; $n++) {
             $code = $code << 1;
             if (!($code & 128)) {
                 break;
             }
             $pos++;
             $sequence .= substr($str, $pos, 1);
         }

         $numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
     }

     return $numbers;
 }
 914
 /**
  * Converts a UNICODE number to a UTF-8 multibyte character
  * Algorithm based on script found at: http://czyborra.com/utf/
  *
  * The bits of the number are spread across the bytes of the sequence; the number of
  * high bits set in the lead byte announces the length of the sequence:
  *
  *  bytes | bits | representation
  *      1 |    7 | 0vvvvvvv
  *      2 |   11 | 110vvvvv 10vvvvvv
  *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
  *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
  *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
  *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
  *
  * Numbers that do not fit into 31 bits yield the byte in $this->noCharByteVal.
  *
  * @param       integer         UNICODE integer
  * @return      string          UTF-8 multibyte character string
  * @see utf8CharToUnumber()
  */
 function UnumberToChar($cbyte) {
     // One byte: plain ASCII.
     if ($cbyte < 0x80) {
         return chr($cbyte);
     }

     // Pick the lead byte marker and the number of continuation bytes.
     if ($cbyte < 0x800) {
         $marker = 0xC0;
         $trail = 1;
     } elseif ($cbyte < 0x10000) {
         $marker = 0xE0;
         $trail = 2;
     } elseif ($cbyte < 0x200000) {
         $marker = 0xF0;
         $trail = 3;
     } elseif ($cbyte < 0x4000000) {
         $marker = 0xF8;
         $trail = 4;
     } elseif ($cbyte < 0x80000000) {
         $marker = 0xFC;
         $trail = 5;
     } else {
         // Cannot express a 32-bit character in UTF-8
         return chr($this->noCharByteVal);
     }

     // Lead byte carries the topmost bits, every continuation byte the next six.
     $char = chr($marker | ($cbyte >> (6 * $trail)));
     for ($shift = 6 * ($trail - 1); $shift >= 0; $shift -= 6) {
         $char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
     }

     return $char;
 }
 979
 /**
  * Converts a UTF-8 Multibyte character to a UNICODE number
  *
  * Only the first character of $str is evaluated. If the first byte is not the lead
  * byte of a multibyte sequence, its plain byte value is returned. Missing continuation
  * bytes (truncated sequence) count as zero bits.
  *
  * @param       string          UTF-8 multibyte character string
  * @param       boolean         If set, then a hex. number is returned.
  * @return      mixed           UNICODE integer; if $hex is set, a string made of "x" followed by the hexadecimal number
  * @see UnumberToChar()
  */
 function utf8CharToUnumber($str, $hex = 0) {
     $lead = ord(substr($str, 0, 1)); // First char
     $int = $lead;

     if (($lead & 0xC0) == 0xC0) { // This verifies that it IS a multi byte string
         // The number of high bits following the first one equals the number of
         // continuation bytes in the sequence.
         $trail = 0;
         for ($probe = $lead << 1; $probe & 0x80; $probe = $probe << 1) {
             $trail++;
         }

         // The lead byte contributes its lowest (6 - $trail) bits. For the invalid lead
         // bytes 0xFE / 0xFF that count is not positive, so they contribute no bits at
         // all (the former substr() based code picked up the whole padded string here).
         $int = $trail < 6 ? $lead & ((1 << (6 - $trail)) - 1) : 0;

         // Every continuation byte contributes its lowest six bits.
         for ($i = 1; $i <= $trail; $i++) {
             $int = ($int << 6) | (ord(substr($str, $i, 1)) & 0x3F);
         }
     }

     return $hex ? 'x' . dechex($int) : $int;
 }
1011
1012
1013         /********************************************
1014          *
1015          * Init functions
1016          *
1017          ********************************************/
1018
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is stored in $this->parsedCharsets[$charset] with two maps:
 * 'local' (local char number => UTF-8 char) and 'utf8' (UTF-8 char => local char number).
 * Only local char numbers above 127 are stored. A parsed table is cached as a
 * serialized file below typo3temp/cs/; an unusable cache file is ignored and rebuilt.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param       string          The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return      integer         Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
    // Only process if the charset is not yet loaded:
    if (!empty($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }

    // Conversion table filename:
    $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
    if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return FALSE;
    }

    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // NOTE(review): unserialize() is applied to a file from the temp folder; this is
        // only safe as long as that folder is not writable by untrusted parties.
        $cached = unserialize(t3lib_div::getUrl($cacheFile));
        // Accept the cache only if it has the expected shape. A truncated or corrupt
        // file must not be stored and reported as a successfully loaded table.
        if (is_array($cached) && isset($cached['local'], $cached['utf8']) && is_array($cached['local']) && is_array($cached['utf8'])) {
            $this->parsedCharsets[$charset] = $cached;
            return 2;
        }
    }

    // Parse conversion table into lines:
    $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
    $table = array('local' => array(), 'utf8' => array());

    // The "whitespaced" type is on the syntax "0x0A   0x000A  #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';

    foreach ($lines as $value) {
        // Comment lines or blanks are ignored.
        if (!trim($value) || substr($value, 0, 1) == '#') {
            continue;
        }

        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }

        if ($detectedType == 'ms-token') {
            $fields = preg_split('/[=:]/', $value, 3);
            if (count($fields) < 2) {
                // Malformed line without a separator: skip it instead of creating a junk mapping.
                continue;
            }
            $hexbyte = $fields[0];
            $utf8 = $fields[1];
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Line does not carry a mapping (nothing to read from it).
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }

        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // substr(..., 2) strips the leading "U+" from the UNICODE number.
            $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }

    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }

    return 2;
}
1087
1088         /**
1089          * This function initializes all UTF-8 character data tables.
1090          *
1091          * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1092          *
1093          * @param       string          Mode ("case", "ascii", ...)
1094          * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1095          * @access private
1096          */
1097         function initUnicodeData($mode = NULL) {
1098                         // cache files
1099                 $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1100                 $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1101
1102                         // Only process if the tables are not yet loaded
1103                 switch ($mode) {
1104                         case 'case':
1105                                 if (is_array($this->caseFolding['utf-8'])) {
1106                                         return 1;
1107                                 }
1108
1109                                         // Use cached version if possible
1110                                 if ($cacheFileCase && @is_file($cacheFileCase)) {
1111                                         $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
1112                                         return 2;
1113                                 }
1114                                 break;
1115
1116                         case 'ascii':
1117                                 if (is_array($this->toASCII['utf-8'])) {
1118                                         return 1;
1119                                 }
1120
1121                                         // Use cached version if possible
1122                                 if ($cacheFileASCII && @is_file($cacheFileASCII)) {
1123                                         $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
1124                                         return 2;
1125                                 }
1126                                 break;
1127                 }
1128
1129                         // process main Unicode data file
1130                 $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
1131                 if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
1132                         return FALSE;
1133                 }
1134
1135                 $fh = fopen($unicodeDataFile, 'rb');
1136                 if (!$fh) {
1137                         return FALSE;
1138                 }
1139
1140                         // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1141                         // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1142                 $this->caseFolding['utf-8'] = array();
1143                 $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
1144                 $utf8CaseFolding['toUpper'] = array();
1145                 $utf8CaseFolding['toLower'] = array();
1146                 $utf8CaseFolding['toTitle'] = array();
1147
1148                 $decomposition = array(); // array of temp. decompositions
1149                 $mark = array(); // array of chars that are marks (eg. composing accents)
1150                 $number = array(); // array of chars that are numbers (eg. digits)
1151                 $omit = array(); // array of chars to be omitted (eg. Russian hard sign)
1152
1153                 while (!feof($fh)) {
1154                         $line = fgets($fh, 4096);
1155                                 // has a lot of info
1156                         list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));
1157
1158                         $ord = hexdec($char);
1159                         if ($ord > 0xFFFF) {
1160                                 break;
1161                         } // only process the BMP
1162
1163                         $utf8_char = $this->UnumberToChar($ord);
1164
1165                         if ($upper) {
1166                                 $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1167                         }
1168                         if ($lower) {
1169                                 $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1170                         }
1171                                 // store "title" only when different from "upper" (only a few)
1172                         if ($title && $title != $upper) {
1173                                 $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1174                         }
1175
1176                         switch ($cat{0}) {
1177                                 case 'M': // mark (accent, umlaut, ...)
1178                                         $mark["U+$char"] = 1;
1179                                         break;
1180
1181                                 case 'N': // numeric value
1182                                         if ($ord > 0x80 && $num != '') {
1183                                                 $number["U+$char"] = $num;
1184                                         }
1185                         }
1186
1187                                 // accented Latin letters without "official" decomposition
1188                         $match = array();
1189                         if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
1190                                 $c = ord($match[2]);
1191                                 if ($match[1] == 'SMALL') {
1192                                         $c += 32;
1193                                 }
1194
1195                                 $decomposition["U+$char"] = array(dechex($c));
1196                                 continue;
1197                         }
1198
1199                         $match = array();
1200                         if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
1201                                 switch ($match[1]) {
1202                                         case '<circle>': // add parenthesis as circle replacement, eg (1)
1203                                                 $match[2] = '0028 ' . $match[2] . ' 0029';
1204                                                 break;
1205
1206                                         case '<square>': // add square brackets as square replacement, eg [1]
1207                                                 $match[2] = '005B ' . $match[2] . ' 005D';
1208                                                 break;
1209
1210                                         case '<compat>': // ignore multi char decompositions that start with a space
1211                                                 if (preg_match('/^0020 /', $match[2])) {
1212                                                         continue 2;
1213                                                 }
1214                                                 break;
1215
1216                                                 // ignore Arabic and vertical layout presentation decomposition
1217                                         case '<initial>':
1218                                         case '<medial>':
1219                                         case '<final>':
1220                                         case '<isolated>':
1221                                         case '<vertical>':
1222                                                 continue 2;
1223                                 }
1224                                 $decomposition["U+$char"] = explode(' ', $match[2]);
1225                         }
1226                 }
1227                 fclose($fh);
1228
1229                         // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1230                 $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
1231                 if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
1232                         $fh = fopen($specialCasingFile, 'rb');
1233                         if ($fh) {
1234                                 while (!feof($fh)) {
1235                                         $line = fgets($fh, 4096);
1236                                         if ($line{0} != '#' && trim($line) != '') {
1237
1238                                                 list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
1239                                                 if ($cond == '' || $cond{0} == '#') {
1240                                                         $utf8_char = $this->UnumberToChar(hexdec($char));
1241                                                         if ($char != $lower) {
1242                                                                 $arr = explode(' ', $lower);
1243                                                                 for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1244                                                                 $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
1245                                                         }
1246                                                         if ($char != $title && $title != $upper) {
1247                                                                 $arr = explode(' ', $title);
1248                                                                 for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1249                                                                 $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
1250                                                         }
1251                                                         if ($char != $upper) {
1252                                                                 $arr = explode(' ', $upper);
1253                                                                 for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1254                                                                 $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
1255                                                         }
1256                                                 }
1257                                         }
1258                                 }
1259                                 fclose($fh);
1260                         }
1261                 }
1262
1263                         // process custom decompositions
1264                 $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
1265                 if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
1266                         $fh = fopen($customTranslitFile, 'rb');
1267                         if ($fh) {
1268                                 while (!feof($fh)) {
1269                                         $line = fgets($fh, 4096);
1270                                         if ($line{0} != '#' && trim($line) != '') {
1271                                                 list($char, $translit) = t3lib_div::trimExplode(';', $line);
1272                                                 if (!$translit) {
1273                                                         $omit["U+$char"] = 1;
1274                                                 }
1275                                                 $decomposition["U+$char"] = explode(' ', $translit);
1276
1277                                         }
1278                                 }
1279                                 fclose($fh);
1280                         }
1281                 }
1282
1283                         // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1284                 foreach ($decomposition as $from => $to) {
1285                         $code_decomp = array();
1286
1287                         while ($code_value = array_shift($to)) {
1288                                 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
1289                                         foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
1290                                                 array_unshift($to, $cv);
1291                                         }
1292                                 } elseif (!isset($mark["U+$code_value"])) { // remove mark
1293                                         array_push($code_decomp, $code_value);
1294                                 }
1295                         }
1296                         if (count($code_decomp) || isset($omit[$from])) {
1297                                 $decomposition[$from] = $code_decomp;
1298                         } else {
1299                                 unset($decomposition[$from]);
1300                         }
1301                 }
1302
1303                         // create ascii only mapping
1304                 $this->toASCII['utf-8'] = array();
1305                 $ascii =& $this->toASCII['utf-8'];
1306
1307                 foreach ($decomposition as $from => $to) {
1308                         $code_decomp = array();
1309                         while ($code_value = array_shift($to)) {
1310                                 $ord = hexdec($code_value);
1311                                 if ($ord > 127) {
1312                                         continue 2;
1313                                 } // skip decompositions containing non-ASCII chars
1314                                 else
1315                                 {
1316                                         array_push($code_decomp, chr($ord));
1317                                 }
1318                         }
1319                         $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
1320                 }
1321
1322                         // add numeric decompositions
1323                 foreach ($number as $from => $to) {
1324                         $utf8_char = $this->UnumberToChar(hexdec($from));
1325                         if (!isset($ascii[$utf8_char])) {
1326                                 $ascii[$utf8_char] = $to;
1327                         }
1328                 }
1329
1330                 if ($cacheFileCase) {
1331                         t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1332                 }
1333
1334                 if ($cacheFileASCII) {
1335                         t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1336                 }
1337
1338                 return 3;
1339         }
1340
/**
 * Initializes the case folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is taken from memory if already loaded, otherwise from the cache file
 * typo3temp/cs/cscase_<charset>.tbl, and is otherwise derived from the UTF-8
 * case folding table by converting every character of the charset.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
    // Only process if the case table is not yet loaded
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }

    // Use the cached version if possible. An unreadable or corrupt cache file must
    // not be reported as success, so in that case we fall through and rebuild it.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $serialized = t3lib_div::getUrl($cacheFile);
        $cachedTable = is_string($serialized) ? @unserialize($serialized) : FALSE;
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }

    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return FALSE;
    }

    $nochar = chr($this->noCharByteVal);
    $table = array(
        'toUpper' => array(),
        'toLower' => array(),
        'toTitle' => array(),
    );

    $localChars = array();
    if (isset($this->parsedCharsets[$charset]['local']) && is_array($this->parsedCharsets[$charset]['local'])) {
        $localChars = $this->parsedCharsets[$charset]['local'];
    }

    foreach ($localChars as $utf8) {
        // Reconvert to charset (don't use chr() of the numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);

        foreach (array('toUpper', 'toLower', 'toTitle') as $mode) {
            // Characters without a UTF-8 mapping for this mode have nothing to convert
            if (!isset($this->caseFolding['utf-8'][$mode][$utf8])) {
                continue;
            }

            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$mode][$utf8], $charset);
            // Drop empty results and the "no char" byte
            // (presumably what utf8_decode() yields for characters missing in the charset)
            if ($cc != '' && $cc != $nochar) {
                $table[$mode][$c] = $cc;
            }
        }
    }

    // Add the ASCII case table (overrides whatever the conversion produced for a-z / A-Z)
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $table['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $table['toLower'][chr($i)] = chr($i + 32);
    }

    $this->caseFolding[$charset] = $table;

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }

    return 3;
}
1410
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is taken from memory if already loaded, otherwise from the cache file
 * typo3temp/cs/csascii_<charset>.tbl, and is otherwise derived from the UTF-8
 * to-ASCII table by converting every character of the charset.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
    // Only process if the conversion table is not yet loaded
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }

    // Use the cached version if possible. An unreadable or corrupt cache file must
    // not be reported as success, so in that case we fall through and rebuild it.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $serialized = t3lib_div::getUrl($cacheFile);
        $cachedTable = is_string($serialized) ? @unserialize($serialized) : FALSE;
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
    }

    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return FALSE;
    }

    // Start from an empty array so that a charset without any mapping still
    // yields (and caches) a valid table instead of an undefined entry.
    $table = array();

    $localChars = array();
    if (isset($this->parsedCharsets[$charset]['local']) && is_array($this->parsedCharsets[$charset]['local'])) {
        $localChars = $this->parsedCharsets[$charset]['local'];
    }

    foreach ($localChars as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // Reconvert to charset (don't use chr() of the numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $table[$c] = $this->toASCII['utf-8'][$utf8];
        }
    }

    $this->toASCII[$charset] = $table;

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }

    return 3;
}
1458
1459
1460         /********************************************
1461          *
1462          * String operation functions
1463          *
1464          ********************************************/
1465
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is delegated to
 * mbstring or iconv; otherwise the charset decides between the UTF-8 and EUC
 * helpers and plain byte arithmetic for fixed-width and single-byte encodings.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL (default) returns everything from $start to the end of the string
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }

    $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
        : '';

    if ($utils == 'mbstring') {
        // The length cannot be omitted when a charset is given. The full character
        // count is always a sufficient length, and unlike temporarily switching
        // mb_internal_encoding() it leaves global state untouched.
        if ($len == NULL) {
            $len = mb_strlen($string, $charset);
        }
        return mb_substr($string, $start, $len, $charset);
    }

    if ($utils == 'iconv') {
        // Same approach as for mbstring; this also avoids iconv_set_encoding(),
        // whose 'internal_encoding' setting is deprecated in newer PHP versions.
        if ($len == NULL) {
            $len = iconv_strlen($string, $charset);
            if ($len === FALSE) {
                // Mirror the failure value iconv_substr() reports for undecodable input
                return FALSE;
            }
        }
        return iconv_substr($string, $start, $len, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }

    // Fixed-width encodings: scale character positions to byte positions.
    // Everything that is not listed is treated as single-byte encoding.
    $bytesPerChar = 1;
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerChar = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerChar = 4;
    }

    if ($bytesPerChar > 1) {
        $start *= $bytesPerChar;
        // Only scale a real length: NULL * n would become 0 and cut the result down to ''
        if ($len !== NULL) {
            $len *= $bytesPerChar;
        }
    }

    return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}
1522
/**
 * Counts the number of characters of a string in the given charset.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * mbstring or iconv do the counting when configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']; otherwise the charset selects the
 * UTF-8 or EUC helper, or the byte count is divided by the fixed character width.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

    if ($backend == 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($backend == 'iconv') {
        return iconv_strlen($string, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strlen($string, $charset);
    }

    $byteCount = strlen($string);
    if ($this->twoByteSets[$charset]) {
        return $byteCount / 2;
    }
    if ($this->fourByteSets[$charset]) {
        return $byteCount / 4;
    }

    // Anything else counts as a single-byte encoding: one byte per character
    return $byteCount;
}
1550
/**
 * Crops a string by means of the mbstring functions.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned untouched when $len evaluates to 0 or the string is not longer than
 * abs($len) characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    if (intval($len) === 0) {
        return $string;
    }

    $charCount = mb_strlen($string, $charset);
    if ($charCount <= abs($len)) {
        return $string;
    }

    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $charCount, $charset);
}
1574
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged when $len evaluates to 0 or nothing would be cut off.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
    if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring'
    ) {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }

    if (intval($len) == 0) {
        return $string;
    }

    // Translate the character position into a byte position ($i)
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        // Everything else is treated as single-byte encoding
        $i = intval($len);
    } else {
        $i = strlen($string) + intval($len);
        if ($i <= 0) {
            $i = FALSE;
        }
    }

    // $len outside actual string length
    if ($i === FALSE) {
        return $string;
    }

    if ($len > 0) {
        // Only crop if there is at least one byte behind the cut position
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i > 0 && isset($string[$i - 1])) {
        // Only crop if there is at least one byte in front of the cut position.
        // The explicit $i > 0 keeps offset -1 from addressing the last byte
        // (negative string offsets count from the end on PHP >= 7.1).
        return $crop . substr($string, $i);
    }

    return $string;
}
1637
/**
 * Cuts a string short at a given byte length.
 *
 * For two- and four-byte charsets the byte length is first rounded down to a
 * multiple of the character width, so the cut is not made inside a character.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strtrunc($string, $len, $charset);
    }

    if ($this->twoByteSets[$charset]) {
        // Don't cut at odd positions
        $len -= $len % 2;
    } elseif ($this->fourByteSets[$charset]) {
        // Realign to a position divisible by four
        $len -= $len % 4;
    }

    // Everything else is a single-byte encoding and is cut at the requested length
    return substr($string, 0, $len);
}
1670
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * With mbstring configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the
 * conversion is delegated to mb_strtolower()/mb_strtoupper(); otherwise the
 * charset selects the UTF-8, EUC or single-byte character mapping.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper")
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $case == 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }

    // Whatever is left is handled as a single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1704
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * The charset selects the mapping routine: UTF-8, EUC based, or single-byte
 * for everything else.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }

    // Whatever is left is handled as a single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1724
1725
1726         /**
1727          * converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
1728          * into a TYPO3-readable language code
1729          * @param       $languageCodesList      list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
1730          *                       see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
1731          * @return      string  a preferred language that TYPO3 supports, or "default" if none found
1732          * @author      Benjamin Mack (benni.typo3.org)
1733          */
1734         public function getPreferredClientLanguage($languageCodesList) {
1735                 $allLanguageCodes = array();
1736                 $selectedLanguage = 'default';
1737
1738                         // get all languages where TYPO3 code is the same as the ISO code
1739                 foreach ($this->charSetArray as $typo3Lang => $charSet) {
1740                         $allLanguageCodes[$typo3Lang] = $typo3Lang;
1741                 }
1742
1743                         // get all languages where TYPO3 code differs from ISO code
1744                         // or needs the country part
1745                         // the iso codes will here overwrite the default typo3 language in the key
1746                 foreach ($this->isoArray as $typo3Lang => $isoLang) {
1747                         $isoLang = join('-', explode('_', $isoLang));
1748                         $allLanguageCodes[$typo3Lang] = $isoLang;
1749                 }
1750
1751                         // move the iso codes to the (because we're comparing the keys with "isset" later on)
1752                 $allLanguageCodes = array_flip($allLanguageCodes);
1753
1754
1755                 $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
1756                         // order the preferred languages after they key
1757                 $sortedPreferredLanguages = array();
1758                 foreach ($preferredLanguages as $preferredLanguage) {
1759                         $quality = 1.0;
1760                         if (strpos($preferredLanguage, ';q=') !== FALSE) {
1761                                 list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
1762                         }
1763                         $sortedPreferredLanguages[$preferredLanguage] = $quality;
1764                 }
1765
1766                         // loop through the languages, with the highest priority first
1767                 arsort($sortedPreferredLanguages, SORT_NUMERIC);
1768                 foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
1769                         if (isset($allLanguageCodes[$preferredLanguage])) {
1770                                 $selectedLanguage = $allLanguageCodes[$preferredLanguage];
1771                                 break;
1772                         }
1773
1774                                 // strip the country code from the end
1775                         list($preferredLanguage, $preferredCountry) = explode('-', $preferredLanguage);
1776                         if (isset($allLanguageCodes[$preferredLanguage])) {
1777                                 $selectedLanguage = $allLanguageCodes[$preferredLanguage];
1778                                 break;
1779                         }
1780                 }
1781                 if (!$selectedLanguage || $selectedLanguage == 'en') {
1782                         $selectedLanguage = 'default';
1783                 }
1784                 return $selectedLanguage;
1785         }
1786
1787
1788         /********************************************
1789          *
1790          * Internal string operation functions
1791          *
1792          ********************************************/
1793
1794         /**
1795          * Maps all characters of a string in a single byte charset.
1796          *
1797          * @param       string          the string
1798          * @param       string          the charset
1799          * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1800          * @param       string          'case': conversion 'toLower' or 'toUpper'
1801          * @return      string          the converted string
1802          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1803          */
1804         function sb_char_mapping($str, $charset, $mode, $opt = '') {
1805                 switch ($mode) {
1806                         case 'case':
1807                                 if (!$this->initCaseFolding($charset)) {
1808                                         return $str;
1809                                 } // do nothing
1810                                 $map =& $this->caseFolding[$charset][$opt];
1811                                 break;
1812
1813                         case 'ascii':
1814                                 if (!$this->initToASCII($charset)) {
1815                                         return $str;
1816                                 } // do nothing
1817                                 $map =& $this->toASCII[$charset];
1818                                 break;
1819
1820                         default:
1821                                 return $str;
1822                 }
1823
1824                 $out = '';
1825                 for ($i = 0; strlen($str{$i}); $i++) {
1826                         $c = $str{$i};
1827                         if (isset($map[$c])) {
1828                                 $out .= $map[$c];
1829                         } else {
1830                                 $out .= $c;
1831                         }
1832                 }
1833
1834                 return $out;
1835         }
1836
1837
1838         /********************************************
1839          *
1840          * Internal UTF-8 string operation functions
1841          *
1842          ********************************************/
1843
1844         /**
1845          * Returns a part of a UTF-8 string.
1846          * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1847          *
1848          * @param       string          UTF-8 string
1849          * @param       integer         Start position (character position)
1850          * @param       integer         Length (in characters)
1851          * @return      string          The substring
1852          * @see substr()
1853          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1854          */
1855         function utf8_substr($str, $start, $len = NULL) {
1856                 if (!strcmp($len, '0')) {
1857                         return '';
1858                 }
1859
1860                 $byte_start = $this->utf8_char2byte_pos($str, $start);
1861                 if ($byte_start === FALSE) {
1862                         if ($start > 0) {
1863                                 return FALSE; // $start outside string length
1864                         } else {
1865                                 $start = 0;
1866                         }
1867                 }
1868
1869                 $str = substr($str, $byte_start);
1870
1871                 if ($len != NULL) {
1872                         $byte_end = $this->utf8_char2byte_pos($str, $len);
1873                         if ($byte_end === FALSE) // $len outside actual string length
1874                         {
1875                                 return $len < 0 ? '' : $str;
1876                         } // When length is less than zero and exceeds, then we return blank string.
1877                         else
1878                         {
1879                                 return substr($str, 0, $byte_end);
1880                         }
1881                 }
1882                 else    {
1883                         return $str;
1884                 }
1885         }
1886
1887         /**
1888          * Counts the number of characters of a string in UTF-8.
1889          * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1890          *
1891          * @param       string          UTF-8 multibyte character string
1892          * @return      integer         The number of characters
1893          * @see strlen()
1894          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1895          */
1896         function utf8_strlen($str) {
1897                 $n = 0;
1898                 for ($i = 0; strlen($str{$i}); $i++) {
1899                         $c = ord($str{$i});
1900                         if (!($c & 0x80)) // single-byte (0xxxxxx)
1901                         {
1902                                 $n++;
1903                         }
1904                         elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
1905                         {
1906                                 $n++;
1907                         }
1908                 }
1909                 return $n;
1910         }
1911
1912         /**
1913          * Truncates a string in UTF-8 short at a given byte length.
1914          *
1915          * @param       string          UTF-8 multibyte character string
1916          * @param       integer         the byte length
1917          * @return      string          the shortened string
1918          * @see mb_strcut()
1919          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1920          */
1921         function utf8_strtrunc($str, $len) {
1922                 $i = $len - 1;
1923                 if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
1924                         for (; $i > 0 && !(ord($str{$i}) & 0x40); $i--) ; // find the first byte
1925                         if ($i <= 0) {
1926                                 return '';
1927                         } // sanity check
1928                         for ($bc = 0, $mbs = ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1) $bc++; // calculate number of bytes
1929                         if ($bc + $i > $len) {
1930                                 return substr($str, 0, $i);
1931                         }
1932                         // fallthru: multibyte char fits into length
1933                 }
1934                 return substr($str, 0, $len);
1935         }
1936
1937         /**
1938          * Find position of first occurrence of a string, both arguments are in UTF-8.
1939          *
1940          * @param       string          UTF-8 string to search in
1941          * @param       string          UTF-8 string to search for
1942          * @param       integer         Positition to start the search
1943          * @return      integer         The character position
1944          * @see strpos()
1945          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1946          */
1947         function utf8_strpos($haystack, $needle, $offset = 0) {
1948                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1949                         return mb_strpos($haystack, $needle, $offset, 'utf-8');
1950                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1951                         return iconv_strpos($haystack, $needle, $offset, 'utf-8');
1952                 }
1953
1954                 $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
1955                 if ($byte_offset === FALSE) {
1956                         return FALSE;
1957                 } // offset beyond string length
1958
1959                 $byte_pos = strpos($haystack, $needle, $byte_offset);
1960                 if ($byte_pos === FALSE) {
1961                         return FALSE;
1962                 } // needle not found
1963
1964                 return $this->utf8_byte2char_pos($haystack, $byte_pos);
1965         }
1966
1967         /**
1968          * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1969          *
1970          * @param       string          UTF-8 string to search in
1971          * @param       string          UTF-8 character to search for (single character)
1972          * @return      integer         The character position
1973          * @see strrpos()
1974          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1975          */
1976         function utf8_strrpos($haystack, $needle) {
1977                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1978                         return mb_strrpos($haystack, $needle, 'utf-8');
1979                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1980                         return iconv_strrpos($haystack, $needle, 'utf-8');
1981                 }
1982
1983                 $byte_pos = strrpos($haystack, $needle);
1984                 if ($byte_pos === FALSE) {
1985                         return FALSE;
1986                 } // needle not found
1987
1988                 return $this->utf8_byte2char_pos($haystack, $byte_pos);
1989         }
1990
1991         /**
1992          * Translates a character position into an 'absolute' byte position.
1993          * Unit tested by Kasper.
1994          *
1995          * @param       string          UTF-8 string
1996          * @param       integer         Character position (negative values start from the end)
1997          * @return      integer         Byte position
1998          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1999          */
2000         function utf8_char2byte_pos($str, $pos) {
2001                 $n = 0; // number of characters found
2002                 $p = abs($pos); // number of characters wanted
2003
2004                 if ($pos >= 0) {
2005                         $i = 0;
2006                         $d = 1;
2007                 } else {
2008                         $i = strlen($str) - 1;
2009                         $d = -1;
2010                 }
2011
2012                 for (; strlen($str{$i}) && $n < $p; $i += $d) {
2013                         $c = (int) ord($str{$i});
2014                         if (!($c & 0x80)) // single-byte (0xxxxxx)
2015                         {
2016                                 $n++;
2017                         }
2018                         elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
2019                         {
2020                                 $n++;
2021                         }
2022                 }
2023                 if (!strlen($str{$i})) {
2024                         return FALSE;
2025                 } // offset beyond string length
2026
2027                 if ($pos >= 0) {
2028                                 // skip trailing multi-byte data bytes
2029                         while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) {
2030                                 $i++;
2031                         }
2032                 } else {
2033                                 // correct offset
2034                         $i++;
2035                 }
2036
2037                 return $i;
2038         }
2039
2040         /**
2041          * Translates an 'absolute' byte position into a character position.
2042          * Unit tested by Kasper.
2043          *
2044          * @param       string          UTF-8 string
2045          * @param       integer         byte position
2046          * @return      integer         character position
2047          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
2048          */
2049         function utf8_byte2char_pos($str, $pos) {
2050                 $n = 0; // number of characters
2051                 for ($i = $pos; $i > 0; $i--) {
2052                         $c = (int) ord($str{$i});
2053                         if (!($c & 0x80)) // single-byte (0xxxxxx)
2054                         {
2055                                 $n++;
2056                         }
2057                         elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
2058                         {
2059                                 $n++;
2060                         }
2061                 }
2062                 if (!strlen($str{$i})) {
2063                         return FALSE;
2064                 } // offset beyond string length
2065
2066                 return $n;
2067         }
2068
2069         /**
2070          * Maps all characters of an UTF-8 string.
2071          *
2072          * @param       string          UTF-8 string
2073          * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2074          * @param       string          'case': conversion 'toLower' or 'toUpper'
2075          * @return      string          the converted string
2076          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
2077          */
2078         function utf8_char_mapping($str, $mode, $opt = '') {
2079                 if (!$this->initUnicodeData($mode)) {
2080                         return $str;
2081                 } // do nothing
2082
2083                 $out = '';
2084                 switch ($mode) {
2085                         case 'case':
2086                                 $map =& $this->caseFolding['utf-8'][$opt];
2087                                 break;
2088
2089                         case 'ascii':
2090                                 $map =& $this->toASCII['utf-8'];
2091                                 break;
2092
2093                         default:
2094                                 return $str;
2095                 }
2096
2097                 for ($i = 0; strlen($str{$i}); $i++) {
2098                         $c = ord($str{$i});
2099                         if (!($c & 0x80)) // single-byte (0xxxxxx)
2100                         {
2101                                 $mbc = $str{$i};
2102                         }
2103                         elseif (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
2104                                 for ($bc = 0; $c & 0x80; $c = $c << 1) {
2105                                         $bc++;
2106                                 } // calculate number of bytes
2107                                 $mbc = substr($str, $i, $bc);
2108                                 $i += $bc - 1;
2109                         }
2110
2111                         if (isset($map[$mbc])) {
2112                                 $out .= $map[$mbc];
2113                         } else {
2114                                 $out .= $mbc;
2115                         }
2116                 }
2117
2118                 return $out;
2119         }
2120
2121
2122         /********************************************
2123          *
2124          * Internal EUC string operation functions
2125          *
2126          * Extended Unix Code:
2127          *  ASCII compatible 7bit single bytes chars
2128          *  8bit two byte chars
2129          *
2130          * Shift-JIS is treated as a special case.
2131          *
2132          ********************************************/
2133
2134         /**
2135          * Cuts a string in the EUC charset family short at a given byte length.
2136          *
2137          * @param       string          EUC multibyte character string
2138          * @param       integer         the byte length
2139          * @param       string          the charset
2140          * @return      string          the shortened string
2141          * @see mb_strcut()
2142          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
2143          */
2144         function euc_strtrunc($str, $len, $charset) {
2145                 $sjis = ($charset == 'shift_jis');
2146                 for ($i = 0; strlen($str{$i}) && $i < $len; $i++) {
2147                         $c = ord($str{$i});
2148                         if ($sjis) {
2149                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2150                                         $i++;
2151                                 } // advance a double-byte char
2152                         }
2153                         else {
2154                                 if ($c >= 0x80) {
2155                                         $i++;
2156                                 } // advance a double-byte char
2157                         }
2158                 }
2159                 if (!strlen($str{$i})) {
2160                         return $str;
2161                 } // string shorter than supplied length
2162
2163                 if ($i > $len) {
2164                         return substr($str, 0, $len - 1); // we ended on a first byte
2165                 } else {
2166                         return substr($str, 0, $len);
2167                 }
2168         }
2169
2170         /**
2171          * Returns a part of a string in the EUC charset family.
2172          *
2173          * @param       string          EUC multibyte character string
2174          * @param       integer         start position (character position)
2175          * @param       string          the charset
2176          * @param       integer         length (in characters)
2177          * @return      string          the substring
2178          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
2179          */
2180         function euc_substr($str, $start, $charset, $len = NULL) {
2181                 $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
2182                 if ($byte_start === FALSE) {
2183                         return FALSE;
2184                 } // $start outside string length
2185
2186                 $str = substr($str, $byte_start);
2187
2188                 if ($len != NULL) {
2189                         $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
2190                         if ($byte_end === FALSE) // $len outside actual string length
2191                         {
2192                                 return $str;
2193                         }
2194                         else
2195                         {
2196                                 return substr($str, 0, $byte_end);
2197                         }
2198                 }
2199                 else    {
2200                         return $str;
2201                 }
2202         }
2203
2204         /**
2205          * Counts the number of characters of a string in the EUC charset family.
2206          *
2207          * @param       string          EUC multibyte character string
2208          * @param       string          the charset
2209          * @return      integer         the number of characters
2210          * @see strlen()
2211          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
2212          */
2213         function euc_strlen($str, $charset) {
2214                 $sjis = ($charset == 'shift_jis');
2215                 $n = 0;
2216                 for ($i = 0; strlen($str{$i}); $i++) {
2217                         $c = ord($str{$i});
2218                         if ($sjis) {
2219                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2220                                         $i++;
2221                                 } // advance a double-byte char
2222                         }
2223                         else {
2224                                 if ($c >= 0x80) {
2225                                         $i++;
2226                                 } // advance a double-byte char
2227                         }
2228
2229                         $n++;
2230                 }
2231
2232                 return $n;
2233         }
2234
2235         /**
2236          * Translates a character position into an 'absolute' byte position.
2237          *
2238          * @param       string          EUC multibyte character string
2239          * @param       integer         character position (negative values start from the end)
2240          * @param       string          the charset
2241          * @return      integer         byte position
2242          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
2243          */
2244         function euc_char2byte_pos($str, $pos, $charset) {
2245                 $sjis = ($charset == 'shift_jis');
2246                 $n = 0; // number of characters seen
2247                 $p = abs($pos); // number of characters wanted
2248
2249                 if ($pos >= 0) {
2250                         $i = 0;
2251                         $d = 1;
2252                 } else {
2253                         $i = strlen($str) - 1;
2254                         $d = -1;
2255                 }
2256
2257                 for (; strlen($str{$i}) && $n < $p; $i += $d) {
2258                         $c = ord($str{$i});
2259                         if ($sjis) {
2260                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2261                                         $i += $d;
2262                                 } // advance a double-byte char
2263                         }
2264                         else {
2265                                 if ($c >= 0x80) {
2266                                         $i += $d;
2267                                 } // advance a double-byte char
2268                         }
2269
2270                         $n++;
2271                 }
2272                 if (!strlen($str{$i})) {
2273                         return FALSE;
2274                 } // offset beyond string length
2275
2276                 if ($pos < 0) {
2277                         $i++;
2278                 } // correct offset
2279
2280                 return $i;
2281         }
2282
2283         /**
2284          * Maps all characters of a string in the EUC charset family.
2285          *
2286          * @param       string          EUC multibyte character string
2287          * @param       string          the charset
2288          * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2289          * @param       string          'case': conversion 'toLower' or 'toUpper'
2290          * @return      string          the converted string
2291          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
2292          */
2293         function euc_char_mapping($str, $charset, $mode, $opt = '') {
2294                 switch ($mode) {
2295                         case 'case':
2296                                 if (!$this->initCaseFolding($charset)) {
2297                                         return $str;
2298                                 } // do nothing
2299                                 $map =& $this->caseFolding[$charset][$opt];
2300                                 break;
2301
2302                         case 'ascii':
2303                                 if (!$this->initToASCII($charset)) {
2304                                         return $str;
2305                                 } // do nothing
2306                                 $map =& $this->toASCII[$charset];
2307                                 break;
2308
2309                         default:
2310                                 return $str;
2311                 }
2312
2313                 $sjis = ($charset == 'shift_jis');
2314                 $out = '';
2315                 for ($i = 0; strlen($str{$i}); $i++) {
2316                         $mbc = $str{$i};
2317                         $c = ord($mbc);
2318
2319                         if ($sjis) {
2320                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
2321                                         $mbc = substr($str, $i, 2);
2322                                         $i++;
2323                                 }
2324                         }
2325                         else {
2326                                 if ($c >= 0x80) { // a double-byte char
2327                                         $mbc = substr($str, $i, 2);
2328                                         $i++;
2329                                 }
2330                         }
2331
2332                         if (isset($map[$mbc])) {
2333                                 $out .= $map[$mbc];
2334                         } else {
2335                                 $out .= $mbc;
2336                         }
2337                 }
2338
2339                 return $out;
2340         }
2341
2342 }
2343
2344 if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
2345         include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2346 }
2347
2348 ?>