lib/typo3/class.t3lib_cs.php

   1 <?php
   2 /***************************************************************
   3  *  Copyright notice
   4  *
   5  *  (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
   6  *  All rights reserved
   7  *
   8  *  This script is part of the Typo3 project. The Typo3 project is
   9  *  free software; you can redistribute it and/or modify
  10  *  it under the terms of the GNU General Public License as published by
  11  *  the Free Software Foundation; either version 2 of the License, or
  12  *  (at your option) any later version.
  13  *
  14  *  The GNU General Public License can be found at
  15  *  http://www.gnu.org/copyleft/gpl.html.
  16  *
  17  *  This script is distributed in the hope that it will be useful,
  18  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  *  GNU General Public License for more details.
  21  *
  22  *  This copyright notice MUST APPEAR in all copies of the script!
  23  ***************************************************************/
  24 /**
  25  * Class for conversion between charsets.
  26  *
  27  * @author      Kasper Skårhøj <kasperYYYY@typo3.com>
  28  * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
  29  */
  30
  31
  32 /**
  33  * Notes on UTF-8
  34  *
  35  * Functions working on UTF-8 strings:
  36  *
  37  * - strchr/strstr
  38  * - strrchr
  39  * - substr_count
  40  * - implode/explode/join
  41  *
  42  * Functions nearly working on UTF-8 strings:
  43  *
  44  * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
  45  * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  46  * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  47  * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
  48  * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier
  49  *
  50  * Functions NOT working on UTF-8 strings:
  51  *
  52  * - str*cmp
  53  * - stristr
  54  * - stripos
  55  * - substr
  56  * - strrev
  57  * - split/spliti
  58  * - ...
  59  *
  60  */
  61 /**
  62  * Class for conversion between charsets
  63  *
  64  * @author      Kasper Skårhøj <kasperYYYY@typo3.com>
  65  * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
  66  * @package TYPO3
  67  * @subpackage t3lib
  68  */
  69 class t3lib_cs {
  70
  71         /**
  72          * @var t3lib_l10n_Locales
  73          */
  74         protected $locales;
  75
  76         var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
  77
  78                 // This is the array where parsed conversion tables are stored (cached)
  79         var $parsedCharsets = array();
  80
  81                 // An array where case folding data will be stored (cached)
  82         var $caseFolding = array();
  83
  84                 // An array where charset-to-ASCII mappings are stored (cached)
  85         var $toASCII = array();
  86
  87                 // This tells the converter which charsets has two bytes per char:
  88         var $twoByteSets = array(
  89                 'ucs-2' => 1, // 2-byte Unicode
  90         );
  91
  92                 // This tells the converter which charsets has four bytes per char:
  93         var $fourByteSets = array(
  94                 'ucs-4' => 1, // 4-byte Unicode
  95                 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
  96         );
  97
  98                 // This tells the converter which charsets use a scheme like the Extended Unix Code:
  99         var $eucBasedSets = array(
 100                 'gb2312' => 1, // Chinese, simplified.
 101                 'big5' => 1, // Chinese, traditional.
 102                 'euc-kr' => 1, // Korean
 103                 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
 104         );
 105
 106                 // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 107                 // http://czyborra.com/charsets/iso8859.html
 108         var $synonyms = array(
 109                 'us' => 'ascii',
 110                 'us-ascii' => 'ascii',
 111                 'cp819' => 'iso-8859-1',
 112                 'ibm819' => 'iso-8859-1',
 113                 'iso-ir-100' => 'iso-8859-1',
 114                 'iso-ir-101' => 'iso-8859-2',
 115                 'iso-ir-109' => 'iso-8859-3',
 116                 'iso-ir-110' => 'iso-8859-4',
 117                 'iso-ir-144' => 'iso-8859-5',
 118                 'iso-ir-127' => 'iso-8859-6',
 119                 'iso-ir-126' => 'iso-8859-7',
 120                 'iso-ir-138' => 'iso-8859-8',
 121                 'iso-ir-148' => 'iso-8859-9',
 122                 'iso-ir-157' => 'iso-8859-10',
 123                 'iso-ir-179' => 'iso-8859-13',
 124                 'iso-ir-199' => 'iso-8859-14',
 125                 'iso-ir-203' => 'iso-8859-15',
 126                 'csisolatin1' => 'iso-8859-1',
 127                 'csisolatin2' => 'iso-8859-2',
 128                 'csisolatin3' => 'iso-8859-3',
 129                 'csisolatin5' => 'iso-8859-9',
 130                 'csisolatin8' => 'iso-8859-14',
 131                 'csisolatin9' => 'iso-8859-15',
 132                 'csisolatingreek' => 'iso-8859-7',
 133                 'iso-celtic' => 'iso-8859-14',
 134                 'latin1' => 'iso-8859-1',
 135                 'latin2' => 'iso-8859-2',
 136                 'latin3' => 'iso-8859-3',
 137                 'latin5' => 'iso-8859-9',
 138                 'latin6' => 'iso-8859-10',
 139                 'latin8' => 'iso-8859-14',
 140                 'latin9' => 'iso-8859-15',
 141                 'l1' => 'iso-8859-1',
 142                 'l2' => 'iso-8859-2',
 143                 'l3' => 'iso-8859-3',
 144                 'l5' => 'iso-8859-9',
 145                 'l6' => 'iso-8859-10',
 146                 'l8' => 'iso-8859-14',
 147                 'l9' => 'iso-8859-15',
 148                 'cyrillic' => 'iso-8859-5',
 149                 'arabic' => 'iso-8859-6',
 150                 'tis-620' => 'iso-8859-11',
 151                 'win874' => 'windows-874',
 152                 'win1250' => 'windows-1250',
 153                 'win1251' => 'windows-1251',
 154                 'win1252' => 'windows-1252',
 155                 'win1253' => 'windows-1253',
 156                 'win1254' => 'windows-1254',
 157                 'win1255' => 'windows-1255',
 158                 'win1256' => 'windows-1256',
 159                 'win1257' => 'windows-1257',
 160                 'win1258' => 'windows-1258',
 161                 'cp1250' => 'windows-1250',
 162                 'cp1251' => 'windows-1251',
 163                 'cp1252' => 'windows-1252',
 164                 'ms-ee' => 'windows-1250',
 165                 'ms-ansi' => 'windows-1252',
 166                 'ms-greek' => 'windows-1253',
 167                 'ms-turk' => 'windows-1254',
 168                 'winbaltrim' => 'windows-1257',
 169                 'koi-8ru' => 'koi-8r',
 170                 'koi8r' => 'koi-8r',
 171                 'cp878' => 'koi-8r',
 172                 'mac' => 'macroman',
 173                 'macintosh' => 'macroman',
 174                 'euc-cn' => 'gb2312',
 175                 'x-euc-cn' => 'gb2312',
 176                 'euccn' => 'gb2312',
 177                 'cp936' => 'gb2312',
 178                 'big-5' => 'big5',
 179                 'cp950' => 'big5',
 180                 'eucjp' => 'euc-jp',
 181                 'sjis' => 'shift_jis',
 182                 'shift-jis' => 'shift_jis',
 183                 'cp932' => 'shift_jis',
 184                 'cp949' => 'euc-kr',
 185                 'utf7' => 'utf-7',
 186                 'utf8' => 'utf-8',
 187                 'utf16' => 'utf-16',
 188                 'utf32' => 'utf-32',
 189                 'utf8' => 'utf-8',
 190                 'ucs2' => 'ucs-2',
 191                 'ucs4' => 'ucs-4',
 192         );
 193
 194                 // mapping of iso-639-1 language codes to script names
 195         var $lang_to_script = array(
 196                         // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
 197                 'af' => 'west_european', //Afrikaans
 198                 'ar' => 'arabic',
 199                 'bg' => 'cyrillic', // Bulgarian
 200                 'bs' => 'east_european', // Bosnian
 201                 'cs' => 'east_european', // Czech
 202                 'da' => 'west_european', // Danish
 203                 'de' => 'west_european', // German
 204                 'es' => 'west_european', // Spanish
 205                 'et' => 'estonian',
 206                 'eo' => 'unicode', // Esperanto
 207                 'eu' => 'west_european', // Basque
 208                 'fa' => 'arabic', // Persian
 209                 'fi' => 'west_european', // Finish
 210                 'fo' => 'west_european', // Faroese
 211                 'fr' => 'west_european', // French
 212                 'ga' => 'west_european', // Irish
 213                 'gl' => 'west_european', // Galician
 214                 'gr' => 'greek',
 215                 'he' => 'hebrew', // Hebrew (since 1998)
 216                 'hi' => 'unicode', // Hindi
 217                 'hr' => 'east_european', // Croatian
 218                 'hu' => 'east_european', // Hungarian
 219                 'iw' => 'hebrew', // Hebrew (til 1998)
 220                 'is' => 'west_european', // Icelandic
 221                 'it' => 'west_european', // Italian
 222                 'ja' => 'japanese',
 223                 'ka' => 'unicode', // Georgian
 224                 'kl' => 'west_european', // Greenlandic
 225                 'km' => 'unicode', // Khmer
 226                 'ko' => 'korean',
 227                 'lt' => 'lithuanian',
 228                 'lv' => 'west_european', // Latvian/Lettish
 229                 'nl' => 'west_european', // Dutch
 230                 'no' => 'west_european', // Norwegian
 231                 'nb' => 'west_european', // Norwegian Bokmal
 232                 'nn' => 'west_european', // Norwegian Nynorsk
 233                 'pl' => 'east_european', // Polish
 234                 'pt' => 'west_european', // Portuguese
 235                 'ro' => 'east_european', // Romanian
 236                 'ru' => 'cyrillic', // Russian
 237                 'sk' => 'east_european', // Slovak
 238                 'sl' => 'east_european', // Slovenian
 239                 'sr' => 'cyrillic', // Serbian
 240                 'sv' => 'west_european', // Swedish
 241                 'sq' => 'albanian', // Albanian
 242                 'th' => 'thai',
 243                 'uk' => 'cyrillic', // Ukranian
 244                 'vi' => 'vietnamese',
 245                 'zh' => 'chinese',
 246                         // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
 247                         // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
 248                 'afk'=> 'west_european', // Afrikaans
 249                 'ara' => 'arabic',
 250                 'bgr' => 'cyrillic', // Bulgarian
 251                 'cat' => 'west_european', // Catalan
 252                 'chs' => 'simpl_chinese',
 253                 'cht' => 'trad_chinese',
 254                 'csy' => 'east_european', // Czech
 255                 'dan' => 'west_european', // Danisch
 256                 'deu' => 'west_european', // German
 257                 'dea' => 'west_european', // German (Austrian)
 258                 'des' => 'west_european', // German (Swiss)
 259                 'ena' => 'west_european', // English (Australian)
 260                 'enc' => 'west_european', // English (Canadian)
 261                 'eng' => 'west_european', // English
 262                 'enz' => 'west_european', // English (New Zealand)
 263                 'enu' => 'west_european', // English (United States)
 264                 'euq' => 'west_european', // Basque
 265                 'fos' => 'west_european', // Faroese
 266                 'far' => 'arabic', // Persian
 267                 'fin' => 'west_european', // Finish
 268                 'fra' => 'west_european', // French
 269                 'frb' => 'west_european', // French (Belgian)
 270                 'frc' => 'west_european', // French (Canadian)
 271                 'frs' => 'west_european', // French (Swiss)
 272                 'geo' => 'unicode', // Georgian
 273                 'glg' => 'west_european', // Galician
 274                 'ell' => 'greek',
 275                 'heb' => 'hebrew',
 276                 'hin' => 'unicode', // Hindi
 277                 'hun' => 'east_european', // Hungarian
 278                 'isl' => 'west_euorpean', // Icelandic
 279                 'ita' => 'west_european', // Italian
 280                 'its' => 'west_european', // Italian (Swiss)
 281                 'jpn' => 'japanese',
 282                 'khm' => 'unicode', // Khmer
 283                 'kor' => 'korean',
 284                 'lth' => 'lithuanian',
 285                 'lvi' => 'west_european', // Latvian/Lettish
 286                 'msl' => 'west_european', // Malay
 287                 'nlb' => 'west_european', // Dutch (Belgian)
 288                 'nld' => 'west_european', // Dutch
 289                 'nor' => 'west_european', // Norwegian (bokmal)
 290                 'non' => 'west_european', // Norwegian (nynorsk)
 291                 'plk' => 'east_european', // Polish
 292                 'ptg' => 'west_european', // Portuguese
 293                 'ptb' => 'west_european', // Portuguese (Brazil)
 294                 'rom' => 'east_european', // Romanian
 295                 'rus' => 'cyrillic', // Russian
 296                 'slv' => 'east_european', // Slovenian
 297                 'sky' => 'east_european', // Slovak
 298                 'srl' => 'east_european', // Serbian (Latin)
 299                 'srb' => 'cyrillic', // Serbian (Cyrillic)
 300                 'esp' => 'west_european', // Spanish (trad. sort)
 301                 'esm' => 'west_european', // Spanish (Mexican)
 302                 'esn' => 'west_european', // Spanish (internat. sort)
 303                 'sve' => 'west_european', // Swedish
 304                 'sqi' => 'albanian', // Albanian
 305                 'tha' => 'thai',
 306                 'trk' => 'turkish',
 307                 'ukr' => 'cyrillic', // Ukrainian
 308                         // English language names
 309                 'afrikaans' => 'west_european',
 310                 'albanian' => 'albanian',
 311                 'arabic' => 'arabic',
 312                 'basque' => 'west_european',
 313                 'bosnian' => 'east_european',
 314                 'bulgarian' => 'east_european',
 315                 'catalan' => 'west_european',
 316                 'croatian' => 'east_european',
 317                 'czech' => 'east_european',
 318                 'danish' => 'west_european',
 319                 'dutch' => 'west_european',
 320                 'english' => 'west_european',
 321                 'esperanto' => 'unicode',
 322                 'estonian' => 'estonian',
 323                 'faroese' => 'west_european',
 324                 'farsi' => 'arabic',
 325                 'finnish' => 'west_european',
 326                 'french' => 'west_european',
 327                 'galician' => 'west_european',
 328                 'georgian' => 'unicode',
 329                 'german' => 'west_european',
 330                 'greek' => 'greek',
 331                 'greenlandic' => 'west_european',
 332                 'hebrew' => 'hebrew',
 333                 'hindi' => 'unicode',
 334                 'hungarian' => 'east_european',
 335                 'icelandic' => 'west_european',
 336                 'italian' => 'west_european',
 337                 'khmer' => 'unicode',
 338                 'latvian' => 'west_european',
 339                 'lettish' => 'west_european',
 340                 'lithuanian' => 'lithuanian',
 341                 'malay' => 'west_european',
 342                 'norwegian' => 'west_european',
 343                 'persian' => 'arabic',
 344                 'polish' => 'east_european',
 345                 'portuguese' => 'west_european',
 346                 'russian' => 'cyrillic',
 347                 'romanian' => 'east_european',
 348                 'serbian' => 'cyrillic',
 349                 'slovak' => 'east_european',
 350                 'slovenian' => 'east_european',
 351                 'spanish' => 'west_european',
 352                 'svedish' => 'west_european',
 353                 'that' => 'thai',
 354                 'turkish' => 'turkish',
 355                 'ukrainian' => 'cyrillic',
 356         );
 357
 358                 // mapping of language (family) names to charsets on Unix
 359         var $script_to_charset_unix = array(
 360                 'west_european' => 'iso-8859-1',
 361                 'estonian' => 'iso-8859-1',
 362                 'east_european' => 'iso-8859-2',
 363                 'baltic' => 'iso-8859-4',
 364                 'cyrillic' => 'iso-8859-5',
 365                 'arabic' => 'iso-8859-6',
 366                 'greek' => 'iso-8859-7',
 367                 'hebrew' => 'iso-8859-8',
 368                 'turkish' => 'iso-8859-9',
 369                 'thai' => 'iso-8859-11', // = TIS-620
 370                 'lithuanian' => 'iso-8859-13',
 371                 'chinese' => 'gb2312', // = euc-cn
 372                 'japanese' => 'euc-jp',
 373                 'korean' => 'euc-kr',
 374                 'simpl_chinese' => 'gb2312',
 375                 'trad_chinese' => 'big5',
 376                 'vietnamese' => '',
 377                 'unicode' => 'utf-8',
 378                 'albanian' => 'utf-8'
 379         );
 380
 381                 // mapping of language (family) names to charsets on Windows
 382         var $script_to_charset_windows = array(
 383                 'east_european' => 'windows-1250',
 384                 'cyrillic' => 'windows-1251',
 385                 'west_european' => 'windows-1252',
 386                 'greek' => 'windows-1253',
 387                 'turkish' => 'windows-1254',
 388                 'hebrew' => 'windows-1255',
 389                 'arabic' => 'windows-1256',
 390                 'baltic' => 'windows-1257',
 391                 'estonian' => 'windows-1257',
 392                 'lithuanian' => 'windows-1257',
 393                 'vietnamese' => 'windows-1258',
 394                 'thai' => 'cp874',
 395                 'korean' => 'cp949',
 396                 'chinese' => 'gb2312',
 397                 'japanese' => 'shift_jis',
 398                 'simpl_chinese' => 'gb2312',
 399                 'trad_chinese' => 'big5',
 400                 'albanian' => 'windows-1250',
 401                 'unicode' => 'utf-8'
 402         );
 403
 404                 // mapping of locale names to charsets
 405         var $locale_to_charset = array(
 406                 'japanese.euc' => 'euc-jp',
 407                 'ja_jp.ujis' => 'euc-jp',
 408                 'korean.euc' => 'euc-kr',
 409                 'sr@Latn' => 'iso-8859-2',
 410                 'zh_cn' => 'gb2312',
 411                 'zh_hk' => 'big5',
 412                 'zh_tw' => 'big5',
 413         );
 414
 415                 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
 416                 // Empty values means "iso-8859-1"
 417         var $charSetArray = array(
 418                 'af' => '',
 419                 'ar' => 'iso-8859-6',
 420                 'ba' => 'iso-8859-2',
 421                 'bg' => 'windows-1251',
 422                 'br' => '',
 423                 'ca' => 'iso-8859-15',
 424                 'ch' => 'gb2312',
 425                 'cs' => 'windows-1250',
 426                 'cz' => 'windows-1250',
 427                 'da' => '',
 428                 'de' => '',
 429                 'dk' => '',
 430                 'el' => 'iso-8859-7',
 431                 'eo' => 'utf-8',
 432                 'es' => '',
 433                 'et' => 'iso-8859-4',
 434                 'eu' => '',
 435                 'fa' => 'utf-8',
 436                 'fi' => '',
 437                 'fo' => 'utf-8',
 438                 'fr' => '',
 439                 'fr_CA' => '',
 440                 'ga' => '',
 441                 'ge' => 'utf-8',
 442                 'gl' => '',
 443                 'gr' => 'iso-8859-7',
 444                 'he' => 'utf-8',
 445                 'hi' => 'utf-8',
 446                 'hk' => 'big5',
 447                 'hr' => 'windows-1250',
 448                 'hu' => 'iso-8859-2',
 449                 'is' => 'utf-8',
 450                 'it' => '',
 451                 'ja' => 'shift_jis',
 452                 'jp' => 'shift_jis',
 453                 'ka' => 'utf-8',
 454                 'kl' => 'utf-8',
 455                 'km' => 'utf-8',
 456                 'ko' => 'euc-kr',
 457                 'kr' => 'euc-kr',
 458                 'lt' => 'windows-1257',
 459                 'lv' => 'utf-8',
 460                 'ms' => '',
 461                 'my' => '',
 462                 'nl' => '',
 463                 'no' => '',
 464                 'pl' => 'iso-8859-2',
 465                 'pt' => '',
 466                 'pt_BR' => '',
 467                 'qc' => '',
 468                 'ro' => 'iso-8859-2',
 469                 'ru' => 'windows-1251',
 470                 'se' => '',
 471                 'si' => 'windows-1250',
 472                 'sk' => 'windows-1250',
 473                 'sl' => 'windows-1250',
 474                 'sq' => 'utf-8',
 475                 'sr' => 'utf-8',
 476                 'sv' => '',
 477                 'th' => 'iso-8859-11',
 478                 'tr' => 'iso-8859-9',
 479                 'ua' => 'windows-1251',
 480                 'uk' => 'windows-1251',
 481                 'vi' => 'utf-8',
 482                 'vn' => 'utf-8',
 483                 'zh' => 'big5',
 484         );
 485
 486                 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
 487                 // Missing keys means: same as TYPO3
 488                 // @deprecated since TYPO3 4.6, will be removed in TYPO3 6.0 - use t3lib_l10n_Locales::getIsoMapping()
 489         var $isoArray = array(
 490                 'ba' => 'bs',
 491                 'br' => 'pt_BR',
 492                 'ch' => 'zh_CN',
 493                 'cz' => 'cs',
 494                 'dk' => 'da',
 495                 'si' => 'sl',
 496                 'se' => 'sv',
 497                 'gl' => 'kl',
 498                 'gr' => 'el',
 499                 'hk' => 'zh_HK',
 500                 'kr' => 'ko',
 501                 'ua' => 'uk',
 502                 'jp' => 'ja',
 503                 'qc' => 'fr_CA',
 504                 'vn' => 'vi',
 505                 'ge' => 'ka',
 506                 'ga' => 'gl',
 507         );
 508
 509         /**
 510          * Default constructor.
 511          */
 512         public function __construct() {
 513                 $this->locales = t3lib_div::makeInstance('t3lib_l10n_Locales');
 514         }
 515
 516         /**
 517          * Normalize - changes input character set to lowercase letters.
 518          *
 519          * @param       string          Input charset
 520          * @return      string          Normalized charset
 521          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 522          */
 523         function parse_charset($charset) {
 524                 $charset = trim(strtolower($charset));
 525                 if (isset($this->synonyms[$charset])) {
 526                         $charset = $this->synonyms[$charset];
 527                 }
 528
 529                 return $charset;
 530         }
 531
 532         /**
 533          * Get the charset of a locale.
 534          *
 535          * ln                   language
 536          * ln_CN                 language / country
 537          * ln_CN.cs       language / country / charset
 538          * ln_CN.cs@mod  language / country / charset / modifier
 539          *
 540          * @param       string          Locale string
 541          * @return      string          Charset resolved for locale string
 542          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 543          */
 544         function get_locale_charset($locale) {
 545                 $locale = strtolower($locale);
 546
 547                         // exact locale specific charset?
 548                 if (isset($this->locale_to_charset[$locale])) {
 549                         return $this->locale_to_charset[$locale];
 550                 }
 551
 552                         // get modifier
 553                 list($locale, $modifier) = explode('@', $locale);
 554
 555                         // locale contains charset: use it
 556                 list($locale, $charset) = explode('.', $locale);
 557                 if ($charset) {
 558                         return $this->parse_charset($charset);
 559                 }
 560
 561                         // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
 562                 if ($modifier == 'euro') {
 563                         return 'iso-8859-15';
 564                 }
 565
 566                         // get language
 567                 list($language, $country) = explode('_', $locale);
 568                 if (isset($this->lang_to_script[$language])) {
 569                         $script = $this->lang_to_script[$language];
 570                 }
 571
 572                 if (TYPO3_OS == 'WIN') {
 573                         $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
 574                 } else {
 575                         $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'utf-8';
 576                 }
 577
 578                 return $cs;
 579         }
 580
 581
 582         /********************************************
 583          *
 584          * Charset Conversion functions
 585          *
 586          ********************************************/
 587
 588         /**
 589          * Convert from one charset to another charset.
 590          *
 591          * @param       string          Input string
 592          * @param       string          From charset (the current charset of the string)
 593          * @param       string          To charset (the output charset wanted)
 594          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 595          * @return      string          Converted string
 596          * @see convArray()
 597          */
 598         function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
 599                 if ($fromCS == $toCS) {
 600                         return $str;
 601                 }
 602
 603                         // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
 604                 if ($toCS == 'utf-8' || !$useEntityForNoChar) {
 605                         switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
 606                                 case 'mbstring':
 607                                         $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
 608                                         if (FALSE !== $conv_str) {
 609                                                 return $conv_str;
 610                                         } // returns FALSE for unsupported charsets
 611                                         break;
 612
 613                                 case 'iconv':
 614                                         $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
 615                                         if (FALSE !== $conv_str) {
 616                                                 return $conv_str;
 617                                         }
 618                                         break;
 619
 620                                 case 'recode':
 621                                         $conv_str = recode_string($fromCS . '..' . $toCS, $str);
 622                                         if (FALSE !== $conv_str) {
 623                                                 return $conv_str;
 624                                         }
 625                                         break;
 626                         }
 627                         // fallback to TYPO3 conversion
 628                 }
 629
 630                 if ($fromCS != 'utf-8') {
 631                         $str = $this->utf8_encode($str, $fromCS);
 632                 }
 633                 if ($toCS != 'utf-8') {
 634                         $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
 635                 }
 636                 return $str;
 637         }
 638
 639         /**
 640          * Convert all elements in ARRAY with type string from one charset to another charset.
 641          * NOTICE: Array is passed by reference!
 642          *
 643          * @param       string          Input array, possibly multidimensional
 644          * @param       string          From charset (the current charset of the string)
 645          * @param       string          To charset (the output charset wanted)
 646          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 647          * @return      void
 648          * @see conv()
 649          */
 650         function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
 651                 foreach ($array as $key => $value) {
 652                         if (is_array($array[$key])) {
 653                                 $this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
 654                         } elseif (is_string($array[$key])) {
 655                                 $array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
 656                         }
 657                 }
 658         }
 659
 660         /**
 661          * Converts $str from $charset to UTF-8
 662          *
 663          * @param       string          String in local charset to convert to UTF-8
 664          * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
 665          * @return      string          Output string, converted to UTF-8
 666          */
 667         function utf8_encode($str, $charset) {
 668
 669                 if ($charset === 'utf-8') {
 670                         return $str;
 671                 }
 672
 673                         // Charset is case-insensitive.
 674                 if ($this->initCharset($charset)) { // Parse conv. table if not already...
 675                         $strLen = strlen($str);
 676                         $outStr = '';
 677
 678                         for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
 679                                 $chr = substr($str, $a, 1);
 680                                 $ord = ord($chr);
 681                                 if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
 682                                         $ord2 = ord($str[$a + 1]);
 683                                         $ord = $ord << 8 | $ord2; // assume big endian
 684
 685                                         if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 686                                                 $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
 687                                         } else {
 688                                                 $outStr .= chr($this->noCharByteVal);
 689                                         } // No char exists
 690                                         $a++;
 691                                 } elseif ($ord > 127) { // If char has value over 127 it's a multibyte char in UTF-8
 692                                         if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
 693                                                 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
 694                                                         $a++;
 695                                                         $ord2 = ord(substr($str, $a, 1));
 696                                                         $ord = $ord * 256 + $ord2;
 697                                                 }
 698                                         }
 699
 700                                         if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 701                                                 $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
 702                                         } else {
 703                                                 $outStr .= chr($this->noCharByteVal);
 704                                         } // No char exists
 705                                 } else {
 706                                         $outStr .= $chr;
 707                                 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 708                         }
 709                         return $outStr;
 710                 }
 711         }
 712
 713         /**
 714          * Converts $str from UTF-8 to $charset
 715          *
 716          * @param       string          String in UTF-8 to convert to local charset
 717          * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
 718          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 719          * @return      string          Output string, converted to local charset
 720          */
 721         function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
 722
 723                 if ($charset === 'utf-8') {
 724                         return $str;
 725                 }
 726
 727                         // Charset is case-insensitive.
 728                 if ($this->initCharset($charset)) { // Parse conv. table if not already...
 729                         $strLen = strlen($str);
 730                         $outStr = '';
 731                         $buf = '';
 732                         for ($a = 0, $i = 0; $a < $strLen; $a++, $i++) { // Traverse each char in UTF-8 string.
 733                                 $chr = substr($str, $a, 1);
 734                                 $ord = ord($chr);
 735                                 if ($ord > 127) { // This means multibyte! (first byte!)
 736                                         if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 737
 738                                                 $buf = $chr; // Add first byte
 739                                                 for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
 740                                                         $ord = $ord << 1; // Shift it left and ...
 741                                                         if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 742                                                                 $a++; // Increase pointer...
 743                                                                 $buf .= substr($str, $a, 1); // ... and add the next char.
 744                                                         } else {
 745                                                                 break;
 746                                                         }
 747                                                 }
 748
 749                                                 if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
 750                                                         $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
 751                                                         if ($mByte > 255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
 752                                                                 $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
 753                                                         } else {
 754                                                                 $outStr .= chr($mByte);
 755                                                         }
 756                                                 } elseif ($useEntityForNoChar) { // Create num entity:
 757                                                         $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
 758                                                 } else {
 759                                                         $outStr .= chr($this->noCharByteVal);
 760                                                 } // No char exists
 761                                         } else {
 762                                                 $outStr .= chr($this->noCharByteVal);
 763                                         } // No char exists (MIDDLE of MB sequence!)
 764                                 } else {
 765                                         $outStr .= $chr;
 766                                 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 767                         }
 768                         return $outStr;
 769                 }
 770         }
 771
 772         /**
 773          * Converts all chars > 127 to numeric entities.
 774          *
 775          * @param       string          Input string
 776          * @return      string          Output string
 777          */
 778         function utf8_to_entities($str) {
 779                 $strLen = strlen($str);
 780                 $outStr = '';
 781                 $buf = '';
 782                 for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
 783                         $chr = substr($str, $a, 1);
 784                         $ord = ord($chr);
 785                         if ($ord > 127) { // This means multibyte! (first byte!)
 786                                 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 787                                         $buf = $chr; // Add first byte
 788                                         for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
 789                                                 $ord = $ord << 1; // Shift it left and ...
 790                                                 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 791                                                         $a++; // Increase pointer...
 792                                                         $buf .= substr($str, $a, 1); // ... and add the next char.
 793                                                 } else {
 794                                                         break;
 795                                                 }
 796                                         }
 797
 798                                         $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
 799                                 } else {
 800                                         $outStr .= chr($this->noCharByteVal);
 801                                 } // No char exists (MIDDLE of MB sequence!)
 802                         } else {
 803                                 $outStr .= $chr;
 804                         } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 805                 }
 806
 807                 return $outStr;
 808         }
 809
 810         /**
 811          * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 812          *
 813          * @param       string          Input string, UTF-8
 814          * @param       boolean         If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 815          * @return      string          Output string
 816          */
 817         function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
 818                 // Workaround for #39287: 3rd parameter for get_html_translation_table() was only added in PHP 5.3.4 and later
 819                 // see http://php.net/manual/en/function.get-html-translation-table.php
 820                 $applyPhpCompatibilityFix = version_compare(phpversion(), '5.3.4', '<');
 821
 822                 if ($alsoStdHtmlEnt) {
 823                         if ($applyPhpCompatibilityFix === TRUE) {
 824                                 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
 825                         } else {
 826                                 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
 827                         }
 828                 }
 829
 830                 $token = md5(microtime());
 831                 $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
 832                 foreach ($parts as $k => $v) {
 833                                 // only take every second element
 834                         if ($k % 2 === 0) {
 835                                 continue;
 836                         }
 837
 838                         $position = 0;
 839                         if (substr($v, $position, 1) == '#') { // Dec or hex entities:
 840                                 $position++;
 841                                 if (substr($v, $position, 1) == 'x') {
 842                                         $v = hexdec(substr($v, ++$position));
 843                                 } else {
 844                                         $v = substr($v, $position);
 845                                 }
 846                                 $parts[$k] = $this->UnumberToChar($v);
 847                         } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
 848                                 $v = $trans_tbl['&' . $v . ';'];
 849                                 if ($applyPhpCompatibilityFix === TRUE) {
 850                                         $v = $this->utf8_encode($v, 'iso-8859-1');
 851                                 }
 852                                 $parts[$k] = $v;
 853                         } else { // No conversion:
 854                                 $parts[$k] = '&' . $v . ';';
 855                         }
 856                 }
 857
 858                 return implode('', $parts);
 859         }
 860
 861         /**
 862          * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 863          *
 864          * @param       string          Input string, UTF-8
 865          * @param       boolean         If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 866          * @param       boolean         If set, then instead of integer numbers the real UTF-8 char is returned.
 867          * @return      array           Output array with the char numbers
 868          */
 869         function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
 870                         // If entities must be registered as well...:
 871                 if ($convEntities) {
 872                         $str = $this->entities_to_utf8($str, 1);
 873                 }
 874                         // Do conversion:
 875                 $strLen = strlen($str);
 876                 $outArr = array();
 877                 $buf = '';
 878                 for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
 879                         $chr = substr($str, $a, 1);
 880                         $ord = ord($chr);
 881                         if ($ord > 127) { // This means multibyte! (first byte!)
 882                                 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 883                                         $buf = $chr; // Add first byte
 884                                         for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
 885                                                 $ord = $ord << 1; // Shift it left and ...
 886                                                 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 887                                                         $a++; // Increase pointer...
 888                                                         $buf .= substr($str, $a, 1); // ... and add the next char.
 889                                                 } else {
 890                                                         break;
 891                                                 }
 892                                         }
 893
 894                                         $outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
 895                                 } else {
 896                                         $outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
 897                                 } // No char exists (MIDDLE of MB sequence!)
 898                         } else {
 899                                 $outArr[] = $retChar ? chr($ord) : $ord;
 900                         } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 901                 }
 902
 903                 return $outArr;
 904         }
 905
 906         /**
 907          * Converts a UNICODE number to a UTF-8 multibyte character
 908          * Algorithm based on script found at From: http://czyborra.com/utf/
 909          * Unit-tested by Kasper
 910          *
 911          * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 912          *
 913          *  bytes | bits | representation
 914          *        1 |   7 | 0vvvvvvv
 915          *        2 |   11 | 110vvvvv 10vvvvvv
 916          *        3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 917          *        4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 918          *        5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 919          *        6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 920          *
 921          * @param       integer         UNICODE integer
 922          * @return      string          UTF-8 multibyte character string
 923          * @see utf8CharToUnumber()
 924          */
 925         function UnumberToChar($cbyte) {
 926                 $str = '';
 927
 928                 if ($cbyte < 0x80) {
 929                         $str .= chr($cbyte);
 930                 } else {
 931                         if ($cbyte < 0x800) {
 932                                 $str .= chr(0xC0 | ($cbyte >> 6));
 933                                 $str .= chr(0x80 | ($cbyte & 0x3F));
 934                         } else {
 935                                 if ($cbyte < 0x10000) {
 936                                         $str .= chr(0xE0 | ($cbyte >> 12));
 937                                         $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
 938                                         $str .= chr(0x80 | ($cbyte & 0x3F));
 939                                 } else {
 940                                         if ($cbyte < 0x200000) {
 941                                                 $str .= chr(0xF0 | ($cbyte >> 18));
 942                                                 $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
 943                                                 $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
 944                                                 $str .= chr(0x80 | ($cbyte & 0x3F));
 945                                         } else {
 946                                                 if ($cbyte < 0x4000000) {
 947                                                         $str .= chr(0xF8 | ($cbyte >> 24));
 948                                                         $str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
 949                                                         $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
 950                                                         $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
 951                                                         $str .= chr(0x80 | ($cbyte & 0x3F));
 952                                                 } else {
 953                                                         if ($cbyte < 0x80000000) {
 954                                                                 $str .= chr(0xFC | ($cbyte >> 30));
 955                                                                 $str .= chr(0x80 | (($cbyte >> 24) & 0x3F));
 956                                                                 $str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
 957                                                                 $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
 958                                                                 $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
 959                                                                 $str .= chr(0x80 | ($cbyte & 0x3F));
 960                                                         } else { // Cannot express a 32-bit character in UTF-8
 961                                                                 $str .= chr($this->noCharByteVal);
 962                                                         }
 963                                                 }
 964                                         }
 965                                 }
 966                         }
 967                 }
 968                 return $str;
 969         }
 970
 971         /**
 972          * Converts a UTF-8 Multibyte character to a UNICODE number
 973          * Unit-tested by Kasper
 974          *
 975          * @param       string          UTF-8 multibyte character string
 976          * @param       boolean         If set, then a hex. number is returned.
 977          * @return      integer         UNICODE integer
 978          * @see UnumberToChar()
 979          */
 980         function utf8CharToUnumber($str, $hex = 0) {
 981                 $ord = ord(substr($str, 0, 1)); // First char
 982
 983                 if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string
 984                         $binBuf = '';
 985                         for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
 986                                 $ord = $ord << 1; // Shift it left and ...
 987                                 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 988                                         $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
 989                                 } else {
 990                                         break;
 991                                 }
 992                         }
 993                         $binBuf = substr('00000000' . decbin(ord(substr($str, 0, 1))), -(6 - $b)) . $binBuf;
 994
 995                         $int = bindec($binBuf);
 996                 } else {
 997                         $int = $ord;
 998                 }
 999
1000                 return $hex ? 'x' . dechex($int) : $int;
1001         }
1002
1003
1004         /********************************************
1005          *
1006          * Init functions
1007          *
1008          ********************************************/
1009
1010         /**
1011          * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
1012          * This function is automatically called by the conversion functions
1013          *
1014          * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
1015          *
1016          * @param       string          The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
1017          * @return      integer         Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
1018          * @access private
1019          */
1020         function initCharset($charset) {
1021                         // Only process if the charset is not yet loaded:
1022                 if (!is_array($this->parsedCharsets[$charset])) {
1023
1024                                 // Conversion table filename:
1025                         $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
1026
1027                                 // If the conversion table is found:
1028                         if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) {
1029                                         // Cache file for charsets:
1030                                         // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
1031                                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
1032                                 if ($cacheFile && @is_file($cacheFile)) {
1033                                         $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1034                                 } else {
1035                                                 // Parse conversion table into lines:
1036                                         $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
1037                                                 // Initialize the internal variable holding the conv. table:
1038                                         $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
1039                                                 // traverse the lines:
1040                                         $detectedType = '';
1041                                         foreach ($lines as $value) {
1042                                                 if (trim($value) && substr($value, 0, 1) != '#') { // Comment line or blanks are ignored.
1043
1044                                                                 // Detect type if not done yet: (Done on first real line)
1045                                                                 // The "whitespaced" type is on the syntax      "0x0A   0x000A  #LINE FEED"     while   "ms-token" is like              "B9 = U+00B9 : SUPERSCRIPT ONE"
1046                                                         if (!$detectedType) {
1047                                                                 $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ? 'whitespaced' : 'ms-token';
1048                                                         }
1049
1050                                                         if ($detectedType == 'ms-token') {
1051                                                                 list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
1052                                                         } elseif ($detectedType == 'whitespaced') {
1053                                                                 $regA = array();
1054                                                                 preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA);
1055                                                                 $hexbyte = $regA[1];
1056                                                                 $utf8 = 'U+' . $regA[2];
1057                                                         }
1058                                                         $decval = hexdec(trim($hexbyte));
1059                                                         if ($decval > 127) {
1060                                                                 $utf8decval = hexdec(substr(trim($utf8), 2));
1061                                                                 $this->parsedCharsets[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
1062                                                                 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval;
1063                                                         }
1064                                                 }
1065                                         }
1066                                         if ($cacheFile) {
1067                                                 t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
1068                                         }
1069                                 }
1070                                 return 2;
1071                         } else {
1072                                 return FALSE;
1073                         }
1074                 } else {
1075                         return 1;
1076                 }
1077         }
1078
1079         /**
1080          * This function initializes all UTF-8 character data tables.
1081          *
1082          * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1083          *
1084          * @param       string          Mode ("case", "ascii", ...)
1085          * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1086          * @access private
1087          */
1088         function initUnicodeData($mode = NULL) {
1089                         // cache files
1090                 $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1091                 $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1092
1093                         // Only process if the tables are not yet loaded
1094                 switch ($mode) {
1095                         case 'case':
1096                                 if (is_array($this->caseFolding['utf-8'])) {
1097                                         return 1;
1098                                 }
1099
1100                                         // Use cached version if possible
1101                                 if ($cacheFileCase && @is_file($cacheFileCase)) {
1102                                         $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
1103                                         return 2;
1104                                 }
1105                                 break;
1106
1107                         case 'ascii':
1108                                 if (is_array($this->toASCII['utf-8'])) {
1109                                         return 1;
1110                                 }
1111
1112                                         // Use cached version if possible
1113                                 if ($cacheFileASCII && @is_file($cacheFileASCII)) {
1114                                         $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
1115                                         return 2;
1116                                 }
1117                                 break;
1118                 }
1119
1120                         // process main Unicode data file
1121                 $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
1122                 if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
1123                         return FALSE;
1124                 }
1125
1126                 $fh = fopen($unicodeDataFile, 'rb');
1127                 if (!$fh) {
1128                         return FALSE;
1129                 }
1130
1131                         // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1132                         // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1133                 $this->caseFolding['utf-8'] = array();
1134                 $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
1135                 $utf8CaseFolding['toUpper'] = array();
1136                 $utf8CaseFolding['toLower'] = array();
1137                 $utf8CaseFolding['toTitle'] = array();
1138
1139                 $decomposition = array(); // array of temp. decompositions
1140                 $mark = array(); // array of chars that are marks (eg. composing accents)
1141                 $number = array(); // array of chars that are numbers (eg. digits)
1142                 $omit = array(); // array of chars to be omitted (eg. Russian hard sign)
1143
1144                 while (!feof($fh)) {
1145                         $line = fgets($fh, 4096);
1146                                 // has a lot of info
1147                         list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));
1148
1149                         $ord = hexdec($char);
1150                         if ($ord > 0xFFFF) {
1151                                 break;
1152                         } // only process the BMP
1153
1154                         $utf8_char = $this->UnumberToChar($ord);
1155
1156                         if ($upper) {
1157                                 $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1158                         }
1159                         if ($lower) {
1160                                 $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1161                         }
1162                                 // store "title" only when different from "upper" (only a few)
1163                         if ($title && $title != $upper) {
1164                                 $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1165                         }
1166
1167                         switch ($cat[0]) {
1168                                 case 'M': // mark (accent, umlaut, ...)
1169                                         $mark["U+$char"] = 1;
1170                                         break;
1171
1172                                 case 'N': // numeric value
1173                                         if ($ord > 0x80 && $num != '') {
1174                                                 $number["U+$char"] = $num;
1175                                         }
1176                         }
1177
1178                                 // accented Latin letters without "official" decomposition
1179                         $match = array();
1180                         if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
1181                                 $c = ord($match[2]);
1182                                 if ($match[1] == 'SMALL') {
1183                                         $c += 32;
1184                                 }
1185
1186                                 $decomposition["U+$char"] = array(dechex($c));
1187                                 continue;
1188                         }
1189
1190                         $match = array();
1191                         if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
1192                                 switch ($match[1]) {
1193                                         case '<circle>': // add parenthesis as circle replacement, eg (1)
1194                                                 $match[2] = '0028 ' . $match[2] . ' 0029';
1195                                                 break;
1196
1197                                         case '<square>': // add square brackets as square replacement, eg [1]
1198                                                 $match[2] = '005B ' . $match[2] . ' 005D';
1199                                                 break;
1200
1201                                         case '<compat>': // ignore multi char decompositions that start with a space
1202                                                 if (preg_match('/^0020 /', $match[2])) {
1203                                                         continue 2;
1204                                                 }
1205                                                 break;
1206
1207                                                 // ignore Arabic and vertical layout presentation decomposition
1208                                         case '<initial>':
1209                                         case '<medial>':
1210                                         case '<final>':
1211                                         case '<isolated>':
1212                                         case '<vertical>':
1213                                                 continue 2;
1214                                 }
1215                                 $decomposition["U+$char"] = explode(' ', $match[2]);
1216                         }
1217                 }
1218                 fclose($fh);
1219
1220                         // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1221                 $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
1222                 if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
1223                         $fh = fopen($specialCasingFile, 'rb');
1224                         if ($fh) {
1225                                 while (!feof($fh)) {
1226                                         $line = fgets($fh, 4096);
1227                                         if ($line[0] != '#' && trim($line) != '') {
1228
1229                                                 list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
1230                                                 if ($cond == '' || $cond[0] == '#') {
1231                                                         $utf8_char = $this->UnumberToChar(hexdec($char));
1232                                                         if ($char != $lower) {
1233                                                                 $arr = explode(' ', $lower);
1234                                                                 for ($i = 0; isset($arr[$i]); $i++) {
1235                                                                         $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1236                                                                 }
1237                                                                 $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
1238                                                         }
1239                                                         if ($char != $title && $title != $upper) {
1240                                                                 $arr = explode(' ', $title);
1241                                                                 for ($i = 0; isset($arr[$i]); $i++) {
1242                                                                         $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1243                                                                 }
1244                                                                 $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
1245                                                         }
1246                                                         if ($char != $upper) {
1247                                                                 $arr = explode(' ', $upper);
1248                                                                 for ($i = 0; isset($arr[$i]); $i++) {
1249                                                                         $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1250                                                                 }
1251                                                                 $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
1252                                                         }
1253                                                 }
1254                                         }
1255                                 }
1256                                 fclose($fh);
1257                         }
1258                 }
1259
1260                         // process custom decompositions
1261                 $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
1262                 if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
1263                         $fh = fopen($customTranslitFile, 'rb');
1264                         if ($fh) {
1265                                 while (!feof($fh)) {
1266                                         $line = fgets($fh, 4096);
1267                                         if ($line[0] != '#' && trim($line) != '') {
1268                                                 list($char, $translit) = t3lib_div::trimExplode(';', $line);
1269                                                 if (!$translit) {
1270                                                         $omit["U+$char"] = 1;
1271                                                 }
1272                                                 $decomposition["U+$char"] = explode(' ', $translit);
1273
1274                                         }
1275                                 }
1276                                 fclose($fh);
1277                         }
1278                 }
1279
1280                         // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1281                 foreach ($decomposition as $from => $to) {
1282                         $code_decomp = array();
1283
1284                         while ($code_value = array_shift($to)) {
1285                                 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
1286                                         foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
1287                                                 array_unshift($to, $cv);
1288                                         }
1289                                 } elseif (!isset($mark["U+$code_value"])) { // remove mark
1290                                         array_push($code_decomp, $code_value);
1291                                 }
1292                         }
1293                         if (count($code_decomp) || isset($omit[$from])) {
1294                                 $decomposition[$from] = $code_decomp;
1295                         } else {
1296                                 unset($decomposition[$from]);
1297                         }
1298                 }
1299
1300                         // create ascii only mapping
1301                 $this->toASCII['utf-8'] = array();
1302                 $ascii =& $this->toASCII['utf-8'];
1303
1304                 foreach ($decomposition as $from => $to) {
1305                         $code_decomp = array();
1306                         while ($code_value = array_shift($to)) {
1307                                 $ord = hexdec($code_value);
1308                                 if ($ord > 127) {
1309                                         continue 2;
1310                                 } // skip decompositions containing non-ASCII chars
1311                                 else
1312                                 {
1313                                         array_push($code_decomp, chr($ord));
1314                                 }
1315                         }
1316                         $ascii[$this->UnumberToChar(hexdec(str_replace('U+', '0x', $from)))] = join('', $code_decomp);
1317                 }
1318
1319                         // add numeric decompositions
1320                 foreach ($number as $from => $to) {
1321                         $utf8_char = $this->UnumberToChar(hexdec(str_replace('U+', '0x', $from)));
1322                         if (!isset($ascii[$utf8_char])) {
1323                                 $ascii[$utf8_char] = $to;
1324                         }
1325                 }
1326
1327                 if ($cacheFileCase) {
1328                         t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1329                 }
1330
1331                 if ($cacheFileASCII) {
1332                         t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1333                 }
1334
1335                 return 3;
1336         }
1337
1338         /**
1339          * This function initializes the folding table for a charset other than UTF-8.
1340          * This function is automatically called by the case folding functions.
1341          *
1342          * @param       string          Charset for which to initialize case folding.
1343          * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1344          * @access private
1345          */
1346         function initCaseFolding($charset) {
1347                         // Only process if the case table is not yet loaded:
1348                 if (is_array($this->caseFolding[$charset])) {
1349                         return 1;
1350                 }
1351
1352                         // Use cached version if possible
1353                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
1354                 if ($cacheFile && @is_file($cacheFile)) {
1355                         $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1356                         return 2;
1357                 }
1358
1359                         // init UTF-8 conversion for this charset
1360                 if (!$this->initCharset($charset)) {
1361                         return FALSE;
1362                 }
1363
1364                         // UTF-8 case folding is used as the base conversion table
1365                 if (!$this->initUnicodeData('case')) {
1366                         return FALSE;
1367                 }
1368
1369                 $nochar = chr($this->noCharByteVal);
1370                 foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
1371                                 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1372                         $c = $this->utf8_decode($utf8, $charset);
1373
1374                                 // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
1375                         $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
1376                         if ($cc != '' && $cc != $nochar) {
1377                                 $this->caseFolding[$charset]['toUpper'][$c] = $cc;
1378                         }
1379
1380                                 // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
1381                         $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
1382                         if ($cc != '' && $cc != $nochar) {
1383                                 $this->caseFolding[$charset]['toLower'][$c] = $cc;
1384                         }
1385
1386                                 // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
1387                         $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
1388                         if ($cc != '' && $cc != $nochar) {
1389                                 $this->caseFolding[$charset]['toTitle'][$c] = $cc;
1390                         }
1391                 }
1392
1393                         // add the ASCII case table
1394                 for ($i = ord('a'); $i <= ord('z'); $i++) {
1395                         $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
1396                 }
1397                 for ($i = ord('A'); $i <= ord('Z'); $i++) {
1398                         $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
1399                 }
1400
1401                 if ($cacheFile) {
1402                         t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
1403                 }
1404
1405                 return 3;
1406         }
1407
1408         /**
1409          * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1410          * This function is automatically called by the ASCII transliteration functions.
1411          *
1412          * @param       string          Charset for which to initialize conversion.
1413          * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1414          * @access private
1415          */
1416         function initToASCII($charset) {
1417                         // Only process if the case table is not yet loaded:
1418                 if (is_array($this->toASCII[$charset])) {
1419                         return 1;
1420                 }
1421
1422                         // Use cached version if possible
1423                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
1424                 if ($cacheFile && @is_file($cacheFile)) {
1425                         $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1426                         return 2;
1427                 }
1428
1429                         // init UTF-8 conversion for this charset
1430                 if (!$this->initCharset($charset)) {
1431                         return FALSE;
1432                 }
1433
1434                         // UTF-8/ASCII transliteration is used as the base conversion table
1435                 if (!$this->initUnicodeData('ascii')) {
1436                         return FALSE;
1437                 }
1438
1439                 $nochar = chr($this->noCharByteVal);
1440                 foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
1441                                 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1442                         $c = $this->utf8_decode($utf8, $charset);
1443
1444                         if (isset($this->toASCII['utf-8'][$utf8])) {
1445                                 $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
1446                         }
1447                 }
1448
1449                 if ($cacheFile) {
1450                         t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
1451                 }
1452
1453                 return 3;
1454         }
1455
1456
1457         /********************************************
1458          *
1459          * String operation functions
1460          *
1461          ********************************************/
1462
1463         /**
1464          * Returns a part of a string.
1465          * Unit-tested by Kasper (single byte charsets only)
1466          *
1467          * @param       string          The character set
1468          * @param       string          Character string
1469          * @param       integer         Start position (character position)
1470          * @param       integer         Length (in characters)
1471          * @return      string          The substring
1472          * @see substr(), mb_substr()
1473          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1474          */
1475         function substr($charset, $string, $start, $len = NULL) {
1476                 if ($len === 0 || $string === '') {
1477                         return '';
1478                 }
1479
1480                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1481                                 // cannot omit $len, when specifying charset
1482                         if ($len == NULL) {
1483                                 $enc = mb_internal_encoding(); // save internal encoding
1484                                 mb_internal_encoding($charset);
1485                                 $str = mb_substr($string, $start);
1486                                 mb_internal_encoding($enc); // restore internal encoding
1487
1488                                 return $str;
1489                         }
1490                         else {
1491                                 return mb_substr($string, $start, $len, $charset);
1492                         }
1493                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1494                                 // cannot omit $len, when specifying charset
1495                         if ($len == NULL) {
1496                                 $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
1497                                 iconv_set_encoding('internal_encoding', $charset);
1498                                 $str = iconv_substr($string, $start);
1499                                 iconv_set_encoding('internal_encoding', $enc); // restore internal encoding
1500
1501                                 return $str;
1502                         }
1503                         else {
1504                                 return iconv_substr($string, $start, $len, $charset);
1505                         }
1506                 } elseif ($charset == 'utf-8') {
1507                         return $this->utf8_substr($string, $start, $len);
1508                 } elseif ($this->eucBasedSets[$charset]) {
1509                         return $this->euc_substr($string, $start, $charset, $len);
1510                 } elseif ($this->twoByteSets[$charset]) {
1511                         return substr($string, $start * 2, $len * 2);
1512                 } elseif ($this->fourByteSets[$charset]) {
1513                         return substr($string, $start * 4, $len * 4);
1514                 }
1515
1516                         // treat everything else as single-byte encoding
1517                 return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
1518         }
1519
1520         /**
1521          * Counts the number of characters.
1522          * Unit-tested by Kasper (single byte charsets only)
1523          *
1524          * @param       string          The character set
1525          * @param       string          Character string
1526          * @return      integer         The number of characters
1527          * @see strlen()
1528          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1529          */
1530         function strlen($charset, $string) {
1531                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1532                         return mb_strlen($string, $charset);
1533                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1534                         return iconv_strlen($string, $charset);
1535                 } elseif ($charset == 'utf-8') {
1536                         return $this->utf8_strlen($string);
1537                 } elseif ($this->eucBasedSets[$charset]) {
1538                         return $this->euc_strlen($string, $charset);
1539                 } elseif ($this->twoByteSets[$charset]) {
1540                         return strlen($string) / 2;
1541                 } elseif ($this->fourByteSets[$charset]) {
1542                         return strlen($string) / 4;
1543                 }
1544                         // treat everything else as single-byte encoding
1545                 return strlen($string);
1546         }
1547
1548         /**
1549          * Method to crop strings using the mb_substr function.
1550          *
1551          * @param  string               The character set
1552          * @param  string               String to be cropped
1553          * @param  integer              Crop length (in characters)
1554          * @param  string               Crop signifier
1555          * @return string               The shortened string
1556          * @see mb_strlen(), mb_substr()
1557          */
1558         protected function cropMbstring($charset, $string, $len, $crop = '') {
1559                 if (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len)) {
1560                         return $string;
1561                 }
1562
1563                 if ($len > 0) {
1564                         $string = mb_substr($string, 0, $len, $charset) . $crop;
1565                 } else {
1566                         $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
1567                 }
1568
1569                 return $string;
1570         }
1571
1572         /**
1573          * Truncates a string and pre-/appends a string.
1574          * Unit tested by Kasper
1575          *
1576          * @param       string          The character set
1577          * @param       string          Character string
1578          * @param       integer         Length (in characters)
1579          * @param       string          Crop signifier
1580          * @return      string          The shortened string
1581          * @see substr(), mb_strimwidth()
1582          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1583          */
1584         function crop($charset, $string, $len, $crop = '') {
1585                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1586                         return $this->cropMbstring($charset, $string, $len, $crop);
1587                 }
1588
1589                 if (intval($len) == 0) {
1590                         return $string;
1591                 }
1592
1593                 if ($charset == 'utf-8') {
1594                         $i = $this->utf8_char2byte_pos($string, $len);
1595                 } elseif ($this->eucBasedSets[$charset]) {
1596                         $i = $this->euc_char2byte_pos($string, $len, $charset);
1597                 } else {
1598                         if ($len > 0) {
1599                                 $i = $len;
1600                         } else {
1601                                 $i = strlen($string) + $len;
1602                                 if ($i <= 0) {
1603                                         $i = FALSE;
1604                                 }
1605                         }
1606                 }
1607
1608                 if ($i === FALSE) { // $len outside actual string length
1609                         return $string;
1610                 } else {
1611                         if ($len > 0) {
1612                                 if (strlen($string[$i])) {
1613                                         return substr($string, 0, $i) . $crop;
1614
1615                                 }
1616                         } else {
1617                                 if (strlen($string[$i - 1])) {
1618                                         return $crop . substr($string, $i);
1619                                 }
1620                         }
1621
1622                         /*
1623                            if (abs($len)<$this->strlen($charset,$string))       {       // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return TRUE here (which is not a catastrophe, but...)
1624                                    if ($len > 0)        {
1625                                            return substr($string,0,$i).$crop;
1626                                    } else {
1627                                            return $crop.substr($string,$i);
1628                                    }
1629                            }
1630    */
1631                 }
1632                 return $string;
1633         }
1634
1635         /**
1636          * Cuts a string short at a given byte length.
1637          *
1638          * @param       string          The character set
1639          * @param       string          Character string
1640          * @param       integer         The byte length
1641          * @return      string          The shortened string
1642          * @see mb_strcut()
1643          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1644          */
1645         function strtrunc($charset, $string, $len) {
1646                 if ($len <= 0) {
1647                         return '';
1648                 }
1649
1650                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1651                         return mb_strcut($string, 0, $len, $charset);
1652                 } elseif ($charset == 'utf-8') {
1653                         return $this->utf8_strtrunc($string, $len);
1654                 } elseif ($this->eucBasedSets[$charset]) {
1655                         return $this->euc_strtrunc($string, $len, $charset);
1656                 } elseif ($this->twoByteSets[$charset]) {
1657                         if ($len % 2) {
1658                                 $len--;
1659                         } // don't cut at odd positions
1660                 } elseif ($this->fourByteSets[$charset]) {
1661                         $x = $len % 4;
1662                         $len -= $x; // realign to position dividable by four
1663                 }
1664                         // treat everything else as single-byte encoding
1665                 return substr($string, 0, $len);
1666         }
1667
1668         /**
1669          * Translates all characters of a string into their respective case values.
1670          * Unlike strtolower() and strtoupper() this method is locale independent.
1671          * Note that the string length may change!
1672          * eg. lower case German "ß" (sharp S) becomes upper case "SS"
1673          * Unit-tested by Kasper
1674          * Real case folding is language dependent, this method ignores this fact.
1675          *
1676          * @param       string          Character set of string
1677          * @param       string          Input string to convert case for
1678          * @param       string          Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1679          * @return      string          The converted string
1680          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1681          * @see strtolower(), strtoupper()
1682          */
1683         function conv_case($charset, $string, $case) {
1684                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1685                         if ($case == 'toLower') {
1686                                 $string = mb_strtolower($string, $charset);
1687                         } else {
1688                                 $string = mb_strtoupper($string, $charset);
1689                         }
1690                 } elseif ($charset == 'utf-8') {
1691                         $string = $this->utf8_char_mapping($string, 'case', $case);
1692                 } elseif (isset($this->eucBasedSets[$charset])) {
1693                         $string = $this->euc_char_mapping($string, $charset, 'case', $case);
1694                 } else {
1695                                 // treat everything else as single-byte encoding
1696                         $string = $this->sb_char_mapping($string, $charset, 'case', $case);
1697                 }
1698
1699                 return $string;
1700         }
1701
1702         /**
1703          * Equivalent of lcfirst/ucfirst but using character set.
1704          *
1705          * @param string $charset
1706          * @param string $string
1707          * @param string $case
1708          * @return string
1709          * @see t3lib_cs::conv_case()
1710          */
1711         public function convCaseFirst($charset, $string, $case) {
1712                 $firstChar = $this->substr($charset, $string, 0, 1);
1713                 $firstChar = $this->conv_case($charset, $firstChar, $case);
1714                 $remainder = $this->substr($charset, $string, 1);
1715                 return $firstChar . $remainder;
1716         }
1717
1718         /**
1719          * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1720          *
1721          * @param string $charset Character set of string
1722          * @param string $string Input string to convert
1723          * @return string The converted string
1724          */
1725         function specCharsToASCII($charset, $string) {
1726                 if ($charset == 'utf-8') {
1727                         $string = $this->utf8_char_mapping($string, 'ascii');
1728                 } elseif (isset($this->eucBasedSets[$charset])) {
1729                         $string = $this->euc_char_mapping($string, $charset, 'ascii');
1730                 } else {
1731                                 // treat everything else as single-byte encoding
1732                         $string = $this->sb_char_mapping($string, $charset, 'ascii');
1733                 }
1734
1735                 return $string;
1736         }
1737
1738
/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * Candidates are tried in descending order of their quality value ("q"); candidates with
 * the same quality are tried in the order in which they appear in the list. For each
 * candidate the complete tag (e.g. "de-at") is looked up first, then the tag without its
 * country part ("de").
 *
 * @param string $languageCodesList List of language codes, something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *                                  see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

		// get all languages where TYPO3 code is the same as the ISO code
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

		// get all languages where TYPO3 code differs from ISO code
		// or needs the country part
		// the iso codes will here overwrite the default typo3 language in the key
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		$isoLang = join('-', explode('_', $isoLang));
		$allLanguageCodes[$typo3Lang] = $isoLang;
	}

		// move the iso codes to the keys (because we're comparing the keys with "isset" later on)
	$allLanguageCodes = array_flip($allLanguageCodes);

		// collect the quality value of every language tag (1.0 if none is given)
	$qualityByLanguage = array();
	foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
		}
		$qualityByLanguage[$preferredLanguage] = $quality;
	}

		// order by quality, highest first; the position in the list is the tie-breaker,
		// so tags with equal quality keep the order in which the client sent them
		// (arsort() does not guarantee that before PHP 8.0)
	$languages = array_keys($qualityByLanguage);
	$qualities = array_values($qualityByLanguage);
	$positions = array_keys($languages);
	if (count($languages) > 1) {
		array_multisort($qualities, SORT_DESC, SORT_NUMERIC, $positions, SORT_ASC, SORT_NUMERIC, $languages);
	}

		// loop through the languages, with the highest priority first
	foreach ($languages as $preferredLanguage) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

			// strip the country code from the end; tags without a country part were
			// already looked up above, so they are not split again
		$tagParts = explode('-', (string) $preferredLanguage, 2);
		if (count($tagParts) > 1 && isset($allLanguageCodes[$tagParts[0]])) {
			$selectedLanguage = $allLanguageCodes[$tagParts[0]];
			break;
		}
	}

	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
1799
1800
1801         /********************************************
1802          *
1803          * Internal string operation functions
1804          *
1805          ********************************************/
1806
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte of the input is looked up in the conversion table selected by $mode;
 * bytes without a table entry are copied unchanged. The input is returned as-is if
 * the table for the charset cannot be initialized or if $mode is unknown.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			} // do nothing
				// read the table by value: taking a reference would create an empty
				// entry in $this->caseFolding when $opt names no existing table
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			} // do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$byteLength = strlen($str);
	$out = '';
		// iterate up to the known length instead of probing offsets until one is missing
	for ($i = 0; $i < $byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1849
1850
1851         /********************************************
1852          *
1853          * Internal UTF-8 string operation functions
1854          *
1855          ********************************************/
1856
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
		// a length of exactly 0 (integer or string '0') always yields an empty string
	if ((string) $len === '0') {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
			// a zero/negative start that cannot be resolved means "from the beginning"
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->utf8_char2byte_pos($str, $len);
	if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length that exceeds the
			// string leaves nothing, a positive one simply returns the rest
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
1899
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a multi-byte continuation byte (10xxxxxx) starts a
 * character, so counting those bytes yields the number of characters.
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
	$str = (string) $str;
	$byteLength = strlen($str);
	$n = 0;
		// loop over the known byte length; reading an offset past the end raises a notice/warning
	for ($i = 0; $i < $byteLength; $i++) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}
	return $n;
}
1924
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The cut never splits a multi-byte character: if the character at the cut
 * position does not fit completely into $len bytes, it is dropped as a whole.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len The byte length; values <= 0 yield an empty string
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
	$str = (string) $str;
	if ($len <= 0) {
		return '';
	}
	if ($len > strlen($str)) {
		return $str; // nothing to cut
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the first byte of the character
		while ($i > 0 && (ord($str[$i]) & 0xC0) === 0x80) {
			$i--;
		}

		$lead = ord($str[$i]);
			// only a real starting byte (11xxxxxx) tells how long the character is;
			// this also covers a character that starts at offset 0
		if (($lead & 0xC0) === 0xC0) {
			for ($bc = 0; $lead & 0x80; $lead = $lead << 1) {
					// calculate number of bytes
				$bc++;
			}
			if ($bc + $i > $len) {
				return substr($str, 0, $i); // character does not fit, drop it
			}
		}
		// fallthru: multibyte char fits into length
	}
	return substr($str, 0, $len);
}
1955
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * Uses mb_strpos() or iconv_strpos() when $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']
 * is set to 'mbstring' or 'iconv'; otherwise the position is determined with the
 * byte/character position helpers of this class.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search
 * @return integer|boolean The character position, FALSE if not found
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

		// translate the character offset into a byte offset; FALSE: offset beyond string length
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);

		// byte-wise search; FALSE: needle not found
	$foundByte = ($startByte === FALSE) ? FALSE : strpos($haystack, $needle, $startByte);

	return ($foundByte === FALSE) ? FALSE : $this->utf8_byte2char_pos($haystack, $foundByte);
}
1985
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * Uses mb_strrpos() or iconv_strrpos() when $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']
 * is set to 'mbstring' or 'iconv'; otherwise the byte position found by strrpos()
 * is translated into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer|boolean The character position, FALSE if not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strrpos($haystack, $needle, 'utf-8');
	}
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

		// byte-wise search; FALSE: needle not found
	$foundByte = strrpos($haystack, $needle);

	return ($foundByte === FALSE) ? FALSE : $this->utf8_byte2char_pos($haystack, $foundByte);
}
2009
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * The string is scanned byte by byte - forwards for positions >= 0, backwards from
 * the last byte for negative positions - until the wanted number of characters has
 * been seen. FALSE is returned when the scan leaves the string, i.e. when the end
 * is reached for a position >= 0 or when a negative position reaches or passes the
 * start of the string.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer|boolean Byte position, FALSE if the position is outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
	$str = (string) $str;
	$byteLength = strlen($str);
	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}

		// explicit bounds: negative string offsets address bytes from the end since PHP 7.1,
		// so probing $str[$i] cannot be used to detect that the scan left the string
	for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}
	if ($i < 0 || $i >= $byteLength) {
		return FALSE;
	} // offset beyond string length

	if ($pos >= 0) {
			// skip trailing multi-byte data bytes
		while ($i < $byteLength && (ord($str[$i]) & 0xC0) === 0x80) {
			$i++;
		}
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
2058
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * Counts the character starts between byte 1 and the given byte position
 * (inclusive), which is the index of the character the byte belongs to. A position
 * equal to the byte length (just past the last byte) is counted like a character
 * start.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer|boolean Character position; FALSE for an empty string or a position outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
	$str = (string) $str;
	$pos = (int) $pos;
	$byteLength = strlen($str);

		// validate before counting instead of probing a string offset afterwards
	if ($byteLength === 0 || $pos < 0 || $pos > $byteLength) {
		return FALSE;
	} // offset beyond string length

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
			// single-byte (0xxxxxxx), multi-byte starting byte (11xxxxxx) or end of string
		if ($i === $byteLength || (ord($str[$i]) & 0xC0) !== 0x80) {
			$n++;
		}
	}

	return $n;
}
2087
/**
 * Maps all characters of an UTF-8 string.
 *
 * Each character (one byte, or a starting byte plus its data bytes) is looked up in
 * the conversion table selected by $mode; characters without a table entry are
 * copied unchanged. The input is returned as-is if the Unicode data cannot be
 * initialized or if $mode is unknown.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		return $str;
	} // do nothing

	switch ($mode) {
		case 'case':
				// read the table by value: taking a reference would create an empty
				// entry in $this->caseFolding when $opt names no existing table
			$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
			break;

		case 'ascii':
			$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$byteLength = strlen($str);
	$out = '';
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
			// default: the byte stands for itself - single-byte (0xxxxxxx), or a stray
			// data byte (10xxxxxx) in malformed input, which is passed through
			// instead of repeating the previously handled character
		$mbc = $str[$i];

		if (($c & 0xC0) === 0xC0) { // multi-byte starting byte (11xxxxxx)
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			} // calculate number of bytes
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2139
2140
2141         /********************************************
2142          *
2143          * Internal EUC string operation functions
2144          *
2145          * Extended Unix Code:
2146          *  ASCII compatible 7bit single bytes chars
2147          *  8bit two byte chars
2148          *
2149          * Shift-JIS is treated as a special case.
2150          *
2151          ********************************************/
2152
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The cut never splits a double-byte character: if the cut position falls between
 * the two bytes of a character, that character is dropped as a whole.
 *
 * @param string $str EUC multibyte character string
 * @param integer $len The byte length; values <= 0 yield an empty string
 * @param string $charset The charset
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str, $len, $charset) {
	$str = (string) $str;
	if ($len <= 0) {
		return '';
	}
		// decide "nothing to cut" by the real byte length: the scan below may step over
		// the end when the cut hits the last double-byte character
	if (strlen($str) <= $len) {
		return $str;
	} // string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// $i < $len < strlen($str), so every offset read here exists
	for ($i = 0; $i < $len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {
			$i++;
		} // advance a double-byte char
	}

	if ($i > $len) {
		return substr($str, 0, $len - 1); // we ended on a first byte
	}
	return substr($str, 0, $len);
}
2188
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Handles the edge cases like utf8_substr(): a length of 0 yields an empty string,
 * a zero/negative start that cannot be resolved starts at the beginning, and a
 * negative length that exceeds the string leaves nothing.
 *
 * @param string $str EUC multibyte character string
 * @param integer $start Start position (character position)
 * @param string $charset The charset
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string|boolean The substring, or FALSE if a positive $start lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str, $start, $charset, $len = NULL) {
		// a length of exactly 0 must not be mistaken for "no length given"
	if ((string) $len === '0') {
		return '';
	}

	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len == NULL) {
		return $str;
	}

	$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
	if ($byte_end === FALSE) {
			// $len outside actual string length
		return $len < 0 ? '' : $str;
	}

	return substr($str, 0, $byte_end);
}
2222
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * A byte >= 0x80 is taken as the first byte of a double-byte character; for
 * 'shift_jis' only bytes in 0x80-0x9F or >= 0xE0 are.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str, $charset) {
	$str = (string) $str;
	$byteLength = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0;
		// loop over the known byte length; reading an offset past the end raises a notice/warning
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {
			$i++;
		} // advance a double-byte char

		$n++;
	}

	return $n;
}
2253
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is scanned character by character - forwards for positions >= 0,
 * backwards from the last byte for negative positions - until the wanted number of
 * characters has been seen. FALSE is returned when the scan leaves the string.
 *
 * NOTE(review): the backward scan classifies a character by the byte under the
 * cursor only; for 'shift_jis' the second byte of a double-byte character may lie
 * outside the ranges tested here, so negative positions look unreliable for that
 * charset - confirm before relying on them.
 *
 * @param string $str EUC multibyte character string
 * @param integer $pos Character position (negative values start from the end)
 * @param string $charset The charset
 * @return integer|boolean Byte position, FALSE if the position is outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str, $pos, $charset) {
	$str = (string) $str;
	$byteLength = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0; // number of characters seen
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}

		// explicit bounds: negative string offsets address bytes from the end since PHP 7.1,
		// so probing $str[$i] cannot be used to detect that the scan left the string
	for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {
			$i += $d;
		} // advance a double-byte char

		$n++;
	}
	if ($i < 0 || $i >= $byteLength) {
		return FALSE;
	} // offset beyond string length

	if ($pos < 0) {
		$i++;
	} // correct offset

	return $i;
}
2301
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Each character (one byte, or two bytes for a double-byte character) is looked up
 * in the conversion table selected by $mode; characters without a table entry are
 * copied unchanged. The input is returned as-is if the table for the charset cannot
 * be initialized or if $mode is unknown.
 *
 * @param string $str EUC multibyte character string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str, $charset, $mode, $opt = '') {
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			} // do nothing
				// read the table by value: taking a reference would create an empty
				// entry in $this->caseFolding when $opt names no existing table
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			} // do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}

	$str = (string) $str;
	$byteLength = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$out = '';
		// iterate up to the known length instead of probing offsets until one is missing
	for ($i = 0; $i < $byteLength; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) { // a double-byte char
			$mbc = substr($str, $i, 2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2360
2361 }
2362
2363 if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
2364         include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
2365 }
2366
2367 ?>