lib/typo3/class.t3lib_cs.php

   1 <?php
   2 /***************************************************************
   3 *  Copyright notice
   4 *
   5 *  (c) 2003-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
   6 *  All rights reserved
   7 *
   8 *  This script is part of the Typo3 project. The Typo3 project is
   9 *  free software; you can redistribute it and/or modify
  10 *  it under the terms of the GNU General Public License as published by
  11 *  the Free Software Foundation; either version 2 of the License, or
  12 *  (at your option) any later version.
  13 *
  14 *  The GNU General Public License can be found at
  15 *  http://www.gnu.org/copyleft/gpl.html.
  16 *
  17 *  This script is distributed in the hope that it will be useful,
  18 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 *  GNU General Public License for more details.
  21 *
  22 *  This copyright notice MUST APPEAR in all copies of the script!
  23 ***************************************************************/
  24 /**
  25  * Class for conversion between charsets.
  26  *
  27  * $Id: class.t3lib_cs.php,v 1.54 2005/12/12 21:47:50 masi Exp $
  28  *
  29  * @author      Kasper Skaarhoj <kasperYYYY@typo3.com>
  30  * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
  31  */
  32 /**
  33  * [CLASS/FUNCTION INDEX of SCRIPT]
  34  *
  35  *
  36  *
  37  *  136: class t3lib_cs
  38  *  503:     function parse_charset($charset)
  39  *  522:     function get_locale_charset($locale)
  40  *
  41  *              SECTION: Charset Conversion functions
  42  *  575:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
  43  *  615:     function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
  44  *  632:     function utf8_encode($str,$charset)
  45  *  678:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
  46  *  721:     function utf8_to_entities($str)
  47  *  754:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
  48  *  788:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
  49  *  838:     function UnumberToChar($cbyte)
  50  *  883:     function utf8CharToUnumber($str,$hex=0)
  51  *
  52  *              SECTION: Init functions
  53  *  926:     function initCharset($charset)
  54  *  988:     function initUnicodeData($mode=null)
  55  * 1213:     function initCaseFolding($charset)
  56  * 1275:     function initToASCII($charset)
  57  *
  58  *              SECTION: String operation functions
  59  * 1346:     function substr($charset,$string,$start,$len=null)
  60  * 1384:     function strlen($charset,$string)
  61  * 1412:     function crop($charset,$string,$len,$crop='')
  62  * 1465:     function strtrunc($charset,$string,$len)
  63  * 1499:     function conv_case($charset,$string,$case)
  64  * 1525:     function specCharsToASCII($charset,$string)
  65  *
  66  *              SECTION: Internal string operation functions
  67  * 1565:     function sb_char_mapping($str,$charset,$mode,$opt='')
  68  *
  69  *              SECTION: Internal UTF-8 string operation functions
  70  * 1620:     function utf8_substr($str,$start,$len=null)
  71  * 1653:     function utf8_strlen($str)
  72  * 1674:     function utf8_strtrunc($str,$len)
  73  * 1696:     function utf8_strpos($haystack,$needle,$offset=0)
  74  * 1719:     function utf8_strrpos($haystack,$needle)
  75  * 1739:     function utf8_char2byte_pos($str,$pos)
  76  * 1780:     function utf8_byte2char_pos($str,$pos)
  77  * 1803:     function utf8_char_mapping($str,$mode,$opt='')
  78  *
  79  *              SECTION: Internal EUC string operation functions
  80  * 1879:     function euc_strtrunc($str,$len,$charset)
  81  * 1908:     function euc_substr($str,$start,$charset,$len=null)
  82  * 1933:     function euc_strlen($str,$charset)
  83  * 1960:     function euc_char2byte_pos($str,$pos,$charset)
  84  * 2001:     function euc_char_mapping($str,$charset,$mode,$opt='')
  85  *
  86  * TOTAL FUNCTIONS: 35
  87  * (This index is automatically created/updated by the extension "extdeveval")
  88  *
  89  */
  90
  91
  92
  93
  94
  95
  96
  97
  98 /**
  99  * Notes on UTF-8
 100  *
 101  * Functions working on UTF-8 strings:
 102  *
 103  * - strchr/strstr
 104  * - strrchr
 105  * - substr_count
 106  * - implode/explode/join
 107  *
 108  * Functions nearly working on UTF-8 strings:
 109  *
 110  * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
 111  * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
 112  * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
 113  * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 114  *
 115  * Functions NOT working on UTF-8 strings:
 116  *
 117  * - str*cmp
 118  * - stristr
 119  * - stripos
 120  * - substr
 121  * - strrev
 122  * - ereg/eregi
 123  * - split/spliti
 124  * - preg_*
 125  * - ...
 126  *
 127  */
 128 /**
 129  * Class for conversion between charsets
 130  *
 131  * @author      Kasper Skaarhoj <kasperYYYY@typo3.com>
 132  * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 133  * @package TYPO3
 134  * @subpackage t3lib
 135  */
 136 class t3lib_cs {
 137         var $noCharByteVal=63;          // ASCII Value for chars with no equivalent.
 138
 139                 // This is the array where parsed conversion tables are stored (cached)
 140         var $parsedCharsets=array();
 141
 142                 // An array where case folding data will be stored (cached)
 143         var $caseFolding=array();
 144
 145                 // An array where charset-to-ASCII mappings are stored (cached)
 146         var $toASCII=array();
 147
 148                 // This tells the converter which charsets has two bytes per char:
 149         var $twoByteSets=array(
 150                 'ucs-2'=>1,     // 2-byte Unicode
 151         );
 152
 153                 // This tells the converter which charsets has four bytes per char:
 154         var $fourByteSets=array(
 155                 'ucs-4'=>1,     // 4-byte Unicode
 156                 'utf-32'=>1,    // 4-byte Unicode (limited to the 21-bits of UTF-16)
 157         );
 158
 159                 // This tells the converter which charsets use a scheme like the Extended Unix Code:
 160         var $eucBasedSets=array(
 161                 'gb2312'=>1,            // Chinese, simplified.
 162                 'big5'=>1,              // Chinese, traditional.
 163                 'euc-kr'=>1,            // Korean
 164                 'shift_jis'=>1,         // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
 165         );
 166
 167                 // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 168                 // http://czyborra.com/charsets/iso8859.html
 169         var $synonyms=array(
 170                 'us' => 'ascii',
 171                 'us-ascii'=> 'ascii',
 172                 'cp819' => 'iso-8859-1',
 173                 'ibm819' => 'iso-8859-1',
 174                 'iso-ir-100' => 'iso-8859-1',
 175                 'iso-ir-109' => 'iso-8859-2',
 176                 'iso-ir-148' => 'iso-8859-9',
 177                 'iso-ir-199' => 'iso-8859-14',
 178                 'iso-ir-203' => 'iso-8859-15',
 179                 'csisolatin1' => 'iso-8859-1',
 180                 'csisolatin2' => 'iso-8859-2',
 181                 'csisolatin3' => 'iso-8859-3',
 182                 'csisolatin5' => 'iso-8859-9',
 183                 'csisolatin8' => 'iso-8859-14',
 184                 'csisolatin9' => 'iso-8859-15',
 185                 'csisolatingreek' => 'iso-8859-7',
 186                 'iso-celtic' => 'iso-8859-14',
 187                 'latin1' => 'iso-8859-1',
 188                 'latin2' => 'iso-8859-2',
 189                 'latin3' => 'iso-8859-3',
 190                 'latin5' => 'iso-8859-9',
 191                 'latin6' => 'iso-8859-10',
 192                 'latin8' => 'iso-8859-14',
 193                 'latin9' => 'iso-8859-15',
 194                 'l1' => 'iso-8859-1',
 195                 'l2' => 'iso-8859-2',
 196                 'l3' => 'iso-8859-3',
 197                 'l5' => 'iso-8859-9',
 198                 'l6' => 'iso-8859-10',
 199                 'l8' => 'iso-8859-14',
 200                 'l9' => 'iso-8859-15',
 201                 'cyrillic' => 'iso-8859-5',
 202                 'arabic' => 'iso-8859-6',
 203                 'tis-620' => 'iso-8859-11',
 204                 'win874' => 'windows-874',
 205                 'win1250' => 'windows-1250',
 206                 'win1251' => 'windows-1251',
 207                 'win1252' => 'windows-1252',
 208                 'win1253' => 'windows-1253',
 209                 'win1254' => 'windows-1254',
 210                 'win1255' => 'windows-1255',
 211                 'win1256' => 'windows-1256',
 212                 'win1257' => 'windows-1257',
 213                 'win1258' => 'windows-1258',
 214                 'cp1250' => 'windows-1250',
 215                 'cp1251' => 'windows-1251',
 216                 'cp1252' => 'windows-1252',
 217                 'ms-ee' => 'windows-1250',
 218                 'ms-ansi' => 'windows-1252',
 219                 'ms-greek' => 'windows-1253',
 220                 'ms-turk' => 'windows-1254',
 221                 'winbaltrim' => 'windows-1257',
 222                 'koi-8ru' => 'koi-8r',
 223                 'koi8r' => 'koi-8r',
 224                 'cp878' => 'koi-8r',
 225                 'mac' => 'macroman',
 226                 'macintosh' => 'macroman',
 227                 'euc-cn' => 'gb2312',
 228                 'x-euc-cn' => 'gb2312',
 229                 'euccn' => 'gb2312',
 230                 'cp936' => 'gb2312',
 231                 'big-5' => 'big5',
 232                 'cp950' => 'big5',
 233                 'eucjp' => 'euc-jp',
 234                 'sjis' => 'shift_jis',
 235                 'shift-jis' => 'shift_jis',
 236                 'cp932' => 'shift_jis',
 237                 'cp949' => 'euc-kr',
 238                 'utf7' => 'utf-7',
 239                 'utf8' => 'utf-8',
 240                 'utf16' => 'utf-16',
 241                 'utf32' => 'utf-32',
 242                 'utf8' => 'utf-8',
 243                 'ucs2' => 'ucs-2',
 244                 'ucs4' => 'ucs-4',
 245         );
 246
 247                 // mapping of iso-639:2 language codes to language (family) names
 248         var $lang_to_langfamily=array(
 249                         // iso-639:2 language codes, see:
 250                         //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
 251                         //  http://www.unicode.org/onlinedat/languages.html
 252                 'ar' => 'arabic',
 253                 'bg' => 'cyrillic',
 254                 'cs' => 'east_european',
 255                 'da' => 'west_european',
 256                 'de' => 'west_european',
 257                 'es' => 'west_european',
 258                 'et' => 'estonian',
 259                 'eu' => 'west_european',
 260                 'fi' => 'west_european',
 261                 'fr' => 'west_european',
 262                 'gr' => 'greek',
 263                 'hr' => 'east_european',
 264                 'hu' => 'east_european',
 265                 'iw' => 'hebrew',
 266                 'is' => 'west_european',
 267                 'it' => 'west_european',
 268                 'ja' => 'japanese',
 269                 'kl' => 'west_european',
 270                 'ko' => 'korean',
 271                 'lt' => 'lithuanian',
 272                 'lv' => 'west_european', // Latvian/Lettish
 273                 'nl' => 'west_european',
 274                 'no' => 'west_european',
 275                 'pl' => 'east_european',
 276                 'pt' => 'west_european',
 277                 'ro' => 'east_european',
 278                 'ru' => 'cyrillic',
 279                 'sk' => 'east_european',
 280                 'sl' => 'east_european',
 281                 'sv' => 'west_european',
 282                 'th' => 'thai',
 283                 'uk' => 'cyrillic',
 284                 'vi' => 'vietnamese',
 285                 'zh' => 'chinese',
 286                         // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
 287                 'chs' => 'simpl_chinese',
 288                 'cht' => 'trad_chinese',
 289                 'csy' => 'east_european',
 290                 'dan' => 'west_european',
 291                 'deu' => 'west_european',
 292                 'dea' => 'west_european',
 293                 'des' => 'west_european',
 294                 'ena' => 'west_european',
 295                 'enc' => 'west_european',
 296                 'eng' => 'west_european',
 297                 'enz' => 'west_european',
 298                 'enu' => 'west_european',
 299                 'nld' => 'west_european',
 300                 'nlb' => 'west_european',
 301                 'fin' => 'west_european',
 302                 'fra' => 'west_european',
 303                 'frb' => 'west_european',
 304                 'frc' => 'west_european',
 305                 'frs' => 'west_european',
 306                 'ell' => 'greek',
 307                 'hun' => 'east_european',
 308                 'isl' => 'west_euorpean',
 309                 'ita' => 'west_european',
 310                 'its' => 'west_european',
 311                 'jpn' => 'japanese',
 312                 'kor' => 'korean',
 313                 'nor' => 'west_european',
 314                 'non' => 'west_european',
 315                 'plk' => 'east_european',
 316                 'ptg' => 'west_european',
 317                 'ptb' => 'west_european',
 318                 'rus' => 'east_european',
 319                 'sky' => 'east_european',
 320                 'esp' => 'west_european',
 321                 'esm' => 'west_european',
 322                 'esn' => 'west_european',
 323                 'sve' => 'west_european',
 324                 'trk' => 'turkish',
 325                         // English language names
 326                 'bulgarian' => 'east_european',
 327                 'catalan' => 'west_european',
 328                 'croatian' => 'east_european',
 329                 'czech' => 'east_european',
 330                 'danish' => 'west_european',
 331                 'dutch' => 'west_european',
 332                 'english' => 'west_european',
 333                 'finnish' => 'west_european',
 334                 'french' => 'west_european',
 335                 'galician' => 'west_european',
 336                 'german' => 'west_european',
 337                 'hungarian' => 'east_european',
 338                 'icelandic' => 'west_european',
 339                 'italian' => 'west_european',
 340                 'latvian' => 'west_european',
 341                 'lettish' => 'west_european',
 342                 'norwegian' => 'west_european',
 343                 'polish' => 'east_european',
 344                 'portuguese' => 'west_european',
 345                 'russian' => 'cyrillic',
 346                 'romanian' => 'east_european',
 347                 'slovak' => 'east_european',
 348                 'slovenian' => 'east_european',
 349                 'spanish' => 'west_european',
 350                 'svedish' => 'west_european',
 351                 'turkish' => 'east_european',
 352                 'ukrainian' => 'cyrillic',
 353         );
 354
 355                 // mapping of language (family) names to charsets on Unix
 356         var $lang_to_charset_unix=array(
 357                 'west_european' => 'iso-8859-1',
 358                 'estonian' => 'iso-8859-1',
 359                 'east_european' => 'iso-8859-2',
 360                 'baltic' => 'iso-8859-4',
 361                 'cyrillic' => 'iso-8859-5',
 362                 'arabic' => 'iso-8859-6',
 363                 'greek' => 'iso-8859-7',
 364                 'hebrew' => 'iso-8859-8',
 365                 'turkish' => 'iso-8859-9',
 366                 'thai' => 'iso-8859-11', // = TIS-620
 367                 'lithuanian' => 'iso-8859-13',
 368                 'chinese' => 'gb2312', // = euc-cn
 369                 'japanese' => 'euc-jp',
 370                 'korean' => 'euc-kr',
 371                 'simpl_chinese' => 'gb2312',
 372                 'trad_chinese' => 'big5',
 373                 'vietnamese' => '',
 374         );
 375
 376                 // mapping of language (family) names to charsets on Windows
 377         var $lang_to_charset_windows=array(
 378                 'east_european' => 'windows-1250',
 379                 'cyrillic' => 'windows-1251',
 380                 'west_european' => 'windows-1252',
 381                 'greek' => 'windows-1253',
 382                 'turkish' => 'windows-1254',
 383                 'hebrew' => 'windows-1255',
 384                 'arabic' => 'windows-1256',
 385                 'baltic' => 'windows-1257',
 386                 'estonian' => 'windows-1257',
 387                 'lithuanian' => 'windows-1257',
 388                 'vietnamese' => 'windows-1258',
 389                 'thai' => 'cp874',
 390                 'korean' => 'cp949',
 391                 'chinese' => 'gb2312',
 392                 'japanese' => 'shift_jis',
 393                 'simpl_chinese' => 'gb2312',
 394                 'trad_chinese' => 'big5',
 395         );
 396
 397                 // mapping of locale names to charsets
 398         var $locale_to_charset=array(
 399                 'japanese.euc' => 'euc-jp',
 400                 'ja_jp.ujis' => 'euc-jp',
 401                 'korean.euc' => 'euc-kr',
 402                 'zh_cn' => 'gb2312',
 403                 'zh_hk' => 'big5',
 404                 'zh_tw' => 'big5',
 405         );
 406
 407                 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
 408                 // Empty values means "iso-8859-1"
 409         var $charSetArray = array(
 410                 'dk' => '',
 411                 'de' => '',
 412                 'no' => '',
 413                 'it' => '',
 414                 'fr' => '',
 415                 'es' => '',
 416                 'nl' => '',
 417                 'cz' => 'windows-1250',
 418                 'pl' => 'iso-8859-2',
 419                 'si' => 'windows-1250',
 420                 'fi' => '',
 421                 'tr' => 'iso-8859-9',
 422                 'se' => '',
 423                 'pt' => '',
 424                 'ru' => 'windows-1251',
 425                 'ro' => 'iso-8859-2',
 426                 'ch' => 'gb2312',
 427                 'sk' => 'windows-1250',
 428                 'lt' => 'windows-1257',
 429                 'is' => 'utf-8',
 430                 'hr' => 'windows-1250',
 431                 'hu' => 'iso-8859-2',
 432                 'gl' => '',
 433                 'th' => 'iso-8859-11',
 434                 'gr' => 'iso-8859-7',
 435                 'hk' => 'big5',
 436                 'eu' => '',
 437                 'bg' => 'windows-1251',
 438                 'br' => '',
 439                 'et' => 'iso-8859-4',
 440                 'ar' => 'iso-8859-6',
 441                 'he' => 'utf-8',
 442                 'ua' => 'windows-1251',
 443                 'jp' => 'shift_jis',
 444                 'lv' => 'utf-8',
 445                 'vn' => 'utf-8',
 446                 'ca' => 'iso-8859-15',
 447                 'ba' => 'iso-8859-2',
 448                 'kr' => 'euc-kr',
 449                 'eo' => 'utf-8',
 450                 'my' => '',
 451                 'hi' => 'utf-8',
 452         );
 453
 454                 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
 455                 // Empty values means sames as Typo3
 456         var $isoArray = array(
 457                 'dk' => 'da',
 458                 'de' => '',
 459                 'no' => '',
 460                 'it' => '',
 461                 'fr' => '',
 462                 'es' => '',
 463                 'nl' => '',
 464                 'cz' => 'cs',
 465                 'pl' => '',
 466                 'si' => 'sl',
 467                 'fi' => '',
 468                 'tr' => '',
 469                 'se' => 'sv',
 470                 'pt' => '',
 471                 'ru' => '',
 472                 'ro' => '',
 473                 'ch' => 'zh_CN',
 474                 'sk' => '',
 475                 'lt' => '',
 476                 'is' => '',
 477                 'hr' => '',
 478                 'hu' => '',
 479                 'gl' => '', // Greenlandic
 480                 'th' => '',
 481                 'gr' => 'el',
 482                 'hk' => 'zh_HK',
 483                 'eu' => '',
 484                 'bg' => '',
 485                 'br' => 'pt_BR',
 486                 'et' => '',
 487                 'ar' => '',
 488                 'he' => 'iw',
 489                 'ua' => 'uk',
 490                 'jp' => 'ja',
 491                 'lv' => '',
 492                 'vn' => 'vi',
 493                 'ca' => '',
 494                 'ba' => '', // Bosnian
 495                 'kr' => '',
 496         );
 497
 498         /**
 499          * Normalize - converts the charset name to lowercase and resolves known synonyms to their canonical name.
 500          *
 501          * @param       string          Input charset
 502          * @return      string          Normalized charset
 503          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 504          */
 505         function parse_charset($charset)        {
 506                 $charset = strtolower($charset);
 507                 if (isset($this->synonyms[$charset]))   $charset = $this->synonyms[$charset];
 508
 509                 return $charset;
 510         }
 511
 512         /**
 513          * Get the charset of a locale.
 514          *
 515          * ln            language
 516          * ln_CN         language / country
 517          * ln_CN.cs      language / country / charset
 518          * ln_CN.cs@mod  language / country / charset / modifier
 519          *
 520          * @param       string          Locale string
 521          * @return      string          Charset resolved for locale string
 522          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 523          */
 524         function get_locale_charset($locale)    {
 525                 $locale = strtolower($locale);
 526
 527                         // exact locale specific charset?
 528                 if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];
 529
 530                         // get modifier
 531                 list($locale,$modifier) = explode('@',$locale);
 532
 533                         // locale contains charset: use it
 534                 list($locale,$charset) = explode('.',$locale);
 535                 if ($charset)   return $this->parse_charset($charset);
 536
 537                         // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
 538                 if ($modifier == 'euro')        return 'iso-8859-15';
 539
 540                         // get language
 541                 list($language,$country) = explode('_',$locale);
 542                 if (isset($this->lang_to_langfamily[$language]))        $language = $this->lang_to_langfamily[$language];
 543
 544                 if (TYPO3_OS == 'WIN')  {
 545                         $cs = $this->lang_to_charset_windows[$language];
 546                 } else {
 547                         $cs = $this->lang_to_charset_unix[$language];
 548                 }
 549
 550                 return $cs ? $cs : 'iso-8859-1';
 551         }
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561         /********************************************
 562          *
 563          * Charset Conversion functions
 564          *
 565          ********************************************/
 566
 567         /**
 568          * Convert from one charset to another charset.
 569          *
 570          * @param       string          Input string
 571          * @param       string          From charset (the current charset of the string)
 572          * @param       string          To charset (the output charset wanted)
 573          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 574          * @return      string          Converted string
 575          * @see convArray()
 576          */
 577         function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
 578                 if ($fromCS==$toCS)     return $str;
 579
 580                         // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
 581                 if ($toCS=='utf-8' || !$useEntityForNoChar)     {
 582                         switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])       {
 583                         case 'mbstring':
 584                                 $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
 585                                 if (false !== $conv_str)        return $conv_str; // returns false for unsupported charsets
 586                                 break;
 587
 588                         case 'iconv':
 589                                 $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
 590                                 if (false !== $conv_str)        return $conv_str;
 591                                 break;
 592
 593                         case 'recode':
 594                                 $conv_str = recode_string($fromCS.'..'.$toCS,$str);
 595                                 if (false !== $conv_str)        return $conv_str;
 596                                 break;
 597                         }
 598                         // fallback to TYPO3 conversion
 599                 }
 600
 601                 if ($fromCS!='utf-8')   $str=$this->utf8_encode($str,$fromCS);
 602                 if ($toCS!='utf-8')     $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
 603                 return $str;
 604         }
 605
 606         /**
 607          * Convert all elements in ARRAY from one charset to another charset.
 608          * NOTICE: Array is passed by reference!
 609          *
 610          * @param       array           Input array, possibly multidimensional
 611          * @param       string          From charset (the current charset of the string)
 612          * @param       string          To charset (the output charset wanted)
 613          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 614          * @return      void
 615          * @see conv()
 616          */
 617         function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
 618                 foreach($array as $key => $value)       {
 619                         if (is_array($array[$key]))     {
 620                                 $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
 621                         } else {
 622                                 $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
 623                         }
 624                 }
 625         }
 626
 627         /**
 628          * Converts $str from $charset to UTF-8
 629          *
 630          * @param       string          String in local charset to convert to UTF-8
 631          * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
 632          * @return      string          Output string, converted to UTF-8
 633          */
 634         function utf8_encode($str,$charset)     {
 635
 636                 if ($charset === 'utf-8')       return $str;
 637
 638                         // Charset is case-insensitive.
 639                 if ($this->initCharset($charset))       {       // Parse conv. table if not already...
 640                         $strLen = strlen($str);
 641                         $outStr='';
 642
 643                         for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in string.
 644                                 $chr=substr($str,$a,1);
 645                                 $ord=ord($chr);
 646                                 if (isset($this->twoByteSets[$charset]))        {       // If the charset has two bytes per char
 647                                         $ord2 = ord($str{$a+1});
 648                                         $ord = $ord<<8 & $ord2; // assume big endian
 649
 650                                         if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 651                                                 $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
 652                                         } else $outStr.=chr($this->noCharByteVal);      // No char exists
 653                                         $a++;
 654                                 } elseif ($ord>127)     {       // If char has value over 127 it's a multibyte char in UTF-8
 655                                         if (isset($this->eucBasedSets[$charset]))       {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
 656                                                 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))    {       // Shift-JIS: chars between 160 and 223 are single byte
 657                                                         $a++;
 658                                                         $ord2=ord(substr($str,$a,1));
 659                                                         $ord = $ord*256+$ord2;
 660                                                 }
 661                                         }
 662
 663                                         if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 664                                                 $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
 665                                         } else $outStr.= chr($this->noCharByteVal);     // No char exists
 666                                 } else $outStr.= $chr;  // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 667                         }
 668                         return $outStr;
 669                 }
 670         }
 671
 672         /**
 673          * Converts $str from UTF-8 to $charset
 674          *
 675          * @param       string          String in UTF-8 to convert to local charset
 676          * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
 677          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 678          * @return      string          Output string, converted to local charset
 679          */
 function utf8_decode($str,$charset,$useEntityForNoChar=0) {
     // Translates UTF-8 input into the local charset by looking up every multibyte
     // sequence in the 'utf8' part of the parsed conversion table for $charset.
     // If the conversion table cannot be initialized nothing (NULL) is returned.
     if (!$this->initCharset($charset)) {
         return;
     }

     $result = '';
     $length = strlen($str);
     $pos = 0;

     while ($pos < $length) {
         $byte = substr($str, $pos, 1);
         $code = ord($byte);
         $pos++;

         // Bytes 0-127 are plain ASCII and are passed through untouched.
         if ($code <= 127) {
             $result .= $byte;
             continue;
         }

         // A lead byte always has bit 64 set. If it is missing we are in the
         // middle of a multibyte sequence and emit the "no char" byte instead.
         if (!($code & 64)) {
             $result .= chr($this->noCharByteVal);
             continue;
         }

         // Every additional high bit set in the lead byte announces one more
         // byte of the sequence; shift them out one at a time and collect the bytes.
         $sequence = $byte;
         for ($n = 0; $n < 8; $n++) {
             $code = $code << 1;
             if (!($code & 128)) {
                 break;
             }
             $sequence .= substr($str, $pos, 1);
             $pos++;
         }

         if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
             // Known sequence: write its local number, split into two bytes
             // (16 bit word assumed) when it does not fit into one.
             $local = $this->parsedCharsets[$charset]['utf8'][$sequence];
             $result .= ($local > 255) ? chr(($local >> 8) & 255).chr($local & 255) : chr($local);
         } elseif ($useEntityForNoChar) {
             // Unknown sequence, caller asked for a (hexadecimal) numeric entity.
             $result .= '&#'.$this->utf8CharToUnumber($sequence, 1).';';
         } else {
             // Unknown sequence, no entity wanted.
             $result .= chr($this->noCharByteVal);
         }
     }

     return $result;
 }
 716
 717         /**
 718          * Converts all chars > 127 to numeric entities.
 719          *
 720          * @param       string          Input string
 721          * @return      string          Output string
 722          */
 function utf8_to_entities($str) {
     // Walks the UTF-8 string byte by byte; ASCII is copied, every complete
     // multibyte sequence is replaced by a hexadecimal numeric entity.
     $result = '';
     $length = strlen($str);
     $pos = 0;

     while ($pos < $length) {
         $byte = substr($str, $pos, 1);
         $code = ord($byte);
         $pos++;

         // Bytes 0-127 are plain ASCII and are passed through untouched.
         if ($code <= 127) {
             $result .= $byte;
             continue;
         }

         // Not a lead byte (bit 64 missing): we are in the middle of a
         // multibyte sequence, so the "no char" byte is written.
         if (!($code & 64)) {
             $result .= chr($this->noCharByteVal);
             continue;
         }

         // Collect one following byte for each further high bit set in the lead byte.
         $sequence = $byte;
         for ($n = 0; $n < 8; $n++) {
             $code = $code << 1;
             if (!($code & 128)) {
                 break;
             }
             $sequence .= substr($str, $pos, 1);
             $pos++;
         }

         $result .= '&#'.$this->utf8CharToUnumber($sequence, 1).';';
     }

     return $result;
 }
 748
 749         /**
 750          * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 751          *
 752          * @param       string          Input string, UTF-8
 753          * @param       boolean         If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
 754          * @return      string          Output string
 755          */
 function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
     // Replaces numeric entities (decimal "&#1234;" and hexadecimal "&#x1b;" / "&#X1B;")
     // by their UTF-8 characters. If $alsoStdHtmlEnt is set, named HTML entities known
     // to get_html_translation_table() are converted as well. Everything that cannot be
     // converted (unknown names, malformed numeric entities like "&#;") is left as it is.
     $trans_tbl = array();
     if ($alsoStdHtmlEnt) {
         $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
     }

     // Split on "&...;" and keep the part between "&" and ";" as a captured delimiter.
     // The result alternates: even keys = plain text, odd keys = entity name/number.
     // (Uses PCRE because the POSIX ereg_* functions were removed in PHP 7.0.)
     $parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
     if (!is_array($parts)) {
         return $str;    // regex engine failure: return input unchanged
     }

     foreach ($parts as $k => $v) {
         if (!($k % 2)) {
             continue;   // plain text between entities
         }

         $entity = '&'.$v.';';
         $match = array();
         if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
             // Hexadecimal entity, "x" accepted in both cases
             $parts[$k] = $this->UnumberToChar(hexdec($match[1]));
         } elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
             // Decimal entity
             $parts[$k] = $this->UnumberToChar(intval($match[1]));
         } elseif ($alsoStdHtmlEnt && isset($trans_tbl[$entity])) {
             // Named entity. Up to PHP 5.3 the table holds iso-8859-1 (single byte)
             // values which have to be converted; from PHP 5.4 on the default table
             // is UTF-8, so a multibyte value is already in the target encoding.
             $char = $trans_tbl[$entity];
             $parts[$k] = strlen($char) > 1 ? $char : $this->utf8_encode($char, 'iso-8859-1');
         } else {
             // No conversion possible: restore the original text
             $parts[$k] = $entity;
         }
     }

     return implode('', $parts);
 }
 781
 782         /**
 783          * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 784          *
 785          * @param       string          Input string, UTF-8
 786          * @param       boolean         If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 787          * @param       boolean         If set, then instead of integer numbers the real UTF-8 char is returned.
 788          * @return      array           Output array with the char numbers
 789          */
 function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
     // Splits a UTF-8 string into its characters. Each array element is either the
     // UNICODE number of the character or, if $retChar is set, the character itself.

     // Turn entities into real characters first, if requested:
     if ($convEntities) {
         $str = $this->entities_to_utf8($str, 1);
     }

     $numbers = array();
     $length = strlen($str);
     $pos = 0;

     while ($pos < $length) {
         $byte = substr($str, $pos, 1);
         $code = ord($byte);
         $pos++;

         // ASCII 0-127: one byte, one character.
         if ($code <= 127) {
             $numbers[] = $retChar ? $byte : $code;
             continue;
         }

         // Not a lead byte (bit 64 missing): we are in the middle of a
         // multibyte sequence, so the "no char" value is registered.
         if (!($code & 64)) {
             $numbers[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
             continue;
         }

         // Collect one following byte for each further high bit set in the lead byte.
         $sequence = $byte;
         for ($n = 0; $n < 8; $n++) {
             $code = $code << 1;
             if (!($code & 128)) {
                 break;
             }
             $sequence .= substr($str, $pos, 1);
             $pos++;
         }

         $numbers[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
     }

     return $numbers;
 }
 820
 821         /**
 822          * Converts a UNICODE number to a UTF-8 multibyte character
 823          * Algorithm based on a script found at http://czyborra.com/utf/
 824          * Unit-tested by Kasper
 825          *
 826          * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 827          *
 828          *  bytes | bits | representation
 829          *      1 |    7 | 0vvvvvvv
 830          *      2 |   11 | 110vvvvv 10vvvvvv
 831          *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 832          *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 833          *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 834          *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 835          *
 836          * @param       integer         UNICODE integer
 837          * @return      string          UTF-8 multibyte character string
 838          * @see utf8CharToUnumber()
 839          */
 function UnumberToChar($cbyte) {
     // Encodes a UNICODE number as a UTF-8 byte sequence of 1 to 6 bytes.

     // 7 bit values are a single byte:
     if ($cbyte < 0x80) {
         return chr($cbyte);
     }

     // Pick the lead byte marker and the number of trailing bytes from the value range:
     if ($cbyte < 0x800) {
         $marker = 0xC0;
         $trailing = 1;
     } elseif ($cbyte < 0x10000) {
         $marker = 0xE0;
         $trailing = 2;
     } elseif ($cbyte < 0x200000) {
         $marker = 0xF0;
         $trailing = 3;
     } elseif ($cbyte < 0x4000000) {
         $marker = 0xF8;
         $trailing = 4;
     } elseif ($cbyte < 0x80000000) {
         $marker = 0xFC;
         $trailing = 5;
     } else {
         // A 32-bit character cannot be expressed in UTF-8
         return chr($this->noCharByteVal);
     }

     // The lead byte carries the marker plus the topmost bits of the value ...
     $char = chr($marker | ($cbyte >> (6 * $trailing)));
     // ... and each trailing byte carries the next 6 bits, prefixed with binary "10".
     for ($n = $trailing - 1; $n >= 0; $n--) {
         $char .= chr(0x80 | (($cbyte >> (6 * $n)) & 0x3F));
     }

     return $char;
 }
 875
 876         /**
 877          * Converts a UTF-8 Multibyte character to a UNICODE number
 878          * Unit-tested by Kasper
 879          *
 880          * @param       string          UTF-8 multibyte character string
 881          * @param       boolean         If set, then a hex. number is returned.
 882          * @return      integer         UNICODE integer
 883          * @see UnumberToChar()
 884          */
 function utf8CharToUnumber($str,$hex=0) {
     // Decodes the first UTF-8 character of $str into its UNICODE number.
     // Returns an integer, or the string "x" + hexadecimal number if $hex is set.
     // For a first byte that is not a multibyte lead byte the byte value itself is returned.
     $lead = ord(substr($str, 0, 1));

     if (($lead & 192) == 192) {     // Both top bits set: this IS a multibyte lead byte
         $int = 0;
         $shifted = $lead;
         for ($b = 0; $b < 8; $b++) {
             $shifted = $shifted << 1;
             if (!($shifted & 128)) {
                 break;  // No more high bits in the lead byte: sequence is complete
             }
                 // Each following byte contributes its lower 6 bits
             $int = ($int << 6) | (ord(substr($str, $b + 1, 1)) & 0x3F);
         }

         // $b is now the number of following bytes. The lead byte contributes its
         // lowest (6-$b) bits as the most significant part of the number. For the
         // invalid lead bytes 0xFE/0xFF no payload bits are left, so nothing is added
         // (the former string based code leaked marker bits into the result here).
         $leadBits = 6 - $b;
         if ($leadBits > 0) {
             $int |= ($lead & ((1 << $leadBits) - 1)) << (6 * $b);
         }
     } else {
         $int = $lead;
     }

     return $hex ? 'x'.dechex($int) : $int;
 }
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912         /********************************************
 913          *
 914          * Init functions
 915          *
 916          ********************************************/
 917
 918         /**
 919          * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 920          * This function is automatically called by the conversion functions
 921          *
 922          * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 923          *
 924          * @param       string          The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 925          * @return      integer         Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 926          * @access private
 927          */
 function initCharset($charset) {
     // Loads the conversion table for $charset into $this->parsedCharsets[$charset],
     // which holds two maps: 'local' (local char number => UTF-8 char) and
     // 'utf8' (UTF-8 char => local char number). Only local numbers > 127 are stored.
     // Returns 1 if the table was loaded before, 2 if it was loaded now (from the cache
     // file or by parsing the .tbl file) and FALSE if no conversion table file exists.

     // Nothing to do if the charset is already loaded:
     if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
         return 1;
     }

     // Conversion table filename:
     $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
     if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
         return false;
     }

     // Try the cache file first (the cache only saves the parsing of the .tbl file).
     // NOTE(review): the cache content is passed to unserialize(); this is only safe
     // as long as the typo3temp/ folder cannot be written by untrusted parties.
     $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
     if ($cacheFile && @is_file($cacheFile)) {
         $cached = unserialize(t3lib_div::getUrl($cacheFile));
         if (is_array($cached) && isset($cached['local']) && isset($cached['utf8'])) {
             $this->parsedCharsets[$charset] = $cached;
             return 2;
         }
         // Unreadable or corrupt cache: fall through, parse the table again and rewrite the cache.
     }

     // Two line formats exist. "whitespaced": "0x0A   0x000A  #LINE FEED"
     // and "ms-token": "B9 = U+00B9 : SUPERSCRIPT ONE". (PCRE is used because
     // the POSIX functions ereg()/split() were removed in PHP 7.0.)
     $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

     $lines = t3lib_div::trimExplode(chr(10), t3lib_div::getUrl($charsetConvTableFile), 1);
     $table = array('local' => array(), 'utf8' => array());
     $detectedType = '';

     foreach ($lines as $value) {
         // Comment lines and blanks are ignored:
         if (!trim($value) || substr($value, 0, 1) == '#') {
             continue;
         }

         // The format is detected once, on the first real line:
         if (!$detectedType) {
             $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
         }

         if ($detectedType == 'ms-token') {
             $tokens = preg_split('/=|:/', $value, 3);
             if (!is_array($tokens) || count($tokens) < 2) {
                 continue;   // malformed line, no UNICODE number given
             }
             $hexbyte = $tokens[0];
             $utf8 = $tokens[1];
         } else {
             $regA = array();
             if (!preg_match($whitespacedPattern, $value, $regA)) {
                 continue;   // malformed line or entry without UNICODE number
             }
             $hexbyte = $regA[1];
             $utf8 = 'U+'.$regA[2];
         }

         $decval = hexdec(trim($hexbyte));
         if ($decval > 127) {
             // Strip the leading "U+" and convert the UNICODE number to its UTF-8 char:
             $utf8decval = hexdec(substr(trim($utf8), 2));
             $utf8char = $this->UnumberToChar($utf8decval);
             $table['local'][$decval] = $utf8char;
             $table['utf8'][$utf8char] = $decval;
         }
     }

     $this->parsedCharsets[$charset] = $table;

     if ($cacheFile) {
         t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
     }

     return 2;
 }
 980
 981         /**
 982          * This function initializes all UTF-8 character data tables.
 983          *
 984          * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 985          *
 986          * @param       string          Mode ("case", "ascii", ...)
 987          * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 988          * @access private
 989          */
 990         function initUnicodeData($mode=null)    {
 991                         // cache files
 992                 $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
 993                 $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
 994
 995                         // Only process if the tables are not yet loaded
 996                 switch($mode)   {
 997                         case 'case':
 998                                 if (is_array($this->caseFolding['utf-8']))      return 1;
 999
1000                                         // Use cached version if possible
1001                                 if ($cacheFileCase && @is_file($cacheFileCase)) {
1002                                         $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
1003                                         return 2;
1004                                 }
1005                                 break;
1006
1007                         case 'ascii':
1008                                 if (is_array($this->toASCII['utf-8']))  return 1;
1009
1010                                         // Use cached version if possible
1011                                 if ($cacheFileASCII && @is_file($cacheFileASCII))       {
1012                                         $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
1013                                         return 2;
1014                                 }
1015                                 break;
1016                 }
1017
1018                         // process main Unicode data file
1019                 $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
1020                 if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
1021
1022                 $fh = fopen($unicodeDataFile,'rb');
1023                 if (!$fh)       return false;
1024
1025                         // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1026                         // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1027                 $this->caseFolding['utf-8'] = array();
1028                 $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
1029                 $utf8CaseFolding['toUpper'] = array();
1030                 $utf8CaseFolding['toLower'] = array();
1031                 $utf8CaseFolding['toTitle'] = array();
1032
1033                 $decomposition = array();       // array of temp. decompositions
1034                 $mark = array();                // array of chars that are marks (eg. composing accents)
1035                 $number = array();              // array of chars that are numbers (eg. digits)
1036                 $omit = array();                // array of chars to be omitted (eg. Russian hard sign)
1037
1038                 while (!feof($fh))      {
1039                         $line = fgets($fh,4096);
1040                                 // has a lot of info
1041                         list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
1042
1043                         $ord = hexdec($char);
1044                         if ($ord > 0xFFFF)      break;  // only process the BMP
1045
1046                         $utf8_char = $this->UnumberToChar($ord);
1047
1048                         if ($upper)     $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1049                         if ($lower)     $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1050                                 // store "title" only when different from "upper" (only a few)
1051                         if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1052
1053                         switch ($cat{0})        {
1054                                 case 'M':       // mark (accent, umlaut, ...)
1055                                         $mark["U+$char"] = 1;
1056                                         break;
1057
1058                                 case 'N':       // numeric value
1059                                         if ($ord > 0x80 && $num != '')  $number["U+$char"] = $num;
1060                         }
1061
1062                                 // accented Latin letters without "official" decomposition
1063                         $match = array();
1064                         if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp)        {
1065                                 $c = ord($match[2]);
1066                                 if ($match[1] == 'SMALL')       $c += 32;
1067
1068                                 $decomposition["U+$char"] = array(dechex($c));
1069                                 continue;
1070                         }
1071
1072                         $match = array();
1073                         if (ereg('(<.*>)? *(.+)',$decomp,$match))       {
1074                                 switch($match[1])       {
1075                                         case '<circle>':        // add parenthesis as circle replacement, eg (1)
1076                                                 $match[2] = '0028 '.$match[2].' 0029';
1077                                                 break;
1078
1079                                         case '<square>':        // add square brackets as square replacement, eg [1]
1080                                                 $match[2] = '005B '.$match[2].' 005D';
1081                                                 break;
1082
1083                                         case '<compat>':        // ignore multi char decompositions that start with a space
1084                                                 if (ereg('^0020 ',$match[2]))   continue 2;
1085                                                 break;
1086
1087                                                 // ignore Arabic and vertical layout presentation decomposition
1088                                         case '<initial>':
1089                                         case '<medial>':
1090                                         case '<final>':
1091                                         case '<isolated>':
1092                                         case '<vertical>':
1093                                                 continue 2;
1094                                 }
1095                                 $decomposition["U+$char"] = split(' ',$match[2]);
1096                         }
1097                 }
1098                 fclose($fh);
1099
1100                         // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1101                 $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
1102                 if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))        {
1103                         $fh = fopen($specialCasingFile,'rb');
1104                         if ($fh)        {
1105                                 while (!feof($fh))      {
1106                                         $line = fgets($fh,4096);
1107                                         if ($line{0} != '#' && trim($line) != '')       {
1108
1109                                                 list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
1110                                                 if ($cond == '' || $cond{0} == '#')     {
1111                                                         $utf8_char = $this->UnumberToChar(hexdec($char));
1112                                                         if ($char != $lower)    {
1113                                                                 $arr = split(' ',$lower);
1114                                                                 for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1115                                                                 $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
1116                                                         }
1117                                                         if ($char != $title && $title != $upper)        {
1118                                                                 $arr = split(' ',$title);
1119                                                                 for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1120                                                                 $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
1121                                                         }
1122                                                         if ($char != $upper)    {
1123                                                                         $arr = split(' ',$upper);
1124                                                                 for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1125                                                                 $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
1126                                                         }
1127                                                 }
1128                                         }
1129                                 }
1130                                 fclose($fh);
1131                         }
1132                 }
1133
1134                         // process custom decompositions
1135                 $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
1136                 if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))      {
1137                         $fh = fopen($customTranslitFile,'rb');
1138                         if ($fh)        {
1139                                 while (!feof($fh))      {
1140                                         $line = fgets($fh,4096);
1141                                         if ($line{0} != '#' && trim($line) != '')       {
1142                                                 list($char,$translit) = t3lib_div::trimExplode(';', $line);
1143                                                 if (!$translit) $omit["U+$char"] = 1;
1144                                                 $decomposition["U+$char"] = split(' ', $translit);
1145
1146                                         }
1147                                 }
1148                                 fclose($fh);
1149                         }
1150                 }
1151
1152                         // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1153                 foreach($decomposition as $from => $to) {
1154                         $code_decomp = array();
1155
1156                         while ($code_value = array_shift($to))  {
1157                                 if (isset($decomposition["U+$code_value"]))     {       // do recursive decomposition
1158                                         foreach(array_reverse($decomposition["U+$code_value"]) as $cv)  {
1159                                                 array_unshift($to, $cv);
1160                                         }
1161                                 } elseif (!isset($mark["U+$code_value"])) {     // remove mark
1162                                         array_push($code_decomp, $code_value);
1163                                 }
1164                         }
1165                         if (count($code_decomp) || isset($omit[$from])) {
1166                                 $decomposition[$from] = $code_decomp;
1167                         } else {
1168                                 unset($decomposition[$from]);
1169                         }
1170                 }
1171
1172                         // create ascii only mapping
1173                 $this->toASCII['utf-8'] = array();
1174                 $ascii =& $this->toASCII['utf-8'];
1175
1176                 foreach($decomposition as $from => $to) {
1177                         $code_decomp = array();
1178                         while ($code_value = array_shift($to))  {
1179                                 $ord = hexdec($code_value);
1180                                 if ($ord > 127)
1181                                         continue 2;     // skip decompositions containing non-ASCII chars
1182                                 else
1183                                         array_push($code_decomp,chr($ord));
1184                         }
1185                         $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
1186                 }
1187
1188                         // add numeric decompositions
1189                 foreach($number as $from => $to)        {
1190                         $utf8_char = $this->UnumberToChar(hexdec($from));
1191                         if (!isset($ascii[$utf8_char])) {
1192                                 $ascii[$utf8_char] = $to;
1193                         }
1194                 }
1195
1196                 if ($cacheFileCase)     {
1197                                 t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
1198                 }
1199
1200                 if ($cacheFileASCII)    {
1201                                 t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
1202                 }
1203
1204                 return 3;
1205         }
1206
1207         /**
1208          * This function initializes the folding table for a charset other than UTF-8.
1209          * This function is automatically called by the case folding functions.
1210          *
1211          * @param       string          Charset for which to initialize case folding.
1212          * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1213          * @access private
1214          */
1215         function initCaseFolding($charset)      {
1216                         // Only process if the case table is not yet loaded:
1217                 if (is_array($this->caseFolding[$charset]))     return 1;
1218
1219                         // Use cached version if possible
1220                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
1221                 if ($cacheFile && @is_file($cacheFile)) {
1222                         $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1223                         return 2;
1224                 }
1225
1226                         // init UTF-8 conversion for this charset
1227                 if (!$this->initCharset($charset))      {
1228                         return false;
1229                 }
1230
1231                         // UTF-8 case folding is used as the base conversion table
1232                 if (!$this->initUnicodeData('case'))    {
1233                         return false;
1234                 }
1235
1236                 $nochar = chr($this->noCharByteVal);
1237                 foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)      {
1238                                 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1239                         $c = $this->utf8_decode($utf8, $charset);
1240
1241                                 // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
1242                         $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
1243                         if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toUpper'][$c] = $cc;
1244
1245                                 // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
1246                         $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
1247                         if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toLower'][$c] = $cc;
1248
1249                                 // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
1250                         $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
1251                         if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toTitle'][$c] = $cc;
1252                 }
1253
1254                         // add the ASCII case table
1255                 for ($i=ord('a'); $i<=ord('z'); $i++)   {
1256                         $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
1257                 }
1258                 for ($i=ord('A'); $i<=ord('Z'); $i++)   {
1259                         $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
1260                 }
1261
1262                 if ($cacheFile) {
1263                                 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
1264                 }
1265
1266                 return 3;
1267         }
1268
1269         /**
1270          * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1271          * This function is automatically called by the ASCII transliteration functions.
1272          *
1273          * @param       string          Charset for which to initialize conversion.
1274          * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1275          * @access private
1276          */
1277         function initToASCII($charset)  {
1278                         // Only process if the case table is not yet loaded:
1279                 if (is_array($this->toASCII[$charset])) return 1;
1280
1281                         // Use cached version if possible
1282                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
1283                 if ($cacheFile && @is_file($cacheFile)) {
1284                         $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1285                         return 2;
1286                 }
1287
1288                         // init UTF-8 conversion for this charset
1289                 if (!$this->initCharset($charset))      {
1290                         return false;
1291                 }
1292
1293                         // UTF-8/ASCII transliteration is used as the base conversion table
1294                 if (!$this->initUnicodeData('ascii'))   {
1295                         return false;
1296                 }
1297
1298                 $nochar = chr($this->noCharByteVal);
1299                 foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)      {
1300                                 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1301                         $c = $this->utf8_decode($utf8, $charset);
1302
1303                         if (isset($this->toASCII['utf-8'][$utf8]))      {
1304                                 $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
1305                         }
1306                 }
1307
1308                 if ($cacheFile) {
1309                                 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
1310                 }
1311
1312                 return 3;
1313         }
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330         /********************************************
1331          *
1332          * String operation functions
1333          *
1334          ********************************************/
1335
1336         /**
1337          * Returns a part of a string.
1338          * Unit-tested by Kasper (single byte charsets only)
1339          *
1340          * @param       string          The character set
1341          * @param       string          Character string
1342          * @param       integer         Start position (character position)
1343          * @param       integer         Length (in characters)
1344          * @return      string          The substring
1345          * @see substr(), mb_substr()
1346          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1347          */
1348         function substr($charset,$string,$start,$len=null)      {
1349                 if ($len===0)   return '';
1350
1351                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1352                                 // cannot omit $len, when specifying charset
1353                         if ($len==null) {
1354                                 $enc = mb_internal_encoding();  // save internal encoding
1355                                 mb_internal_encoding($charset);
1356                                 $str = mb_substr($string,$start);
1357                                 mb_internal_encoding($enc);     // restore internal encoding
1358
1359                                 return $str;
1360                         }
1361                         else {
1362                                 return mb_substr($string,$start,$len,$charset);
1363                         }
1364                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
1365                                 // cannot omit $len, when specifying charset
1366                         if ($len==null) {
1367                                 $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
1368                                 iconv_set_encoding('internal_encoding',$charset);
1369                                 $str = iconv_substr($string,$start);
1370                                 iconv_set_encoding('internal_encoding',$enc);   // restore internal encoding
1371
1372                                 return $str;
1373                         }
1374                         else {
1375                                 return iconv_substr($string,$start,$len,$charset);
1376                         }
1377                 } elseif ($charset == 'utf-8')  {
1378                         return $this->utf8_substr($string,$start,$len);
1379                 } elseif ($this->eucBasedSets[$charset])        {
1380                         return $this->euc_substr($string,$start,$charset,$len);
1381                 } elseif ($this->twoByteSets[$charset]) {
1382                         return substr($string,$start*2,$len*2);
1383                 } elseif ($this->fourByteSets[$charset])        {
1384                         return substr($string,$start*4,$len*4);
1385                 }
1386
1387                 // treat everything else as single-byte encoding
1388                 return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
1389         }
1390
1391         /**
1392          * Counts the number of characters.
1393          * Unit-tested by Kasper (single byte charsets only)
1394          *
1395          * @param       string          The character set
1396          * @param       string          Character string
1397          * @return      integer         The number of characters
1398          * @see strlen()
1399          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1400          */
1401         function strlen($charset,$string)       {
1402                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1403                         return mb_strlen($string,$charset);
1404                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
1405                         return iconv_strlen($string,$charset);
1406                 } elseif ($charset == 'utf-8')  {
1407                         return $this->utf8_strlen($string);
1408                 } elseif ($this->eucBasedSets[$charset])        {
1409                         return $this->euc_strlen($string,$charset);
1410                 } elseif ($this->twoByteSets[$charset]) {
1411                         return strlen($string)/2;
1412                 } elseif ($this->fourByteSets[$charset])        {
1413                         return strlen($string)/4;
1414                 }
1415                 // treat everything else as single-byte encoding
1416                 return strlen($string);
1417         }
1418
1419         /**
1420          * Truncates a string and pre-/appends a string.
1421          * Unit tested by Kasper
1422          *
1423          * @param       string          The character set
1424          * @param       string          Character string
1425          * @param       integer         Length (in characters)
1426          * @param       string          Crop signifier
1427          * @return      string          The shortened string
1428          * @see substr(), mb_strimwidth()
1429          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1430          */
1431         function crop($charset,$string,$len,$crop='')   {
1432                 if (intval($len) == 0)  return $string;
1433
1434                 if ($charset == 'utf-8')        {
1435                         $i = $this->utf8_char2byte_pos($string,$len);
1436                 } elseif ($this->eucBasedSets[$charset])        {
1437                         $i = $this->euc_char2byte_pos($string,$len,$charset);
1438                 } else {
1439                         if ($len > 0)   {
1440                                 $i = $len;
1441                         } else {
1442                                 $i = strlen($string)+$len;
1443                                 if ($i<=0)      $i = false;
1444                         }
1445                 }
1446
1447                 if ($i === false)       {       // $len outside actual string length
1448                         return $string;
1449                 } else  {
1450                         if ($len > 0)   {
1451                                 if (strlen($string{$i}))        {
1452                                         return substr($string,0,$i).$crop;
1453
1454                                 }
1455                         } else {
1456                                 if (strlen($string{$i-1}))      {
1457                                         return $crop.substr($string,$i);
1458                                 }
1459                         }
1460
1461 /*
1462                         if (abs($len)<$this->strlen($charset,$string))  {       // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
1463                                 if ($len > 0)   {
1464                                         return substr($string,0,$i).$crop;
1465                                 } else {
1466                                         return $crop.substr($string,$i);
1467                                 }
1468                         }
1469 */
1470                 }
1471                 return $string;
1472         }
1473
1474         /**
1475          * Cuts a string short at a given byte length.
1476          *
1477          * @param       string          The character set
1478          * @param       string          Character string
1479          * @param       integer         The byte length
1480          * @return      string          The shortened string
1481          * @see mb_strcut()
1482          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1483          */
1484         function strtrunc($charset,$string,$len)        {
1485                 if ($len <= 0)  return '';
1486
1487                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1488                         return mb_strcut($string,0,$len,$charset);
1489                 } elseif ($charset == 'utf-8')  {
1490                         return $this->utf8_strtrunc($string,$len);
1491                 } elseif ($this->eucBasedSets[$charset])        {
1492                         return $this->euc_strtrunc($string,$charset);
1493                 } elseif ($this->twoByteSets[$charset]) {
1494                         if ($len % 2)   $len--;         // don't cut at odd positions
1495                 } elseif ($this->fourByteSets[$charset])        {
1496                         $x = $len % 4;
1497                         $len -= $x;     // realign to position dividable by four
1498                 }
1499                 // treat everything else as single-byte encoding
1500                 return substr($string,0,$len);
1501         }
1502
1503         /**
1504          * Translates all characters of a string into their respective case values.
1505          * Unlike strtolower() and strtoupper() this method is locale independent.
1506          * Note that the string length may change!
1507          * eg. lower case German �(sharp S) becomes upper case "SS"
1508          * Unit-tested by Kasper
1509          * Real case folding is language dependent, this method ignores this fact.
1510          *
1511          * @param       string          Character set of string
1512          * @param       string          Input string to convert case for
1513          * @param       string          Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1514          * @return      string          The converted string
1515          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1516          * @see strtolower(), strtoupper()
1517          */
1518         function conv_case($charset,$string,$case)      {
1519                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3)   {
1520                         if ($case == 'toLower') {
1521                                 return mb_strtolower($string,$charset);
1522                         } else {
1523                                 return mb_strtoupper($string,$charset);
1524                         }
1525                 } elseif ($charset == 'utf-8')  {
1526                         return $this->utf8_char_mapping($string,'case',$case);
1527                 } elseif (isset($this->eucBasedSets[$charset])) {
1528                         return $this->euc_char_mapping($string,$charset,'case',$case);
1529                 } else {
1530                                 // treat everything else as single-byte encoding
1531                         return $this->sb_char_mapping($string,$charset,'case',$case);
1532                 }
1533
1534                 return $string;
1535         }
1536
1537         /**
1538          * Converts special chars (like ���, umlauts etc) to ascii equivalents (usually double-bytes, like �=> ae etc.)
1539          *
1540          * @param       string          Character set of string
1541          * @param       string          Input string to convert
1542          * @return      string          The converted string
1543          */
1544         function specCharsToASCII($charset,$string)     {
1545                 if ($charset == 'utf-8')        {
1546                         return $this->utf8_char_mapping($string,'ascii');
1547                 } elseif (isset($this->eucBasedSets[$charset])) {
1548                         return $this->euc_char_mapping($string,$charset,'ascii');
1549                 } else {
1550                                 // treat everything else as single-byte encoding
1551                         return $this->sb_char_mapping($string,$charset,'ascii');
1552                 }
1553
1554                 return $string;
1555         }
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568         /********************************************
1569          *
1570          * Internal string operation functions
1571          *
1572          ********************************************/
1573
1574         /**
1575          * Maps all characters of a string in a single byte charset.
1576          *
1577          * @param       string          the string
1578          * @param       string          the charset
1579          * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1580          * @param       string          'case': conversion 'toLower' or 'toUpper'
1581          * @return      string          the converted string
1582          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1583          */
1584         function sb_char_mapping($str,$charset,$mode,$opt='')   {
1585                 switch($mode)   {
1586                         case 'case':
1587                                 if (!$this->initCaseFolding($charset))  return $str;    // do nothing
1588                                 $map =& $this->caseFolding[$charset][$opt];
1589                                 break;
1590
1591                         case 'ascii':
1592                                 if (!$this->initToASCII($charset))      return $str;    // do nothing
1593                                 $map =& $this->toASCII[$charset];
1594                                 break;
1595
1596                         default:
1597                                 return $str;
1598                 }
1599
1600                 $out = '';
1601                 for($i=0; strlen($str{$i}); $i++)       {
1602                         $c = $str{$i};
1603                         if (isset($map[$c]))    {
1604                                 $out .= $map[$c];
1605                         } else {
1606                                 $out .= $c;
1607                         }
1608                 }
1609
1610                 return $out;
1611         }
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622         /********************************************
1623          *
1624          * Internal UTF-8 string operation functions
1625          *
1626          ********************************************/
1627
1628         /**
1629          * Returns a part of a UTF-8 string.
1630          * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1631          *
1632          * @param       string          UTF-8 string
1633          * @param       integer         Start position (character position)
1634          * @param       integer         Length (in characters)
1635          * @return      string          The substring
1636          * @see substr()
1637          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1638          */
1639         function utf8_substr($str,$start,$len=null)     {
1640                 if (!strcmp($len,'0'))  return '';
1641
1642                 $byte_start = $this->utf8_char2byte_pos($str,$start);
1643                 if ($byte_start === false)      {
1644                         if ($start > 0) {
1645                                 return false;   // $start outside string length
1646                         } else {
1647                                 $start = 0;
1648                         }
1649                 }
1650
1651                 $str = substr($str,$byte_start);
1652
1653                 if ($len!=null) {
1654                         $byte_end = $this->utf8_char2byte_pos($str,$len);
1655                         if ($byte_end === false)        // $len outside actual string length
1656                                 return $len<0 ? '' : $str;      // When length is less than zero and exceeds, then we return blank string.
1657                         else
1658                                 return substr($str,0,$byte_end);
1659                 }
1660                 else    return $str;
1661         }
1662
1663         /**
1664          * Counts the number of characters of a string in UTF-8.
1665          * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1666          *
1667          * @param       string          UTF-8 multibyte character string
1668          * @return      integer         The number of characters
1669          * @see strlen()
1670          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1671          */
1672         function utf8_strlen($str)      {
1673                 $n=0;
1674                 for($i=0; strlen($str{$i}); $i++)       {
1675                         $c = ord($str{$i});
1676                         if (!($c & 0x80))       // single-byte (0xxxxxx)
1677                                 $n++;
1678                         elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
1679                                 $n++;
1680                 }
1681                 return $n;
1682         }
1683
1684         /**
1685          * Truncates a string in UTF-8 short at a given byte length.
1686          *
1687          * @param       string          UTF-8 multibyte character string
1688          * @param       integer         the byte length
1689          * @return      string          the shortened string
1690          * @see mb_strcut()
1691          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1692          */
1693         function utf8_strtrunc($str,$len)       {
1694                 $i = $len-1;
1695                 if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
1696                         for (; $i>0 && !(ord($str{$i}) & 0x40); $i--)   ;       // find the first byte
1697                         if ($i <= 0)    return ''; // sanity check
1698                         for ($bc=0, $mbs=ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of bytes
1699                         if ($bc+$i > $len)      return substr($str,0,$i);
1700                         // fallthru: multibyte char fits into length
1701                 }
1702                 return substr($str,0,$len);
1703         }
1704
1705         /**
1706          * Find position of first occurrence of a string, both arguments are in UTF-8.
1707          *
1708          * @param       string          UTF-8 string to search in
1709          * @param       string          UTF-8 string to search for
1710          * @param       integer         Positition to start the search
1711          * @return      integer         The character position
1712          * @see strpos()
1713          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1714          */
1715         function utf8_strpos($haystack,$needle,$offset=0)       {
1716                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1717                         return mb_strpos($haystack,$needle,$offset,'utf-8');
1718                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
1719                         return iconv_strpos($haystack,$needle,$offset,'utf-8');
1720                 }
1721
1722                 $byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
1723                 if ($byte_offset === false)     return false; // offset beyond string length
1724
1725                 $byte_pos = strpos($haystack,$needle,$byte_offset);
1726                 if ($byte_pos === false)        return false; // needle not found
1727
1728                 return $this->utf8_byte2char_pos($haystack,$byte_pos);
1729         }
1730
1731         /**
1732          * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1733          *
1734          * @param       string          UTF-8 string to search in
1735          * @param       string          UTF-8 character to search for (single character)
1736          * @return      integer         The character position
1737          * @see strrpos()
1738          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1739          */
1740         function utf8_strrpos($haystack,$needle)        {
1741                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1742                         return mb_strrpos($haystack,$needle,'utf-8');
1743                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
1744                         return iconv_strrpos($haystack,$needle,$offset,'utf-8');
1745                 }
1746
1747                 $byte_pos = strrpos($haystack,$needle);
1748                 if ($byte_pos === false)        return false; // needle not found
1749
1750                 return $this->utf8_byte2char_pos($haystack,$byte_pos);
1751         }
1752
1753         /**
1754          * Translates a character position into an 'absolute' byte position.
1755          * Unit tested by Kasper.
1756          *
1757          * @param       string          UTF-8 string
1758          * @param       integer         Character position (negative values start from the end)
1759          * @return      integer         Byte position
1760          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
1761          */
1762         function utf8_char2byte_pos($str,$pos)  {
1763                 $n = 0;                         // number of characters found
1764                 $p = abs($pos);         // number of characters wanted
1765
1766                 if ($pos >= 0)  {
1767                         $i = 0;
1768                         $d = 1;
1769                 } else {
1770                         $i = strlen($str)-1;
1771                         $d = -1;
1772                 }
1773
1774                 for( ; strlen($str{$i}) && $n<$p; $i+=$d)       {
1775                         $c = (int)ord($str{$i});
1776                         if (!($c & 0x80))       // single-byte (0xxxxxx)
1777                                 $n++;
1778                         elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
1779                                 $n++;
1780                 }
1781                 if (!strlen($str{$i}))  return false; // offset beyond string length
1782
1783                 if ($pos >= 0)  {
1784                                 // skip trailing multi-byte data bytes
1785                         while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) { $i++; }
1786                 } else {
1787                                 // correct offset
1788                         $i++;
1789                 }
1790
1791                 return $i;
1792         }
1793
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * The result is the number of character start bytes found at the byte offsets
 * 1..$pos. For a position on a character boundary this is the index of that
 * character; for a position inside a multi-byte sequence it is the index of
 * the character the byte belongs to. The position directly behind the last
 * byte is accepted and yields the total number of characters.
 *
 * @param       string          UTF-8 string
 * @param       integer         byte position
 * @return      integer         character position, or false if the string is empty or the byte position is negative or beyond the string length
 * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)  {
        $len = strlen($str);
        $pos = (int)$pos;

                // Validate the offset before scanning. The check must use $pos itself:
                // after the scan the loop index is always 0 and says nothing about the offset.
        if ($len == 0 || $pos < 0 || $pos > $len)       return false;

        $n = 0; // number of characters

        if ($pos == $len)       {
                        // the end-of-string position is a character boundary without a byte of its own
                $n++;
                $pos--;
        }

        for ($i=$pos; $i>0; $i--)       {
                        // Count single-byte chars (0xxxxxxx) and multi-byte starting bytes (11xxxxxx),
                        // i.e. everything that is not a continuation byte (10xxxxxx).
                if ((ord($str[$i]) & 0xC0) != 0x80)     $n++;
        }

        return $n;
}
1816
/**
 * Maps all characters of an UTF-8 string.
 *
 * Every character found as a key in the selected mapping table is replaced by
 * the table's value, everything else is copied unchanged. The string is
 * returned untouched if initUnicodeData() fails for the mode or if the mode
 * is not one of the two listed below.
 *
 * @param       string          UTF-8 string
 * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param       string          'case': conversion 'toLower' or 'toUpper'
 * @return      string          the converted string
 * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='')  {
        if (!$this->initUnicodeData($mode))     return $str;    // do nothing

        switch($mode)   {
                case 'case':
                        $map =& $this->caseFolding['utf-8'][$opt];
                        break;

                case 'ascii':
                        $map =& $this->toASCII['utf-8'];
                        break;

                default:
                        return $str;
        }

        $out = '';
        $len = strlen($str);
        for ($i=0; $i<$len; $i++)       {
                        // Start with the single byte. This covers ASCII (0xxxxxxx) and also a stray
                        // continuation byte (10xxxxxx), which is passed through as it is instead of
                        // repeating the previously mapped character.
                $mbc = $str[$i];
                $c = ord($mbc);

                if (($c & 0xC0) == 0xC0)        {       // multi-byte starting byte (11xxxxxx)
                                // the number of leading 1-bits is the number of bytes of the sequence
                        for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; }
                        $mbc = substr($str,$i,$bc);
                        $i += $bc-1;
                }

                $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
        }

        return $out;
}
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880         /********************************************
1881          *
1882          * Internal EUC string operation functions
1883          *
1884          * Extended Unix Code:
1885          *  ASCII compatible 7bit single bytes chars
1886          *  8bit two byte chars
1887          *
1888          * Shift-JIS is treated as a special case.
1889          *
1890          ********************************************/
1891
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The result never exceeds the given byte length and never ends with the
 * first byte of a double-byte character: if the cut would split a character,
 * that character is dropped completely.
 *
 * @param       string          EUC multibyte character string
 * @param       integer         the byte length
 * @param       string          the charset
 * @return      string          the shortened string
 * @see mb_strcut()
 * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset)        {
        if ($len <= 0)  return '';

                // Compare the real byte length. Deciding this from the scan position below is
                // not reliable: the scan may jump behind the end of the string when the last
                // character straddles $len, and the string would then be returned uncut.
        if (strlen($str) <= $len)       return $str;    // string not longer than supplied length

        $sjis = ($charset == 'shift_jis');
        for ($i=0; $i<$len; $i++)       {
                $c = ord($str[$i]);
                if ($sjis)      {
                        if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
                }
                else    {
                        if ($c >= 0x80) $i++;   // advance a double-byte char
                }
        }

        if ($i>$len)
                return substr($str,0,$len-1);   // we ended on a first byte
        else
                return substr($str,0,$len);
}
1920
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Both positions are translated into byte positions with euc_char2byte_pos().
 *
 * @param       string          EUC multibyte character string
 * @param       integer         start position (character position)
 * @param       string          the charset
 * @param       integer         length (in characters); if omitted the rest of the string is returned
 * @return      string          the substring, or false if the start position lies outside the string
 * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null)     {
        $byte_start = $this->euc_char2byte_pos($str,$start,$charset);
        if ($byte_start === false)      return false;   // $start outside string length

        $str = substr($str,$byte_start);

                // "No length" is null (or another empty, non-numeric value). A numeric zero is a
                // real length and must not be mistaken for "no length", which a plain loose
                // comparison against null would do.
        if ($len == null && !is_numeric($len))  return $str;

        $byte_end = $this->euc_char2byte_pos($str,$len,$charset);
        if ($byte_end === false)        return $str;    // $len outside actual string length

        return substr($str,0,$byte_end);
}
1946
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * A byte recognised as the first byte of a double-byte character consumes the
 * following byte as well; every other byte counts as one character.
 * For 'shift_jis' the first bytes are 0x80-0x9F and 0xE0-0xFF, for all other
 * charsets every byte from 0x80 upwards.
 *
 * @param       string          EUC multibyte character string
 * @param       string          the charset
 * @return      integer         the number of characters
 * @see strlen()
 * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)       {
        $sjis = ($charset == 'shift_jis');
        $total = strlen($str);  // byte length, bounds the scan without reading behind the string
        $n = 0;

        for ($i=0; $i<$total; $i++)     {
                $c = ord($str[$i]);
                if ($sjis)      {
                        if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
                }
                else    {
                        if ($c >= 0x80) $i++;   // advance a double-byte char
                }

                $n++;
        }

        return $n;
}
1973
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its beginning, also for negative
 * positions: whether a byte is the first byte of a double-byte character can
 * only be decided reliably in reading direction, because a second byte may
 * have a value that looks like a single-byte character.
 *
 * @param       string          EUC multibyte character string
 * @param       integer         character position (negative values start from the end)
 * @param       string          the charset
 * @return      integer         byte position, or false if the character position lies beyond the string
 * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)  {
        $sjis = ($charset == 'shift_jis');
        $total = strlen($str);

        if ($pos < 0)   {
                        // turn the position relative to the end into one relative to the start
                $chars = 0;
                for ($i=0; $i<$total; $i++)     {
                        $c = ord($str[$i]);
                        if ($sjis)      {
                                if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
                        }
                        else    {
                                if ($c >= 0x80) $i++;   // advance a double-byte char
                        }
                        $chars++;
                }

                $pos += $chars;
                if ($pos < 0)   return false;   // offset before the start of the string
        }

        $n = 0; // number of characters seen
        for ($i=0; $i<$total && $n<$pos; $i++)  {
                $c = ord($str[$i]);
                if ($sjis)      {
                        if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
                }
                else    {
                        if ($c >= 0x80) $i++;   // advance a double-byte char
                }

                $n++;
        }
        if ($i >= $total)       return false;   // offset beyond string length

        return $i;
}
2013
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * Every character found as a key in the selected mapping table is replaced by
 * the table's value, everything else is copied unchanged. The string is
 * returned untouched if the table for the charset cannot be initialised
 * (initCaseFolding() / initToASCII() fail) or if the mode is not one of the
 * two listed below.
 *
 * @param       string          EUC multibyte character string
 * @param       string          the charset
 * @param       string          mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param       string          'case': conversion 'toLower' or 'toUpper'
 * @return      string          the converted string
 * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='')  {
        switch($mode)   {
                case 'case':
                        if (!$this->initCaseFolding($charset))  return $str;    // do nothing
                        $map =& $this->caseFolding[$charset][$opt];
                        break;

                case 'ascii':
                        if (!$this->initToASCII($charset))      return $str;    // do nothing
                        $map =& $this->toASCII[$charset];
                        break;

                default:
                        return $str;
        }

        $sjis = ($charset == 'shift_jis');
        $total = strlen($str);  // byte length, bounds the scan without reading behind the string
        $out = '';
        for ($i=0; $i<$total; $i++)     {
                $mbc = $str[$i];
                $c = ord($mbc);

                        // first byte of a double-byte char?
                if ($sjis)      {
                        $double = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
                }
                else    {
                        $double = ($c >= 0x80);
                }

                if ($double)    {
                        $mbc = substr($str,$i,2);
                        $i++;
                }

                $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
        }

        return $out;
}
2068
2069 }
2070
// Include the file configured under the 'XCLASS' key for this script, if there is one.
// !empty() is the same truthiness test as before, but it does not raise an
// "undefined index" notice when no such entry (or no configuration array) exists.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']))       {
        include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
2074 ?>