inc/utf8.php

   1 <?php
   2 /**
   3  * UTF8 helper functions
   4  *
   5  * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
   6  * @author     Andreas Gohr <andi@splitbrain.org>
   7  */
   8
   9 /**
  10  * check for mb_string support
  11  */
  12 if(!defined('UTF8_MBSTRING')){
  13     if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
  14         define('UTF8_MBSTRING',1);
  15     }else{
  16         define('UTF8_MBSTRING',0);
  17     }
  18 }
  19
  20 if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }
  21
  22 if(!function_exists('utf8_encodeFN')){
  23     /**
  24      * URL-Encode a filename to allow unicodecharacters
  25      *
  26      * Slashes are not encoded
  27      *
  28      * When the second parameter is true the string will
  29      * be encoded only if non ASCII characters are detected -
  30      * This makes it safe to run it multiple times on the
  31      * same string (default is true)
  32      *
  33      * @author Andreas Gohr <andi@splitbrain.org>
  34      * @see    urlencode
  35      */
  36     function utf8_encodeFN($file,$safe=true){
  37         if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
  38             return $file;
  39         }
  40         $file = urlencode($file);
  41         $file = str_replace('%2F','/',$file);
  42         return $file;
  43     }
  44 }
  45
  46 if(!function_exists('utf8_decodeFN')){
  47     /**
  48      * URL-Decode a filename
  49      *
  50      * This is just a wrapper around urldecode
  51      *
  52      * @author Andreas Gohr <andi@splitbrain.org>
  53      * @see    urldecode
  54      */
  55     function utf8_decodeFN($file){
  56         $file = urldecode($file);
  57         return $file;
  58     }
  59 }
  60
  61 if(!function_exists('utf8_isASCII')){
  62     /**
  63      * Checks if a string contains 7bit ASCII only
  64      *
  65      * @author Andreas Haerter <netzmeister@andreas-haerter.de>
  66      */
  67     function utf8_isASCII($str){
  68         return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
  69     }
  70 }
  71
  72 if(!function_exists('utf8_strip')){
  73     /**
  74      * Strips all highbyte chars
  75      *
  76      * Returns a pure ASCII7 string
  77      *
  78      * @author Andreas Gohr <andi@splitbrain.org>
  79      */
  80     function utf8_strip($str){
  81         $ascii = '';
  82         $len = strlen($str);
  83         for($i=0; $i<$len; $i++){
  84             if(ord($str{$i}) <128){
  85                 $ascii .= $str{$i};
  86             }
  87         }
  88         return $ascii;
  89     }
  90 }
  91
  92 if(!function_exists('utf8_check')){
  93     /**
  94      * Tries to detect if a string is in Unicode encoding
  95      *
  96      * @author <bmorel@ssi.fr>
  97      * @link   http://www.php.net/manual/en/function.utf8-encode.php
  98      */
  99     function utf8_check($Str) {
 100         $len = strlen($Str);
 101         for ($i=0; $i<$len; $i++) {
 102             $b = ord($Str[$i]);
 103             if ($b < 0x80) continue; # 0bbbbbbb
 104             elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
 105             elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
 106             elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
 107             elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
 108             elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
 109             else return false; # Does not match any model
 110
 111             for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
 112                 if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
 113                     return false;
 114             }
 115         }
 116         return true;
 117     }
 118 }
 119
 120 if(!function_exists('utf8_strlen')){
 121     /**
 122      * Unicode aware replacement for strlen()
 123      *
 124      * utf8_decode() converts characters that are not in ISO-8859-1
 125      * to '?', which, for the purpose of counting, is alright - It's
 126      * even faster than mb_strlen.
 127      *
 128      * @author <chernyshevsky at hotmail dot com>
 129      * @see    strlen()
 130      * @see    utf8_decode()
 131      */
 132     function utf8_strlen($string){
 133         return strlen(utf8_decode($string));
 134     }
 135 }
 136
 137 if(!function_exists('utf8_substr')){
 138     /**
 139      * UTF-8 aware alternative to substr
 140      *
 141      * Return part of a string given character offset (and optionally length)
 142      *
 143      * @author Harry Fuecks <hfuecks@gmail.com>
 144      * @author Chris Smith <chris@jalakai.co.uk>
 145      * @param string
 146      * @param integer number of UTF-8 characters offset (from left)
 147      * @param integer (optional) length in UTF-8 characters from offset
 148      * @return mixed string or false if failure
 149      */
 150     function utf8_substr($str, $offset, $length = null) {
 151         if(UTF8_MBSTRING){
 152             if( $length === null ){
 153                 return mb_substr($str, $offset);
 154             }else{
 155                 return mb_substr($str, $offset, $length);
 156             }
 157         }
 158
 159         /*
 160          * Notes:
 161          *
 162          * no mb string support, so we'll use pcre regex's with 'u' flag
 163          * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
 164          * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
 165          *
 166          * substr documentation states false can be returned in some cases (e.g. offset > string length)
 167          * mb_substr never returns false, it will return an empty string instead.
 168          *
 169          * calculating the number of characters in the string is a relatively expensive operation, so
 170          * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
 171          */
 172
 173         // cast parameters to appropriate types to avoid multiple notices/warnings
 174         $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
 175         $offset = (int)$offset;
 176         if (!is_null($length)) $length = (int)$length;
 177
 178         // handle trivial cases
 179         if ($length === 0) return '';
 180         if ($offset < 0 && $length < 0 && $length < $offset) return '';
 181
 182         $offset_pattern = '';
 183         $length_pattern = '';
 184
 185         // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
 186         if ($offset < 0) {
 187             $strlen = strlen(utf8_decode($str));        // see notes
 188             $offset = $strlen + $offset;
 189             if ($offset < 0) $offset = 0;
 190         }
 191
 192         // establish a pattern for offset, a non-captured group equal in length to offset
 193         if ($offset > 0) {
 194             $Ox = (int)($offset/65535);
 195             $Oy = $offset%65535;
 196
 197             if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
 198             $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
 199         } else {
 200             $offset_pattern = '^';                      // offset == 0; just anchor the pattern
 201         }
 202
 203         // establish a pattern for length
 204         if (is_null($length)) {
 205             $length_pattern = '(.*)$';                  // the rest of the string
 206         } else {
 207
 208             if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
 209             if ($offset > $strlen) return '';           // another trivial case
 210
 211             if ($length > 0) {
 212
 213                 $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string
 214
 215                 $Lx = (int)($length/65535);
 216                 $Ly = $length%65535;
 217
 218                 // +ve length requires ... a captured group of length characters
 219                 if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
 220                     $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';
 221
 222             } else if ($length < 0) {
 223
 224                 if ($length < ($offset - $strlen)) return '';
 225
 226                 $Lx = (int)((-$length)/65535);
 227                 $Ly = (-$length)%65535;
 228
 229                 // -ve length requires ... capture everything except a group of -length characters
 230                 //                         anchored at the tail-end of the string
 231                 if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
 232                 $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
 233             }
 234         }
 235
 236         if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
 237         return $match[1];
 238     }
 239 }
 240
 241 if(!function_exists('utf8_substr_replace')){
 242     /**
 243      * Unicode aware replacement for substr_replace()
 244      *
 245      * @author Andreas Gohr <andi@splitbrain.org>
 246      * @see    substr_replace()
 247      */
 248     function utf8_substr_replace($string, $replacement, $start , $length=0 ){
 249         $ret = '';
 250         if($start>0) $ret .= utf8_substr($string, 0, $start);
 251         $ret .= $replacement;
 252         $ret .= utf8_substr($string, $start+$length);
 253         return $ret;
 254     }
 255 }
 256
 257 if(!function_exists('utf8_ltrim')){
 258     /**
 259      * Unicode aware replacement for ltrim()
 260      *
 261      * @author Andreas Gohr <andi@splitbrain.org>
 262      * @see    ltrim()
 263      * @return string
 264      */
 265     function utf8_ltrim($str,$charlist=''){
 266         if($charlist == '') return ltrim($str);
 267
 268         //quote charlist for use in a characterclass
 269         $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);
 270
 271         return preg_replace('/^['.$charlist.']+/u','',$str);
 272     }
 273 }
 274
 275 if(!function_exists('utf8_rtrim')){
 276     /**
 277      * Unicode aware replacement for rtrim()
 278      *
 279      * @author Andreas Gohr <andi@splitbrain.org>
 280      * @see    rtrim()
 281      * @return string
 282      */
 283     function  utf8_rtrim($str,$charlist=''){
 284         if($charlist == '') return rtrim($str);
 285
 286         //quote charlist for use in a characterclass
 287         $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);
 288
 289         return preg_replace('/['.$charlist.']+$/u','',$str);
 290     }
 291 }
 292
 293 if(!function_exists('utf8_trim')){
 294     /**
 295      * Unicode aware replacement for trim()
 296      *
 297      * @author Andreas Gohr <andi@splitbrain.org>
 298      * @see    trim()
 299      * @return string
 300      */
 301     function  utf8_trim($str,$charlist='') {
 302         if($charlist == '') return trim($str);
 303
 304         return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
 305     }
 306 }
 307
 308 if(!function_exists('utf8_strtolower')){
 309     /**
 310      * This is a unicode aware replacement for strtolower()
 311      *
 312      * Uses mb_string extension if available
 313      *
 314      * @author Leo Feyer <leo@typolight.org>
 315      * @see    strtolower()
 316      * @see    utf8_strtoupper()
 317      */
 318     function utf8_strtolower($string){
 319         if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');
 320
 321         global $UTF8_UPPER_TO_LOWER;
 322         return strtr($string,$UTF8_UPPER_TO_LOWER);
 323     }
 324 }
 325
 326 if(!function_exists('utf8_strtoupper')){
 327     /**
 328      * This is a unicode aware replacement for strtoupper()
 329      *
 330      * Uses mb_string extension if available
 331      *
 332      * @author Leo Feyer <leo@typolight.org>
 333      * @see    strtoupper()
 334      * @see    utf8_strtoupper()
 335      */
 336     function utf8_strtoupper($string){
 337         if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');
 338
 339         global $UTF8_LOWER_TO_UPPER;
 340         return strtr($string,$UTF8_LOWER_TO_UPPER);
 341     }
 342 }
 343
 344 if(!function_exists('utf8_ucfirst')){
 345     /**
 346      * UTF-8 aware alternative to ucfirst
 347      * Make a string's first character uppercase
 348      *
 349      * @author Harry Fuecks
 350      * @param string
 351      * @return string with first character as upper case (if applicable)
 352      */
 353     function utf8_ucfirst($str){
 354         switch ( utf8_strlen($str) ) {
 355             case 0:
 356                 return '';
 357             case 1:
 358                 return utf8_strtoupper($str);
 359             default:
 360                 preg_match('/^(.{1})(.*)$/us', $str, $matches);
 361                 return utf8_strtoupper($matches[1]).$matches[2];
 362         }
 363     }
 364 }
 365
 366 if(!function_exists('utf8_ucwords')){
 367     /**
 368      * UTF-8 aware alternative to ucwords
 369      * Uppercase the first character of each word in a string
 370      *
 371      * @author Harry Fuecks
 372      * @param string
 373      * @return string with first char of each word uppercase
 374      * @see http://www.php.net/ucwords
 375      */
 376     function utf8_ucwords($str) {
 377         // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
 378         // form feeds, horizontal tabs, vertical tabs, linefeeds and carriage returns
 379         // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
 380         $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';
 381
 382         return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
 383     }
 384
 385     /**
 386      * Callback function for preg_replace_callback call in utf8_ucwords
 387      * You don't need to call this yourself
 388      *
 389      * @author Harry Fuecks
 390      * @param array of matches corresponding to a single word
 391      * @return string with first char of the word in uppercase
 392      * @see utf8_ucwords
 393      * @see utf8_strtoupper
 394      */
 395     function utf8_ucwords_callback($matches) {
 396         $leadingws = $matches[2];
 397         $ucfirst = utf8_strtoupper($matches[3]);
 398         $ucword = utf8_substr_replace(ltrim($matches[0]),$ucfirst,0,1);
 399         return $leadingws . $ucword;
 400     }
 401 }
 402
 403 if(!function_exists('utf8_deaccent')){
 404     /**
 405      * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 406      *
 407      * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 408      * letters. Default is to deaccent both cases ($case = 0)
 409      *
 410      * @author Andreas Gohr <andi@splitbrain.org>
 411      */
 412     function utf8_deaccent($string,$case=0){
 413         if($case <= 0){
 414             global $UTF8_LOWER_ACCENTS;
 415             $string = strtr($string,$UTF8_LOWER_ACCENTS);
 416         }
 417         if($case >= 0){
 418             global $UTF8_UPPER_ACCENTS;
 419             $string = strtr($string,$UTF8_UPPER_ACCENTS);
 420         }
 421         return $string;
 422     }
 423 }
 424
 425 if(!function_exists('utf8_romanize')){
 426     /**
 427      * Romanize a non-latin string
 428      *
 429      * @author Andreas Gohr <andi@splitbrain.org>
 430      */
 431     function utf8_romanize($string){
 432         if(utf8_isASCII($string)) return $string; //nothing to do
 433
 434         global $UTF8_ROMANIZATION;
 435         return strtr($string,$UTF8_ROMANIZATION);
 436     }
 437 }
 438
 439 if(!function_exists('utf8_stripspecials')){
 440     /**
 441      * Removes special characters (nonalphanumeric) from a UTF-8 string
 442      *
 443      * This function adds the controlchars 0x00 to 0x19 to the array of
 444      * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 445      *
 446      * @author Andreas Gohr <andi@splitbrain.org>
 447      * @param  string $string     The UTF8 string to strip of special chars
 448      * @param  string $repl       Replace special with this string
 449      * @param  string $additional Additional chars to strip (used in regexp char class)
 450      */
 451     function utf8_stripspecials($string,$repl='',$additional=''){
 452         global $UTF8_SPECIAL_CHARS;
 453         global $UTF8_SPECIAL_CHARS2;
 454
 455         static $specials = null;
 456         if(is_null($specials)){
 457             #$specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
 458             $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
 459         }
 460
 461         return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
 462     }
 463 }
 464
 465 if(!function_exists('utf8_strpos')){
 466     /**
 467      * This is an Unicode aware replacement for strpos
 468      *
 469      * @author Leo Feyer <leo@typolight.org>
 470      * @see    strpos()
 471      * @param  string
 472      * @param  string
 473      * @param  integer
 474      * @return integer
 475      */
 476     function utf8_strpos($haystack, $needle, $offset=0){
 477         $comp = 0;
 478         $length = null;
 479
 480         while (is_null($length) || $length < $offset) {
 481             $pos = strpos($haystack, $needle, $offset + $comp);
 482
 483             if ($pos === false)
 484                 return false;
 485
 486             $length = utf8_strlen(substr($haystack, 0, $pos));
 487
 488             if ($length < $offset)
 489                 $comp = $pos - $length;
 490         }
 491
 492         return $length;
 493     }
 494 }
 495
 496 if(!function_exists('utf8_tohtml')){
 497     /**
 498      * Encodes UTF-8 characters to HTML entities
 499      *
 500      * @author Tom N Harris <tnharris@whoopdedo.org>
 501      * @author <vpribish at shopping dot com>
 502      * @link   http://www.php.net/manual/en/function.utf8-decode.php
 503      */
 504     function utf8_tohtml ($str) {
 505         $ret = '';
 506         foreach (utf8_to_unicode($str) as $cp) {
 507             if ($cp < 0x80)
 508                 $ret .= chr($cp);
 509             elseif ($cp < 0x100)
 510                 $ret .= "&#$cp;";
 511             else
 512                 $ret .= '&#x'.dechex($cp).';';
 513         }
 514         return $ret;
 515     }
 516 }
 517
 518 if(!function_exists('utf8_unhtml')){
 519     /**
 520      * Decodes HTML entities to UTF-8 characters
 521      *
 522      * Convert any &#..; entity to a codepoint,
 523      * The entities flag defaults to only decoding numeric entities.
 524      * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
 525      * are handled as well. Avoids the problem that would occur if you
 526      * had to decode "&amp;#38;&#38;amp;#38;"
 527      *
 528      * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 529      * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 530      * what it should be                   -> "&#38;&amp#38;"
 531      *
 532      * @author Tom N Harris <tnharris@whoopdedo.org>
 533      * @param  string  $str      UTF-8 encoded string
 534      * @param  boolean $entities Flag controlling decoding of named entities.
 535      * @return UTF-8 encoded string with numeric (and named) entities replaced.
 536      */
 537     function utf8_unhtml($str, $entities=null) {
 538         static $decoder = null;
 539         if (is_null($decoder))
 540             $decoder = new utf8_entity_decoder();
 541         if (is_null($entities))
 542             return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
 543                                          'utf8_decode_numeric', $str);
 544         else
 545             return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
 546                                          array(&$decoder, 'decode'), $str);
 547     }
 548 }
 549
 550 if(!function_exists('utf8_decode_numeric')){
 551     function utf8_decode_numeric($ent) {
 552         switch ($ent[2]) {
 553             case 'X':
 554             case 'x':
 555                 $cp = hexdec($ent[3]);
 556                 break;
 557             default:
 558                 $cp = intval($ent[3]);
 559                 break;
 560         }
 561         return unicode_to_utf8(array($cp));
 562     }
 563 }
 564
 565 if(!class_exists('utf8_entity_decoder')){
 566     class utf8_entity_decoder {
 567         var $table;
 568         function utf8_entity_decoder() {
 569             $table = get_html_translation_table(HTML_ENTITIES);
 570             $table = array_flip($table);
 571             $this->table = array_map(array(&$this,'makeutf8'), $table);
 572         }
 573         function makeutf8($c) {
 574             return unicode_to_utf8(array(ord($c)));
 575         }
 576         function decode($ent) {
 577             if ($ent[1] == '#') {
 578                 return utf8_decode_numeric($ent);
 579             } elseif (array_key_exists($ent[0],$this->table)) {
 580                 return $this->table[$ent[0]];
 581             } else {
 582                 return $ent[0];
 583             }
 584         }
 585     }
 586 }
 587
 588 if(!function_exists('utf8_to_unicode')){
 589     /**
 590      * Takes an UTF-8 string and returns an array of ints representing the
 591      * Unicode characters. Astral planes are supported ie. the ints in the
 592      * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 593      * are not allowed.
 594      *
 595      * If $strict is set to true the function returns false if the input
 596      * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 597      * level E_USER_WARNING
 598      *
 599      * Note: this function has been modified slightly in this library to
 600      * trigger errors on encountering bad bytes
 601      *
 602      * @author <hsivonen@iki.fi>
 603      * @author Harry Fuecks <hfuecks@gmail.com>
 604      * @param  string  UTF-8 encoded string
 605      * @param  boolean Check for invalid sequences?
 606      * @return mixed array of unicode code points or false if UTF-8 invalid
 607      * @see    unicode_to_utf8
 608      * @link   http://hsivonen.iki.fi/php-utf8/
 609      * @link   http://sourceforge.net/projects/phputf8/
 610      */
 611     function utf8_to_unicode($str,$strict=false) {
 612         $mState = 0;     // cached expected number of octets after the current octet
 613                          // until the beginning of the next UTF8 character sequence
 614         $mUcs4  = 0;     // cached Unicode character
 615         $mBytes = 1;     // cached expected number of octets in the current sequence
 616
 617         $out = array();
 618
 619         $len = strlen($str);
 620
 621         for($i = 0; $i < $len; $i++) {
 622
 623             $in = ord($str{$i});
 624
 625             if ( $mState == 0) {
 626
 627                 // When mState is zero we expect either a US-ASCII character or a
 628                 // multi-octet sequence.
 629                 if (0 == (0x80 & ($in))) {
 630                     // US-ASCII, pass straight through.
 631                     $out[] = $in;
 632                     $mBytes = 1;
 633
 634                 } else if (0xC0 == (0xE0 & ($in))) {
 635                     // First octet of 2 octet sequence
 636                     $mUcs4 = ($in);
 637                     $mUcs4 = ($mUcs4 & 0x1F) << 6;
 638                     $mState = 1;
 639                     $mBytes = 2;
 640
 641                 } else if (0xE0 == (0xF0 & ($in))) {
 642                     // First octet of 3 octet sequence
 643                     $mUcs4 = ($in);
 644                     $mUcs4 = ($mUcs4 & 0x0F) << 12;
 645                     $mState = 2;
 646                     $mBytes = 3;
 647
 648                 } else if (0xF0 == (0xF8 & ($in))) {
 649                     // First octet of 4 octet sequence
 650                     $mUcs4 = ($in);
 651                     $mUcs4 = ($mUcs4 & 0x07) << 18;
 652                     $mState = 3;
 653                     $mBytes = 4;
 654
 655                 } else if (0xF8 == (0xFC & ($in))) {
 656                     /* First octet of 5 octet sequence.
 657                      *
 658                      * This is illegal because the encoded codepoint must be either
 659                      * (a) not the shortest form or
 660                      * (b) outside the Unicode range of 0-0x10FFFF.
 661                      * Rather than trying to resynchronize, we will carry on until the end
 662                      * of the sequence and let the later error handling code catch it.
 663                      */
 664                     $mUcs4 = ($in);
 665                     $mUcs4 = ($mUcs4 & 0x03) << 24;
 666                     $mState = 4;
 667                     $mBytes = 5;
 668
 669                 } else if (0xFC == (0xFE & ($in))) {
 670                     // First octet of 6 octet sequence, see comments for 5 octet sequence.
 671                     $mUcs4 = ($in);
 672                     $mUcs4 = ($mUcs4 & 1) << 30;
 673                     $mState = 5;
 674                     $mBytes = 6;
 675
 676                 } elseif($strict) {
 677                     /* Current octet is neither in the US-ASCII range nor a legal first
 678                      * octet of a multi-octet sequence.
 679                      */
 680                     trigger_error(
 681                             'utf8_to_unicode: Illegal sequence identifier '.
 682                                 'in UTF-8 at byte '.$i,
 683                             E_USER_WARNING
 684                         );
 685                     return false;
 686
 687                 }
 688
 689             } else {
 690
 691                 // When mState is non-zero, we expect a continuation of the multi-octet
 692                 // sequence
 693                 if (0x80 == (0xC0 & ($in))) {
 694
 695                     // Legal continuation.
 696                     $shift = ($mState - 1) * 6;
 697                     $tmp = $in;
 698                     $tmp = ($tmp & 0x0000003F) << $shift;
 699                     $mUcs4 |= $tmp;
 700
 701                     /**
 702                      * End of the multi-octet sequence. mUcs4 now contains the final
 703                      * Unicode codepoint to be output
 704                      */
 705                     if (0 == --$mState) {
 706
 707                         /*
 708                          * Check for illegal sequences and codepoints.
 709                          */
 710                         // From Unicode 3.1, non-shortest form is illegal
 711                         if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 712                             ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 713                             ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
 714                             (4 < $mBytes) ||
 715                             // From Unicode 3.2, surrogate characters are illegal
 716                             (($mUcs4 & 0xFFFFF800) == 0xD800) ||
 717                             // Codepoints outside the Unicode range are illegal
 718                             ($mUcs4 > 0x10FFFF)) {
 719
 720                             if($strict){
 721                                 trigger_error(
 722                                         'utf8_to_unicode: Illegal sequence or codepoint '.
 723                                             'in UTF-8 at byte '.$i,
 724                                         E_USER_WARNING
 725                                     );
 726
 727                                 return false;
 728                             }
 729
 730                         }
 731
 732                         if (0xFEFF != $mUcs4) {
 733                             // BOM is legal but we don't want to output it
 734                             $out[] = $mUcs4;
 735                         }
 736
 737                         //initialize UTF8 cache
 738                         $mState = 0;
 739                         $mUcs4  = 0;
 740                         $mBytes = 1;
 741                     }
 742
 743                 } elseif($strict) {
 744                     /**
 745                      *((0xC0 & (*in) != 0x80) && (mState != 0))
 746                      * Incomplete multi-octet sequence.
 747                      */
 748                     trigger_error(
 749                             'utf8_to_unicode: Incomplete multi-octet '.
 750                             '   sequence in UTF-8 at byte '.$i,
 751                             E_USER_WARNING
 752                         );
 753
 754                     return false;
 755                 }
 756             }
 757         }
 758         return $out;
 759     }
 760 }
 761
 762 if(!function_exists('unicode_to_utf8')){
 763     /**
 764      * Takes an array of ints representing the Unicode characters and returns
 765      * a UTF-8 string. Astral planes are supported ie. the ints in the
 766      * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 767      * are not allowed.
 768      *
 769      * If $strict is set to true the function returns false if the input
 770      * array contains ints that represent surrogates or are outside the
 771      * Unicode range and raises a PHP error at level E_USER_WARNING
 772      *
 773      * Note: this function has been modified slightly in this library to use
 774      * output buffering to concatenate the UTF-8 string (faster) as well as
 775      * reference the array by it's keys
 776      *
 777      * @param  array of unicode code points representing a string
 778      * @param  boolean Check for invalid sequences?
 779      * @return mixed UTF-8 string or false if array contains invalid code points
 780      * @author <hsivonen@iki.fi>
 781      * @author Harry Fuecks <hfuecks@gmail.com>
 782      * @see    utf8_to_unicode
 783      * @link   http://hsivonen.iki.fi/php-utf8/
 784      * @link   http://sourceforge.net/projects/phputf8/
 785      */
 786     function unicode_to_utf8($arr,$strict=false) {
 787         if (!is_array($arr)) return '';
 788         ob_start();
 789
 790         foreach (array_keys($arr) as $k) {
 791
 792             if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) {
 793                 # ASCII range (including control chars)
 794
 795                 echo chr($arr[$k]);
 796
 797             } else if ($arr[$k] <= 0x07ff) {
 798                 # 2 byte sequence
 799
 800                 echo chr(0xc0 | ($arr[$k] >> 6));
 801                 echo chr(0x80 | ($arr[$k] & 0x003f));
 802
 803             } else if($arr[$k] == 0xFEFF) {
 804                 # Byte order mark (skip)
 805
 806                 // nop -- zap the BOM
 807
 808             } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {
 809                 # Test for illegal surrogates
 810
 811                 // found a surrogate
 812                 if($strict){
 813                     trigger_error(
 814                         'unicode_to_utf8: Illegal surrogate '.
 815                             'at index: '.$k.', value: '.$arr[$k],
 816                         E_USER_WARNING
 817                         );
 818                     return false;
 819                 }
 820
 821             } else if ($arr[$k] <= 0xffff) {
 822                 # 3 byte sequence
 823
 824                 echo chr(0xe0 | ($arr[$k] >> 12));
 825                 echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
 826                 echo chr(0x80 | ($arr[$k] & 0x003f));
 827
 828             } else if ($arr[$k] <= 0x10ffff) {
 829                 # 4 byte sequence
 830
 831                 echo chr(0xf0 | ($arr[$k] >> 18));
 832                 echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
 833                 echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
 834                 echo chr(0x80 | ($arr[$k] & 0x3f));
 835
 836             } elseif($strict) {
 837
 838                 trigger_error(
 839                     'unicode_to_utf8: Codepoint out of Unicode range '.
 840                         'at index: '.$k.', value: '.$arr[$k],
 841                     E_USER_WARNING
 842                     );
 843
 844                 // out of range
 845                 return false;
 846             }
 847         }
 848
 849         $result = ob_get_contents();
 850         ob_end_clean();
 851         return $result;
 852     }
 853 }
 854
 855 if(!function_exists('utf8_to_utf16be')){
 856     /**
 857      * UTF-8 to UTF-16BE conversion.
 858      *
 859      * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 860      */
 861     function utf8_to_utf16be(&$str, $bom = false) {
 862         $out = $bom ? "\xFE\xFF" : '';
 863         if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');
 864
 865         $uni = utf8_to_unicode($str);
 866         foreach($uni as $cp){
 867             $out .= pack('n',$cp);
 868         }
 869         return $out;
 870     }
 871 }
 872
 873 if(!function_exists('utf16be_to_utf8')){
 874     /**
 875      * UTF-8 to UTF-16BE conversion.
 876      *
 877      * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 878      */
 879     function utf16be_to_utf8(&$str) {
 880         $uni = unpack('n*',$str);
 881         return unicode_to_utf8($uni);
 882     }
 883 }
 884
 885 if(!function_exists('utf8_bad_replace')){
 886     /**
 887      * Replace bad bytes with an alternative character
 888      *
 889      * ASCII character is recommended for replacement char
 890      *
 891      * PCRE Pattern to locate bad bytes in a UTF-8 string
 892      * Comes from W3 FAQ: Multilingual Forms
 893      * Note: modified to include full ASCII range including control chars
 894      *
 895      * @author Harry Fuecks <hfuecks@gmail.com>
 896      * @see http://www.w3.org/International/questions/qa-forms-utf-8
 897      * @param string to search
 898      * @param string to replace bad bytes with (defaults to '?') - use ASCII
 899      * @return string
 900      */
 901     function utf8_bad_replace($str, $replace = '') {
 902         $UTF8_BAD =
 903          '([\x00-\x7F]'.                          # ASCII (including control chars)
 904          '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 905          '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 906          '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 907          '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 908          '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 909          '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 910          '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 911          '|(.{1}))';                              # invalid byte
 912         ob_start();
 913         while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 914             if ( !isset($matches[2])) {
 915                 echo $matches[0];
 916             } else {
 917                 echo $replace;
 918             }
 919             $str = substr($str,strlen($matches[0]));
 920         }
 921         $result = ob_get_contents();
 922         ob_end_clean();
 923         return $result;
 924     }
 925 }
 926
 927 if(!function_exists('utf8_correctIdx')){
 928     /**
 929      * adjust a byte index into a utf8 string to a utf8 character boundary
 930      *
 931      * @param $str   string   utf8 character string
 932      * @param $i     int      byte index into $str
 933      * @param $next  bool     direction to search for boundary,
 934      *                           false = up (current character)
 935      *                           true = down (next character)
 936      *
 937      * @return int            byte index into $str now pointing to a utf8 character boundary
 938      *
 939      * @author       chris smith <chris@jalakai.co.uk>
 940      */
 941     function utf8_correctIdx(&$str,$i,$next=false) {
 942
 943         if ($i <= 0) return 0;
 944
 945         $limit = strlen($str);
 946         if ($i>=$limit) return $limit;
 947
 948         if ($next) {
 949             while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
 950         } else {
 951             while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
 952         }
 953
 954         return $i;
 955     }
 956 }
 957
 958 // only needed if no mb_string available
 959 if(!UTF8_MBSTRING){
 960     /**
 961      * UTF-8 Case lookup table
 962      *
 963      * This lookup table maps lower case letters to their corresponding
 964      * upper case letters in UTF-8
 965      *
 966      * @author Andreas Gohr <andi@splitbrain.org>
 967      */
 968     global $UTF8_LOWER_TO_UPPER;
 969     if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
 970             "ｚ"=>"Ｚ","ｙ"=>"Ｙ","ｘ"=>"Ｘ","ｗ"=>"Ｗ","ｖ"=>"Ｖ","ｕ"=>"Ｕ","ｔ"=>"Ｔ","ｓ"=>"Ｓ","ｒ"=>"Ｒ","ｑ"=>"Ｑ",
 971             "ｐ"=>"Ｐ","ｏ"=>"Ｏ","ｎ"=>"Ｎ","ｍ"=>"Ｍ","ｌ"=>"Ｌ","ｋ"=>"Ｋ","ｊ"=>"Ｊ","ｉ"=>"Ｉ","ｈ"=>"Ｈ","ｇ"=>"Ｇ",
 972             "ｆ"=>"Ｆ","ｅ"=>"Ｅ","ｄ"=>"Ｄ","ｃ"=>"Ｃ","ｂ"=>"Ｂ","ａ"=>"Ａ","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
 973             "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
 974             "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
 975             "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
 976             "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
 977             "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
 978             "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
 979             "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
 980             "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
 981             "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
 982             "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
 983             "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
 984             "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
 985             "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
 986             "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
 987             "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
 988             "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
 989             "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
 990             "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
 991             "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
 992             "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
 993             "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
 994             "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
 995             "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
 996             "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
 997             "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
 998             "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
 999             "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1000             "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1001             "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1002             "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1003             "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1004             "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1005             "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1006             "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1007             "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1008             "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1009             "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1010             "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1011             "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1012             "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1013             "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1014             "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1015             "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1016             "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1017             "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1018             "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1019             "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1020             "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","ǳ"=>"ǲ","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1021             "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1022             "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","ǌ"=>"ǋ","ǉ"=>"ǈ","ǆ"=>"ǅ","ƿ"=>"Ƿ",
1023             "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1024             "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1025             "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1026             "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1027             "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1028             "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ĳ"=>"Ĳ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1029             "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1030             "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1031             "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1032             "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1033             "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1034             "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1035             "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1036             "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1037                 );
1038
1039     /**
1040      * UTF-8 Case lookup table
1041      *
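1042      * This lookup table maps upper case letters to their corresponding
1043      * lower case letters in UTF-8. NOTE(review): some keys are listed more than once (e.g. "Σ"), so only the last entry for such a key takes effect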
1044      *
1045      * @author Andreas Gohr <andi@splitbrain.org>
1046      */
1047     global $UTF8_UPPER_TO_LOWER;
1048     if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1049             "Ｚ"=>"ｚ","Ｙ"=>"ｙ","Ｘ"=>"ｘ","Ｗ"=>"ｗ","Ｖ"=>"ｖ","Ｕ"=>"ｕ","Ｔ"=>"ｔ","Ｓ"=>"ｓ","Ｒ"=>"ｒ","Ｑ"=>"ｑ",
1050             "Ｐ"=>"ｐ","Ｏ"=>"ｏ","Ｎ"=>"ｎ","Ｍ"=>"ｍ","Ｌ"=>"ｌ","Ｋ"=>"ｋ","Ｊ"=>"ｊ","Ｉ"=>"ｉ","Ｈ"=>"ｈ","Ｇ"=>"ｇ",
1051             "Ｆ"=>"ｆ","Ｅ"=>"ｅ","Ｄ"=>"ｄ","Ｃ"=>"ｃ","Ｂ"=>"ｂ","Ａ"=>"ａ","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1052             "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1053             "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1054             "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1055             "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1056             "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1057             "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1058             "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1059             "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1060             "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1061             "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1062             "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1063             "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1064             "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1065             "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1066             "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1067             "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1068             "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1069             "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1070             "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1071             "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1072             "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1073             "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1074             "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1075             "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1076             "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1077             "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1078             "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1079             "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1080             "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1081             "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1082             "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1083             "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1084             "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1085             "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1086             "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1087             "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1088             "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1089             "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1090             "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1091             "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1092             "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1093             "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1094             "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1095             "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1096             "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1097             "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1098             "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1099             "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","ǲ"=>"ǳ","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1100             "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1101             "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","ǋ"=>"ǌ","ǈ"=>"ǉ","ǅ"=>"ǆ","Ƿ"=>"ƿ",
1102             "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1103             "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1104             "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1105             "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1106             "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1107             "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","Ĳ"=>"ĳ","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1108             "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1109             "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1110             "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1111             "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1112             "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1113             "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1114             "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1115             "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1116                 );
1117 }; // end of case lookup tables
1118
1119 /**
1120  * UTF-8 lookup table for lower case accented letters
1121  *
1122  * This lookup table defines replacements from the ASCII-7 range for accented
1123  * characters. These are lower case letters only.
1124  *
1125  * @author Andreas Gohr <andi@splitbrain.org>
1126  * @see    utf8_deaccent()
1127  */
1128 global $UTF8_LOWER_ACCENTS;
1129 if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1130   'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1131   'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1132   'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1133   'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1134   'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1135   'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1136   'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1137   'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1138   'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1139   'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1140   'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1141   'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1142   'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1143   'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1144   'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1145 );
1146
1147 /**
1148  * UTF-8 lookup table for upper case accented letters
1149  *
1150  * This lookup table defines replacements from the ASCII-7 range for accented
1151  * characters. These are upper case letters only.
1152  *
1153  * @author Andreas Gohr <andi@splitbrain.org>
1154  * @see    utf8_deaccent()
1155  */
1156 global $UTF8_UPPER_ACCENTS;
1157 if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1158   'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1159   'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1160   'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1161   'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1162   'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1163   'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1164   'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1165   'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1166   'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1167   'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1168   'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1169   'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1170   'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1171   'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1172   'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1173 );
1174
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode code point, not a UTF-8 byte sequence.
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // figure space, narrow no-break space, word joiner and byte order mark,
  // given as code points (their UTF-8 byte sequences are E2 80 87, E2 80 AF,
  // E2 81 A0 and EF BB BF); the no-break space 0x00a0 is already listed above
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1256
1257 // utf8 version of above data
1258 global $UTF8_SPECIAL_CHARS2;
1259 if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1260     "\x1A".'\e\x1c\x1d\x1e\x1f !"#$%&\'()+,/;<=>?@[\]^`{|}~\x7f�'.
1261     '� ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½�'.
1262     '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1263     '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1264     '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1265     '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1266     '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1267     '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1268     '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1269     '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1270     '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1271     '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1272     '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1273     '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1274     '➷➸➹➺➻➼➽➾'.
1275     '　、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1276     '�'.
1277     '�ﹼﹽ'.
1278     '！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？＠［＼］＾｀｛｜｝～'.
1279     '｟｠｡｢｣､･￠￡￢￣￤￥￦￨￩￪￫￬￭￮'.
1280     '𝛼𝛽𝛾𝛿𝜀𝜁𝜂𝜃𝜄𝜅𝜆𝜇𝜈𝜉𝜊𝜋𝜌𝜍𝜎𝜏𝜐𝜑𝜒𝜓𝜔𝜕𝜖𝜗𝜘𝜙𝜚𝜛'.
1281     '   ⁠';
1282
1283 /**
1284  * Romanization lookup table
1285  *
1286  * This lookup table provides a way to transform strings written in a script
1287  * that is not based on Latin letters into plain ASCII.
1288  *
1289  * Please note: this is not a scientific transliteration table. It only works
1290  * one way, from non-Latin to ASCII, and it works by simple character replacement
1291  * only. Peculiarities of individual languages are not supported.
1292  *
1293  * @author Andreas Gohr <andi@splitbrain.org>
1294  * @author Vitaly Blokhin <vitinfo@vitn.com>
1295  * @link   http://www.uconv.com/translit.htm
1296  * @author Bisqwit <bisqwit@iki.fi>
1297  * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1298  * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1299  * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1300  * @link   http://www.btranslations.com/resources/romanization/korean.asp
1301  * @author Arthit Suriyawongkul <arthit@gmail.com>
1302  * @author Denis Scheither <amorphis@uni-bremen.de>
1303  */
1304 global $UTF8_ROMANIZATION;
1305 if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1306   // scandinavian - differs from what we do in deaccent
1307   'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1308
1309   //russian cyrillic
1310   'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1311   'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1312   'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1313   'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1314   'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1315   'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1316   'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1317   'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1318   'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1319   // Ukrainian cyrillic
1320   'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1321   // Georgian
1322   'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1323   'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1324   'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1325   'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1326   'ჰ'=>'xh',
1327   //Sanskrit
1328   'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1329   'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1330   'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1331   'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1332   'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1333   'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1334   'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1335   //Hebrew
1336   'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1337   'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1338   'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1339   'ש'=>'sh','ת'=>'t',
1340   //Arabic
1341   'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1342   'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1343   'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1344   'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1345
1346   // Japanese characters  (last update: 2008-05-09)
1347
1348   // Japanese hiragana
1349
1350   // 3 character syllables, っ doubles the consonant after
1351   'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1352   'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1353   'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1354   'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1355   // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1356   'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1357   'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1358   'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1359   'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1360   'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1361   'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1362
1363   // seperate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1364   'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1365   'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1366
1367    // 2 character syllables - normal
1368   'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1369   'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1370   'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1371   'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1372   'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1373   'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1374   'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1375   'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1376   'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1377   'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1378   'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1379   'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1380   'うぇ'=>'we','うぃ'=>'wi',
1381   'いぇ'=>'ye',
1382
1383   // 2 character syllables, っ doubles the consonant after
1384   'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1385   'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1386   'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1387   'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1388   'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1389   'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1390   'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1391   'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1392   'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1393   'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1394   'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1395
1396   // 1 character syllabels
1397   'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1398   'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1399   'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1400   'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1401   'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1402   'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1403   'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1404   'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1405   'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1406   'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1407   'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1408   'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1409   'わ'=>'wa','を'=>'wo',
1410   'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1411   'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1412   // old characters
1413   'ゑ'=>'we','ゐ'=>'wi',
1414
1415   //  convert what's left (probably only kicks in when something's missing above)
1416   // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1417   // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1418
1419   // never seen one of those (disabled for the moment)
1420   // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1421   // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1422   // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1423   // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1424   // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1425   // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1426   // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1427   // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1428   // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1429   // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1430   // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1431   // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1432   // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1433   // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1434
1435   // 'spare' characters from other romanization systems
1436   // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1437   // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1438   // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1439   // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1440   //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1441   //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1442   //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1443   //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1444   //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1445   //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1446
1447
1448   // Japanese katakana
1449
1450   // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usualy written with macron, but we don't want that in our URLs)
1451   'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1452   'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1453   'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1454   'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1455   'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1456   'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1457   'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1458   'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1459   'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1460   'ッティー'=>'ttii',
1461   'ッヂィー'=>'ddii',
1462
1463   // 3 character syllables - doubled vowels
1464   'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1465   'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1466   'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1467   'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1468   'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1469   'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1470   'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1471   'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1472   'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1473   'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1474   'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1475   'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1476   'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1477   'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1478   'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1479   'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1480   'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1481   'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1482   'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1483   'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1484   'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1485   'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1486   'ウェー'=>'wee','ウィー'=>'wii',
1487   'イェー'=>'yee',
1488   'ティー'=>'tii',
1489   'ヂィー'=>'dii',
1490
1491   // 3 character syllables - doubled consonants
1492   'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1493   'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1494   'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1495   'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1496   'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1497   'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1498   'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1499   'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1500   'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1501   'ッティ'=>'tti',
1502   'ッヂィ'=>'ddi',
1503
1504   // 3 character syllables - doubled vowel and consonants
1505   'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1506   'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1507   'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1508   'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1509   'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1510   'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1511   'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1512   'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1513   'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1514   'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1515   'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1516
1517   // 2 character syllables - normal
1518   'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1519   // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1520   'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1521   'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1522   'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1523   'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1524   'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1525   'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1526   'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1527   'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1528   'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1529   'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1530   'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1531   'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1532   'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1533   'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1534   // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1535   'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1536   'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1537   'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1538   'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1539   'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1540   'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1541   'ウェ'=>'we','ウィ'=>'wi',
1542   'イェ'=>'ye',
1543   'ティ'=>'ti',
1544   'ヂィ'=>'di',
1545
1546   // 2 character syllables - doubled vocal
1547   'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1548   'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1549   'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1550   'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1551   'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1552   'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1553   'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1554   'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1555   'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1556   'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1557   'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1558   'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1559   'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1560   'ワー'=>'waa','ヲー'=>'woo',
1561   'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1562   'ヵー'=>'kaa','ヶー'=>'kee',
1563   // old characters
1564   'ヱー'=>'wee','ヰー'=>'wii',
1565
1566   // seperate katakana 'n'
1567   'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1568   'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1569
1570   // 2 character syllables - doubled consonants
1571   'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1572   'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1573   'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1574   'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1575   'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1576   'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1577   'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1578   'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1579   'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1580   'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1581   'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1582
1583   // 1 character syllables
1584   'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1585   'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1586   'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1587   'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1588   'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1589   'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1590   'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1591   'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1592   'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1593   'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1594   'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1595   'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1596   'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1597   'ワ'=>'wa','ヲ'=>'wo',
1598   'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1599   'ヵ'=>'ka','ヶ'=>'ke',
1600   // old characters
1601   'ヱ'=>'we','ヰ'=>'wi',
1602
1603   //  convert what's left (probably only kicks in when something's missing above)
1604   'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1605   'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1606
1607   // special characters
1608   '・'=>'_','、'=>'_',
1609   'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1610
1611   // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1612   // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1613   //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1614   // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1615   // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1616   //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1617   //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1618   // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1619   // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1620   //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1621   //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1622   //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1623
1624   // "Greeklish"
1625   'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1626   'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1627
1628   // Thai
1629   'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1630   'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1631   'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1632   'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1633   'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1634   'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1635   'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1636   'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1637   'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1638   'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1639   'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1640   'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1641   'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1642   '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1643   '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1644   'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1645   '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1646   '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1647
1648   // Korean
1649   'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1650   'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1651   'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1652   'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1653   'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1654   'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1655 );
1656
1657