inc/utf8.php

   1 <?php
   2 /**
   3  * UTF8 helper functions
   4  *
   5  * @license    LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
   6  * @author     Andreas Gohr <andi@splitbrain.org>
   7  */
   8
   9 /**
  10  * check for mb_string support
  11  */
  12 if(!defined('UTF8_MBSTRING')){
  13     if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
  14         define('UTF8_MBSTRING',1);
  15     }else{
  16         define('UTF8_MBSTRING',0);
  17     }
  18 }
  19
  20 /**
  21  * Check if PREG was compiled with UTF-8 support
  22  *
  23  * Without this many of the functions below will not work, so this is a minimal requirement
  24  */
  25 if(!defined('UTF8_PREGSUPPORT')){
  26     define('UTF8_PREGSUPPORT', (bool) @preg_match('/^.$/u', 'ñ'));
  27 }
  28
  29 /**
  30  * Check if PREG was compiled with Unicode Property support
  31  *
  32  * This is not required for the functions below, but might be needed in a UTF-8 aware application
  33  */
  34 if(!defined('UTF8_PROPERTYSUPPORT')){
  35     define('UTF8_PROPERTYSUPPORT', (bool) @preg_match('/^\pL$/u', 'ñ'));
  36 }
  37
  38
  39 if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }
  40
  41 if(!function_exists('utf8_isASCII')){
  42     /**
  43      * Checks if a string contains 7bit ASCII only
  44      *
  45      * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
  46      *
  47      * @param string $str
  48      * @return bool
  49      */
  50     function utf8_isASCII($str){
  51         return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
  52     }
  53 }
  54
  55 if(!function_exists('utf8_strip')){
  56     /**
  57      * Strips all highbyte chars
  58      *
  59      * Returns a pure ASCII7 string
  60      *
  61      * @author Andreas Gohr <andi@splitbrain.org>
  62      *
  63      * @param string $str
  64      * @return string
  65      */
  66     function utf8_strip($str){
  67         $ascii = '';
  68         $len = strlen($str);
  69         for($i=0; $i<$len; $i++){
  70             if(ord($str{$i}) <128){
  71                 $ascii .= $str{$i};
  72             }
  73         }
  74         return $ascii;
  75     }
  76 }
  77
  78 if(!function_exists('utf8_check')){
  79     /**
  80      * Tries to detect if a string is in Unicode encoding
  81      *
  82      * @author <bmorel@ssi.fr>
  83      * @link   http://www.php.net/manual/en/function.utf8-encode.php
  84      *
  85      * @param string $Str
  86      * @return bool
  87      */
  88     function utf8_check($Str) {
  89         $len = strlen($Str);
  90         for ($i=0; $i<$len; $i++) {
  91             $b = ord($Str[$i]);
  92             if ($b < 0x80) continue; # 0bbbbbbb
  93             elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
  94             elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
  95             elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
  96             elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
  97             elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
  98             else return false; # Does not match any model
  99
 100             for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
 101                 if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
 102                     return false;
 103             }
 104         }
 105         return true;
 106     }
 107 }
 108
 109 if(!function_exists('utf8_basename')){
 110     /**
 111      * A locale independent basename() implementation
 112      *
 113      * works around a bug in PHP's basename() implementation
 114      *
 115      * @see basename()
 116      * @link   https://bugs.php.net/bug.php?id=37738
 117      *
 118      * @param string $path     A path
 119      * @param string $suffix   If the name component ends in suffix this will also be cut off
 120      * @return string
 121      */
 122     function utf8_basename($path, $suffix=''){
 123         $path = trim($path,'\\/');
 124         $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
 125         if($rpos) $path = substr($path, $rpos+1);
 126
 127         $suflen = strlen($suffix);
 128         if($suflen && (substr($path, -$suflen) == $suffix)){
 129             $path = substr($path, 0, -$suflen);
 130         }
 131
 132         return $path;
 133     }
 134 }
 135
 136 if(!function_exists('utf8_strlen')){
 137     /**
 138      * Unicode aware replacement for strlen()
 139      *
 140      * utf8_decode() converts characters that are not in ISO-8859-1
 141      * to '?', which, for the purpose of counting, is alright - It's
 142      * even faster than mb_strlen.
 143      *
 144      * @author <chernyshevsky at hotmail dot com>
 145      * @see    strlen()
 146      * @see    utf8_decode()
 147      *
 148      * @param string $string
 149      * @return int
 150      */
 151     function utf8_strlen($string){
 152         return strlen(utf8_decode($string));
 153     }
 154 }
 155
 156 if(!function_exists('utf8_substr')){
 157     /**
 158      * UTF-8 aware alternative to substr
 159      *
 160      * Return part of a string given character offset (and optionally length)
 161      *
 162      * @author Harry Fuecks <hfuecks@gmail.com>
 163      * @author Chris Smith <chris@jalakai.co.uk>
 164      *
 165      * @param string $str
 166      * @param int $offset number of UTF-8 characters offset (from left)
 167      * @param int $length (optional) length in UTF-8 characters from offset
 168      * @return string
 169      */
 170     function utf8_substr($str, $offset, $length = null) {
 171         if(UTF8_MBSTRING){
 172             if( $length === null ){
 173                 return mb_substr($str, $offset);
 174             }else{
 175                 return mb_substr($str, $offset, $length);
 176             }
 177         }
 178
 179         /*
 180          * Notes:
 181          *
 182          * no mb string support, so we'll use pcre regex's with 'u' flag
 183          * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
 184          * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
 185          *
 186          * substr documentation states false can be returned in some cases (e.g. offset > string length)
 187          * mb_substr never returns false, it will return an empty string instead.
 188          *
 189          * calculating the number of characters in the string is a relatively expensive operation, so
 190          * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
 191          */
 192
 193         // cast parameters to appropriate types to avoid multiple notices/warnings
 194         $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
 195         $offset = (int)$offset;
 196         if (!is_null($length)) $length = (int)$length;
 197
 198         // handle trivial cases
 199         if ($length === 0) return '';
 200         if ($offset < 0 && $length < 0 && $length < $offset) return '';
 201
 202         $offset_pattern = '';
 203         $length_pattern = '';
 204
 205         // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
 206         if ($offset < 0) {
 207             $strlen = strlen(utf8_decode($str));        // see notes
 208             $offset = $strlen + $offset;
 209             if ($offset < 0) $offset = 0;
 210         }
 211
 212         // establish a pattern for offset, a non-captured group equal in length to offset
 213         if ($offset > 0) {
 214             $Ox = (int)($offset/65535);
 215             $Oy = $offset%65535;
 216
 217             if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
 218             $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
 219         } else {
 220             $offset_pattern = '^';                      // offset == 0; just anchor the pattern
 221         }
 222
 223         // establish a pattern for length
 224         if (is_null($length)) {
 225             $length_pattern = '(.*)$';                  // the rest of the string
 226         } else {
 227
 228             if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
 229             if ($offset > $strlen) return '';           // another trivial case
 230
 231             if ($length > 0) {
 232
 233                 $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string
 234
 235                 $Lx = (int)($length/65535);
 236                 $Ly = $length%65535;
 237
 238                 // +ve length requires ... a captured group of length characters
 239                 if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
 240                     $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';
 241
 242             } else if ($length < 0) {
 243
 244                 if ($length < ($offset - $strlen)) return '';
 245
 246                 $Lx = (int)((-$length)/65535);
 247                 $Ly = (-$length)%65535;
 248
 249                 // -ve length requires ... capture everything except a group of -length characters
 250                 //                         anchored at the tail-end of the string
 251                 if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
 252                 $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
 253             }
 254         }
 255
 256         if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
 257         return $match[1];
 258     }
 259 }
 260
 261 if(!function_exists('utf8_substr_replace')){
 262     /**
 263      * Unicode aware replacement for substr_replace()
 264      *
 265      * @author Andreas Gohr <andi@splitbrain.org>
 266      * @see    substr_replace()
 267      *
 268      * @param string $string      input string
 269      * @param string $replacement the replacement
 270      * @param int    $start       the replacing will begin at the start'th offset into string.
 271      * @param int    $length      If given and is positive, it represents the length of the portion of string which is
 272      *                            to be replaced. If length is zero then this function will have the effect of inserting
 273      *                            replacement into string at the given start offset.
 274      * @return string
 275      */
 276     function utf8_substr_replace($string, $replacement, $start , $length=0 ){
 277         $ret = '';
 278         if($start>0) $ret .= utf8_substr($string, 0, $start);
 279         $ret .= $replacement;
 280         $ret .= utf8_substr($string, $start+$length);
 281         return $ret;
 282     }
 283 }
 284
 285 if(!function_exists('utf8_ltrim')){
 286     /**
 287      * Unicode aware replacement for ltrim()
 288      *
 289      * @author Andreas Gohr <andi@splitbrain.org>
 290      * @see    ltrim()
 291      *
 292      * @param  string $str
 293      * @param  string $charlist
 294      * @return string
 295      */
 296     function utf8_ltrim($str,$charlist=''){
 297         if($charlist == '') return ltrim($str);
 298
 299         //quote charlist for use in a characterclass
 300         $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);
 301
 302         return preg_replace('/^['.$charlist.']+/u','',$str);
 303     }
 304 }
 305
 306 if(!function_exists('utf8_rtrim')){
 307     /**
 308      * Unicode aware replacement for rtrim()
 309      *
 310      * @author Andreas Gohr <andi@splitbrain.org>
 311      * @see    rtrim()
 312      *
 313      * @param  string $str
 314      * @param  string $charlist
 315      * @return string
 316      */
 317     function  utf8_rtrim($str,$charlist=''){
 318         if($charlist == '') return rtrim($str);
 319
 320         //quote charlist for use in a characterclass
 321         $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);
 322
 323         return preg_replace('/['.$charlist.']+$/u','',$str);
 324     }
 325 }
 326
 327 if(!function_exists('utf8_trim')){
 328     /**
 329      * Unicode aware replacement for trim()
 330      *
 331      * @author Andreas Gohr <andi@splitbrain.org>
 332      * @see    trim()
 333      *
 334      * @param  string $str
 335      * @param  string $charlist
 336      * @return string
 337      */
 338     function  utf8_trim($str,$charlist='') {
 339         if($charlist == '') return trim($str);
 340
 341         return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
 342     }
 343 }
 344
 345 if(!function_exists('utf8_strtolower')){
 346     /**
 347      * This is a unicode aware replacement for strtolower()
 348      *
 349      * Uses mb_string extension if available
 350      *
 351      * @author Leo Feyer <leo@typolight.org>
 352      * @see    strtolower()
 353      * @see    utf8_strtoupper()
 354      *
 355      * @param string $string
 356      * @return string
 357      */
 358     function utf8_strtolower($string){
 359         if(UTF8_MBSTRING) {
 360             if (class_exists("Normalizer", $autoload = false))
 361                 return normalizer::normalize(mb_strtolower($string,'utf-8'));
 362             else
 363                 return (mb_strtolower($string,'utf-8'));
 364         }
 365         global $UTF8_UPPER_TO_LOWER;
 366         return strtr($string,$UTF8_UPPER_TO_LOWER);
 367     }
 368 }
 369
 370 if(!function_exists('utf8_strtoupper')){
 371     /**
 372      * This is a unicode aware replacement for strtoupper()
 373      *
 374      * Uses mb_string extension if available
 375      *
 376      * @author Leo Feyer <leo@typolight.org>
 377      * @see    strtoupper()
 378      * @see    utf8_strtoupper()
 379      *
 380      * @param string $string
 381      * @return string
 382      */
 383     function utf8_strtoupper($string){
 384         if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');
 385
 386         global $UTF8_LOWER_TO_UPPER;
 387         return strtr($string,$UTF8_LOWER_TO_UPPER);
 388     }
 389 }
 390
 391 if(!function_exists('utf8_ucfirst')){
 392     /**
 393      * UTF-8 aware alternative to ucfirst
 394      * Make a string's first character uppercase
 395      *
 396      * @author Harry Fuecks
 397      *
 398      * @param string $str
 399      * @return string with first character as upper case (if applicable)
 400      */
 401     function utf8_ucfirst($str){
 402         switch ( utf8_strlen($str) ) {
 403             case 0:
 404                 return '';
 405             case 1:
 406                 return utf8_strtoupper($str);
 407             default:
 408                 preg_match('/^(.{1})(.*)$/us', $str, $matches);
 409                 return utf8_strtoupper($matches[1]).$matches[2];
 410         }
 411     }
 412 }
 413
 414 if(!function_exists('utf8_ucwords')){
 415     /**
 416      * UTF-8 aware alternative to ucwords
 417      * Uppercase the first character of each word in a string
 418      *
 419      * @author Harry Fuecks
 420      * @see http://www.php.net/ucwords
 421      *
 422      * @param string $str
 423      * @return string with first char of each word uppercase
 424      */
 425     function utf8_ucwords($str) {
 426         // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
 427         // form feeds, horizontal tabs, vertical tabs, linefeeds and carriage returns
 428         // This corresponds to the definition of a "word" defined at http://www.php.net/ucwords
 429         $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';
 430
 431         return preg_replace_callback($pattern, 'utf8_ucwords_callback',$str);
 432     }
 433
 434     /**
 435      * Callback function for preg_replace_callback call in utf8_ucwords
 436      * You don't need to call this yourself
 437      *
 438      * @author Harry Fuecks
 439      * @see utf8_ucwords
 440      * @see utf8_strtoupper
 441      *
 442      * @param  array $matches matches corresponding to a single word
 443      * @return string with first char of the word in uppercase
 444      */
 445     function utf8_ucwords_callback($matches) {
 446         $leadingws = $matches[2];
 447         $ucfirst = utf8_strtoupper($matches[3]);
 448         $ucword = utf8_substr_replace(ltrim($matches[0]),$ucfirst,0,1);
 449         return $leadingws . $ucword;
 450     }
 451 }
 452
 453 if(!function_exists('utf8_deaccent')){
 454     /**
 455      * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 456      *
 457      * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 458      * letters. Default is to deaccent both cases ($case = 0)
 459      *
 460      * @author Andreas Gohr <andi@splitbrain.org>
 461      *
 462      * @param string $string
 463      * @param int $case
 464      * @return string
 465      */
 466     function utf8_deaccent($string,$case=0){
 467         if($case <= 0){
 468             global $UTF8_LOWER_ACCENTS;
 469             $string = strtr($string,$UTF8_LOWER_ACCENTS);
 470         }
 471         if($case >= 0){
 472             global $UTF8_UPPER_ACCENTS;
 473             $string = strtr($string,$UTF8_UPPER_ACCENTS);
 474         }
 475         return $string;
 476     }
 477 }
 478
 479 if(!function_exists('utf8_romanize')){
 480     /**
 481      * Romanize a non-latin string
 482      *
 483      * @author Andreas Gohr <andi@splitbrain.org>
 484      *
 485      * @param string $string
 486      * @return string
 487      */
 488     function utf8_romanize($string){
 489         if(utf8_isASCII($string)) return $string; //nothing to do
 490
 491         global $UTF8_ROMANIZATION;
 492         return strtr($string,$UTF8_ROMANIZATION);
 493     }
 494 }
 495
 496 if(!function_exists('utf8_stripspecials')){
 497     /**
 498      * Removes special characters (nonalphanumeric) from a UTF-8 string
 499      *
 500      * This function adds the controlchars 0x00 to 0x19 to the array of
 501      * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 502      *
 503      * @author Andreas Gohr <andi@splitbrain.org>
 504      *
 505      * @param  string $string     The UTF8 string to strip of special chars
 506      * @param  string $repl       Replace special with this string
 507      * @param  string $additional Additional chars to strip (used in regexp char class)
 508      * @return string
 509      */
 510     function utf8_stripspecials($string,$repl='',$additional=''){
 511         global $UTF8_SPECIAL_CHARS2;
 512
 513         static $specials = null;
 514         if(is_null($specials)){
 515             #$specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
 516             $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
 517         }
 518
 519         return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
 520     }
 521 }
 522
 523 if(!function_exists('utf8_strpos')){
 524     /**
 525      * This is an Unicode aware replacement for strpos
 526      *
 527      * @author Leo Feyer <leo@typolight.org>
 528      * @see    strpos()
 529      *
 530      * @param  string  $haystack
 531      * @param  string  $needle
 532      * @param  integer $offset
 533      * @return integer
 534      */
 535     function utf8_strpos($haystack, $needle, $offset=0){
 536         $comp = 0;
 537         $length = null;
 538
 539         while (is_null($length) || $length < $offset) {
 540             $pos = strpos($haystack, $needle, $offset + $comp);
 541
 542             if ($pos === false)
 543                 return false;
 544
 545             $length = utf8_strlen(substr($haystack, 0, $pos));
 546
 547             if ($length < $offset)
 548                 $comp = $pos - $length;
 549         }
 550
 551         return $length;
 552     }
 553 }
 554
 555 if(!function_exists('utf8_tohtml')){
 556     /**
 557      * Encodes UTF-8 characters to HTML entities
 558      *
 559      * @author Tom N Harris <tnharris@whoopdedo.org>
 560      * @author <vpribish at shopping dot com>
 561      * @link   http://www.php.net/manual/en/function.utf8-decode.php
 562      *
 563      * @param string $str
 564      * @return string
 565      */
 566     function utf8_tohtml ($str) {
 567         $ret = '';
 568         foreach (utf8_to_unicode($str) as $cp) {
 569             if ($cp < 0x80)
 570                 $ret .= chr($cp);
 571             elseif ($cp < 0x100)
 572                 $ret .= "&#$cp;";
 573             else
 574                 $ret .= '&#x'.dechex($cp).';';
 575         }
 576         return $ret;
 577     }
 578 }
 579
 580 if(!function_exists('utf8_unhtml')){
 581     /**
 582      * Decodes HTML entities to UTF-8 characters
 583      *
 584      * Convert any &#..; entity to a codepoint,
 585      * The entities flag defaults to only decoding numeric entities.
 586      * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
 587      * are handled as well. Avoids the problem that would occur if you
 588      * had to decode "&amp;#38;&#38;amp;#38;"
 589      *
 590      * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 591      * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 592      * what it should be                   -> "&#38;&amp#38;"
 593      *
 594      * @author Tom N Harris <tnharris@whoopdedo.org>
 595      *
 596      * @param  string  $str      UTF-8 encoded string
 597      * @param  boolean $entities Flag controlling decoding of named entities.
 598      * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
 599      */
 600     function utf8_unhtml($str, $entities=null) {
 601         static $decoder = null;
 602         if (is_null($decoder))
 603             $decoder = new utf8_entity_decoder();
 604         if (is_null($entities))
 605             return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
 606                                          'utf8_decode_numeric', $str);
 607         else
 608             return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
 609                                          array(&$decoder, 'decode'), $str);
 610     }
 611 }
 612
 613 if(!function_exists('utf8_decode_numeric')){
 614     /**
 615      * Decodes numeric HTML entities to their correct UTF-8 characters
 616      *
 617      * @param $ent string A numeric entity
 618      * @return string|false
 619      */
 620     function utf8_decode_numeric($ent) {
 621         switch ($ent[2]) {
 622             case 'X':
 623             case 'x':
 624                 $cp = hexdec($ent[3]);
 625                 break;
 626             default:
 627                 $cp = intval($ent[3]);
 628                 break;
 629         }
 630         return unicode_to_utf8(array($cp));
 631     }
 632 }
 633
 634 if(!class_exists('utf8_entity_decoder')){
 635     /**
 636      * Encapsulate HTML entity decoding tables
 637      */
 638     class utf8_entity_decoder {
 639         var $table;
 640
 641         /**
 642          * Initializes the decoding tables
 643          */
 644         function __construct() {
 645             $table = get_html_translation_table(HTML_ENTITIES);
 646             $table = array_flip($table);
 647             $this->table = array_map(array(&$this,'makeutf8'), $table);
 648         }
 649
 650         /**
 651          * Wrapper around unicode_to_utf8()
 652          *
 653          * @param string $c
 654          * @return string|false
 655          */
 656         function makeutf8($c) {
 657             return unicode_to_utf8(array(ord($c)));
 658         }
 659
 660         /**
 661          * Decodes any HTML entity to it's correct UTF-8 char equivalent
 662          *
 663          * @param string $ent An entity
 664          * @return string|false
 665          */
 666         function decode($ent) {
 667             if ($ent[1] == '#') {
 668                 return utf8_decode_numeric($ent);
 669             } elseif (array_key_exists($ent[0],$this->table)) {
 670                 return $this->table[$ent[0]];
 671             } else {
 672                 return $ent[0];
 673             }
 674         }
 675     }
 676 }
 677
 678 if(!function_exists('utf8_to_unicode')){
 679     /**
 680      * Takes an UTF-8 string and returns an array of ints representing the
 681      * Unicode characters. Astral planes are supported ie. the ints in the
 682      * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 683      * are not allowed.
 684      *
 685      * If $strict is set to true the function returns false if the input
 686      * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 687      * level E_USER_WARNING
 688      *
 689      * Note: this function has been modified slightly in this library to
 690      * trigger errors on encountering bad bytes
 691      *
 692      * @author <hsivonen@iki.fi>
 693      * @author Harry Fuecks <hfuecks@gmail.com>
 694      * @see    unicode_to_utf8
 695      * @link   http://hsivonen.iki.fi/php-utf8/
 696      * @link   http://sourceforge.net/projects/phputf8/
 697      *
 698      * @param  string  $str UTF-8 encoded string
 699      * @param  boolean $strict Check for invalid sequences?
 700      * @return mixed array of unicode code points or false if UTF-8 invalid
 701      */
 702     function utf8_to_unicode($str,$strict=false) {
 703         $mState = 0;     // cached expected number of octets after the current octet
 704                          // until the beginning of the next UTF8 character sequence
 705         $mUcs4  = 0;     // cached Unicode character
 706         $mBytes = 1;     // cached expected number of octets in the current sequence
 707
 708         $out = array();
 709
 710         $len = strlen($str);
 711
 712         for($i = 0; $i < $len; $i++) {
 713
 714             $in = ord($str{$i});
 715
 716             if ( $mState == 0) {
 717
 718                 // When mState is zero we expect either a US-ASCII character or a
 719                 // multi-octet sequence.
 720                 if (0 == (0x80 & ($in))) {
 721                     // US-ASCII, pass straight through.
 722                     $out[] = $in;
 723                     $mBytes = 1;
 724
 725                 } else if (0xC0 == (0xE0 & ($in))) {
 726                     // First octet of 2 octet sequence
 727                     $mUcs4 = ($in);
 728                     $mUcs4 = ($mUcs4 & 0x1F) << 6;
 729                     $mState = 1;
 730                     $mBytes = 2;
 731
 732                 } else if (0xE0 == (0xF0 & ($in))) {
 733                     // First octet of 3 octet sequence
 734                     $mUcs4 = ($in);
 735                     $mUcs4 = ($mUcs4 & 0x0F) << 12;
 736                     $mState = 2;
 737                     $mBytes = 3;
 738
 739                 } else if (0xF0 == (0xF8 & ($in))) {
 740                     // First octet of 4 octet sequence
 741                     $mUcs4 = ($in);
 742                     $mUcs4 = ($mUcs4 & 0x07) << 18;
 743                     $mState = 3;
 744                     $mBytes = 4;
 745
 746                 } else if (0xF8 == (0xFC & ($in))) {
 747                     /* First octet of 5 octet sequence.
 748                      *
 749                      * This is illegal because the encoded codepoint must be either
 750                      * (a) not the shortest form or
 751                      * (b) outside the Unicode range of 0-0x10FFFF.
 752                      * Rather than trying to resynchronize, we will carry on until the end
 753                      * of the sequence and let the later error handling code catch it.
 754                      */
 755                     $mUcs4 = ($in);
 756                     $mUcs4 = ($mUcs4 & 0x03) << 24;
 757                     $mState = 4;
 758                     $mBytes = 5;
 759
 760                 } else if (0xFC == (0xFE & ($in))) {
 761                     // First octet of 6 octet sequence, see comments for 5 octet sequence.
 762                     $mUcs4 = ($in);
 763                     $mUcs4 = ($mUcs4 & 1) << 30;
 764                     $mState = 5;
 765                     $mBytes = 6;
 766
 767                 } elseif($strict) {
 768                     /* Current octet is neither in the US-ASCII range nor a legal first
 769                      * octet of a multi-octet sequence.
 770                      */
 771                     trigger_error(
 772                             'utf8_to_unicode: Illegal sequence identifier '.
 773                                 'in UTF-8 at byte '.$i,
 774                             E_USER_WARNING
 775                         );
 776                     return false;
 777
 778                 }
 779
 780             } else {
 781
 782                 // When mState is non-zero, we expect a continuation of the multi-octet
 783                 // sequence
 784                 if (0x80 == (0xC0 & ($in))) {
 785
 786                     // Legal continuation.
 787                     $shift = ($mState - 1) * 6;
 788                     $tmp = $in;
 789                     $tmp = ($tmp & 0x0000003F) << $shift;
 790                     $mUcs4 |= $tmp;
 791
 792                     /**
 793                      * End of the multi-octet sequence. mUcs4 now contains the final
 794                      * Unicode codepoint to be output
 795                      */
 796                     if (0 == --$mState) {
 797
 798                         /*
 799                          * Check for illegal sequences and codepoints.
 800                          */
 801                         // From Unicode 3.1, non-shortest form is illegal
 802                         if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 803                             ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 804                             ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
 805                             (4 < $mBytes) ||
 806                             // From Unicode 3.2, surrogate characters are illegal
 807                             (($mUcs4 & 0xFFFFF800) == 0xD800) ||
 808                             // Codepoints outside the Unicode range are illegal
 809                             ($mUcs4 > 0x10FFFF)) {
 810
 811                             if($strict){
 812                                 trigger_error(
 813                                         'utf8_to_unicode: Illegal sequence or codepoint '.
 814                                             'in UTF-8 at byte '.$i,
 815                                         E_USER_WARNING
 816                                     );
 817
 818                                 return false;
 819                             }
 820
 821                         }
 822
 823                         if (0xFEFF != $mUcs4) {
 824                             // BOM is legal but we don't want to output it
 825                             $out[] = $mUcs4;
 826                         }
 827
 828                         //initialize UTF8 cache
 829                         $mState = 0;
 830                         $mUcs4  = 0;
 831                         $mBytes = 1;
 832                     }
 833
 834                 } elseif($strict) {
 835                     /**
 836                      *((0xC0 & (*in) != 0x80) && (mState != 0))
 837                      * Incomplete multi-octet sequence.
 838                      */
 839                     trigger_error(
 840                             'utf8_to_unicode: Incomplete multi-octet '.
 841                             '   sequence in UTF-8 at byte '.$i,
 842                             E_USER_WARNING
 843                         );
 844
 845                     return false;
 846                 }
 847             }
 848         }
 849         return $out;
 850     }
 851 }
 852
 853 if(!function_exists('unicode_to_utf8')){
 854     /**
 855      * Takes an array of ints representing the Unicode characters and returns
 856      * a UTF-8 string. Astral planes are supported ie. the ints in the
 857      * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 858      * are not allowed.
 859      *
 860      * If $strict is set to true the function returns false if the input
 861      * array contains ints that represent surrogates or are outside the
 862      * Unicode range and raises a PHP error at level E_USER_WARNING
 863      *
 864      * Note: this function has been modified slightly in this library to use
 865      * output buffering to concatenate the UTF-8 string (faster) as well as
 866      * reference the array by it's keys
 867      *
 868      * @param  array $arr of unicode code points representing a string
 869      * @param  boolean $strict Check for invalid sequences?
 870      * @return string|false UTF-8 string or false if array contains invalid code points
 871      *
 872      * @author <hsivonen@iki.fi>
 873      * @author Harry Fuecks <hfuecks@gmail.com>
 874      * @see    utf8_to_unicode
 875      * @link   http://hsivonen.iki.fi/php-utf8/
 876      * @link   http://sourceforge.net/projects/phputf8/
 877      */
 878     function unicode_to_utf8($arr,$strict=false) {
 879         if (!is_array($arr)) return '';
 880         ob_start();
 881
 882         foreach (array_keys($arr) as $k) {
 883
 884             if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) {
 885                 # ASCII range (including control chars)
 886
 887                 echo chr($arr[$k]);
 888
 889             } else if ($arr[$k] <= 0x07ff) {
 890                 # 2 byte sequence
 891
 892                 echo chr(0xc0 | ($arr[$k] >> 6));
 893                 echo chr(0x80 | ($arr[$k] & 0x003f));
 894
 895             } else if($arr[$k] == 0xFEFF) {
 896                 # Byte order mark (skip)
 897
 898                 // nop -- zap the BOM
 899
 900             } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {
 901                 # Test for illegal surrogates
 902
 903                 // found a surrogate
 904                 if($strict){
 905                     trigger_error(
 906                         'unicode_to_utf8: Illegal surrogate '.
 907                             'at index: '.$k.', value: '.$arr[$k],
 908                         E_USER_WARNING
 909                         );
 910                     return false;
 911                 }
 912
 913             } else if ($arr[$k] <= 0xffff) {
 914                 # 3 byte sequence
 915
 916                 echo chr(0xe0 | ($arr[$k] >> 12));
 917                 echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
 918                 echo chr(0x80 | ($arr[$k] & 0x003f));
 919
 920             } else if ($arr[$k] <= 0x10ffff) {
 921                 # 4 byte sequence
 922
 923                 echo chr(0xf0 | ($arr[$k] >> 18));
 924                 echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
 925                 echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
 926                 echo chr(0x80 | ($arr[$k] & 0x3f));
 927
 928             } elseif($strict) {
 929
 930                 trigger_error(
 931                     'unicode_to_utf8: Codepoint out of Unicode range '.
 932                         'at index: '.$k.', value: '.$arr[$k],
 933                     E_USER_WARNING
 934                     );
 935
 936                 // out of range
 937                 return false;
 938             }
 939         }
 940
 941         $result = ob_get_contents();
 942         ob_end_clean();
 943         return $result;
 944     }
 945 }
 946
 947 if(!function_exists('utf8_to_utf16be')){
 948     /**
 949      * UTF-8 to UTF-16BE conversion.
 950      *
 951      * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 952      *
 953      * @param string $str
 954      * @param bool $bom
 955      * @return string
 956      */
 957     function utf8_to_utf16be(&$str, $bom = false) {
 958         $out = $bom ? "\xFE\xFF" : '';
 959         if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');
 960
 961         $uni = utf8_to_unicode($str);
 962         foreach($uni as $cp){
 963             $out .= pack('n',$cp);
 964         }
 965         return $out;
 966     }
 967 }
 968
 969 if(!function_exists('utf16be_to_utf8')){
 970     /**
 971      * UTF-8 to UTF-16BE conversion.
 972      *
 973      * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 974      *
 975      * @param string $str
 976      * @return false|string
 977      */
 978     function utf16be_to_utf8(&$str) {
 979         $uni = unpack('n*',$str);
 980         return unicode_to_utf8($uni);
 981     }
 982 }
 983
 984 if(!function_exists('utf8_bad_replace')){
 985     /**
 986      * Replace bad bytes with an alternative character
 987      *
 988      * ASCII character is recommended for replacement char
 989      *
 990      * PCRE Pattern to locate bad bytes in a UTF-8 string
 991      * Comes from W3 FAQ: Multilingual Forms
 992      * Note: modified to include full ASCII range including control chars
 993      *
 994      * @author Harry Fuecks <hfuecks@gmail.com>
 995      * @see http://www.w3.org/International/questions/qa-forms-utf-8
 996      *
 997      * @param string $str to search
 998      * @param string $replace to replace bad bytes with (defaults to '?') - use ASCII
 999      * @return string
1000      */
1001     function utf8_bad_replace($str, $replace = '') {
1002         $UTF8_BAD =
1003          '([\x00-\x7F]'.                          # ASCII (including control chars)
1004          '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
1005          '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
1006          '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
1007          '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
1008          '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
1009          '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
1010          '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
1011          '|(.{1}))';                              # invalid byte
1012         ob_start();
1013         while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
1014             if ( !isset($matches[2])) {
1015                 echo $matches[0];
1016             } else {
1017                 echo $replace;
1018             }
1019             $str = substr($str,strlen($matches[0]));
1020         }
1021         $result = ob_get_contents();
1022         ob_end_clean();
1023         return $result;
1024     }
1025 }
1026
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index inside a UTF-8 string onto a character boundary
     *
     * An index at or below 0 yields 0, an index at or beyond the end of the
     * string yields the string length. Any other index is moved for as long
     * as it points at a continuation byte (bit pattern 10xxxxxx).
     *
     * @param string $str   utf8 character string
     * @param int    $i     byte index into $str
     * @param bool   $next  direction to search for boundary,
     *                           false = backwards (start of the current character)
     *                           true = forwards (start of the next character)
     *
     * @return int            byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {

        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        if ($next) {
            // walk forwards until a non-continuation byte or the end is hit
            for (; $i < $limit; $i++) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
        } else {
            // walk backwards until a non-continuation byte or index 0 is hit
            for (; $i; $i--) {
                if ((ord($str[$i]) & 0xC0) != 0x80) break;
            }
        }

        return $i;
    }
}
1057
1058 // only needed if no mb_string available
1059 if(!UTF8_MBSTRING){
    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps lower case letters to their corresponding
     * upper case letters in UTF-8
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
1068     global $UTF8_LOWER_TO_UPPER;
1069     if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
1070             "ｚ"=>"Ｚ","ｙ"=>"Ｙ","ｘ"=>"Ｘ","ｗ"=>"Ｗ","ｖ"=>"Ｖ","ｕ"=>"Ｕ","ｔ"=>"Ｔ","ｓ"=>"Ｓ","ｒ"=>"Ｒ","ｑ"=>"Ｑ",
1071             "ｐ"=>"Ｐ","ｏ"=>"Ｏ","ｎ"=>"Ｎ","ｍ"=>"Ｍ","ｌ"=>"Ｌ","ｋ"=>"Ｋ","ｊ"=>"Ｊ","ｉ"=>"Ｉ","ｈ"=>"Ｈ","ｇ"=>"Ｇ",
1072             "ｆ"=>"Ｆ","ｅ"=>"Ｅ","ｄ"=>"Ｄ","ｃ"=>"Ｃ","ｂ"=>"Ｂ","ａ"=>"Ａ","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
1073             "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
1074             "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
1075             "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
1076             "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
1077             "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
1078             "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
1079             "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
1080             "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1081             "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1082             "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1083             "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1084             "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1085             "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1086             "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1087             "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1088             "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1089             "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1090             "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1091             "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1092             "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1093             "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1094             "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1095             "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1096             "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1097             "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1098             "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1099             "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1100             "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1101             "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1102             "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1103             "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1104             "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1105             "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1106             "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1107             "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1108             "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1109             "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1110             "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1111             "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1112             "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1113             "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1114             "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1115             "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1116             "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1117             "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1118             "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1119             "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1120             "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","ǳ"=>"ǲ","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1121             "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1122             "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","ǌ"=>"ǋ","ǉ"=>"ǈ","ǆ"=>"ǅ","ƿ"=>"Ƿ",
1123             "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1124             "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1125             "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1126             "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1127             "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1128             "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ĳ"=>"Ĳ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1129             "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1130             "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1131             "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1132             "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1133             "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1134             "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1135             "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1136             "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
1137                 );
1138
    /**
     * UTF-8 Case lookup table
     *
     * This lookup table maps upper case letters to their corresponding
     * lower case letters in UTF-8
     *
     * Note: some upper case letters occur as a key more than once (e.g. "Σ");
     * PHP keeps the entry that is listed last.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     */
1147     global $UTF8_UPPER_TO_LOWER;
1148     if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
1149             "Ｚ"=>"ｚ","Ｙ"=>"ｙ","Ｘ"=>"ｘ","Ｗ"=>"ｗ","Ｖ"=>"ｖ","Ｕ"=>"ｕ","Ｔ"=>"ｔ","Ｓ"=>"ｓ","Ｒ"=>"ｒ","Ｑ"=>"ｑ",
1150             "Ｐ"=>"ｐ","Ｏ"=>"ｏ","Ｎ"=>"ｎ","Ｍ"=>"ｍ","Ｌ"=>"ｌ","Ｋ"=>"ｋ","Ｊ"=>"ｊ","Ｉ"=>"ｉ","Ｈ"=>"ｈ","Ｇ"=>"ｇ",
1151             "Ｆ"=>"ｆ","Ｅ"=>"ｅ","Ｄ"=>"ｄ","Ｃ"=>"ｃ","Ｂ"=>"ｂ","Ａ"=>"ａ","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
1152             "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
1153             "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
1154             "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
1155             "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
1156             "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
1157             "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
1158             "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
1159             "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
1160             "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
1161             "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
1162             "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
1163             "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
1164             "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
1165             "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
1166             "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
1167             "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
1168             "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
1169             "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
1170             "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
1171             "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
1172             "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
1173             "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
1174             "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
1175             "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
1176             "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
1177             "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
1178             "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
1179             "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1180             "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1181             "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1182             "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1183             "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1184             "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1185             "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1186             "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1187             "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1188             "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1189             "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1190             "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1191             "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1192             "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Σ"=>"ς","Ρ"=>"ρ","Π"=>"π",
1193             "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1194             "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1195             "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1196             "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1197             "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1198             "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1199             "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","ǲ"=>"ǳ","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1200             "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1201             "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","ǋ"=>"ǌ","ǈ"=>"ǉ","ǅ"=>"ǆ","Ƿ"=>"ƿ",
1202             "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1203             "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1204             "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1205             "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1206             "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1207             "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","Ĳ"=>"ĳ","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1208             "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1209             "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1210             "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1211             "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1212             "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1213             "Á"=>"á","À"=>"à","Μ"=>"µ","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
1214             "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1215             "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1216                 );
1217 }; // end of case lookup tables
1218
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented characters to replacements from the 7-bit
 * ASCII range. It contains lower case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
1228 global $UTF8_LOWER_ACCENTS;
1229 if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
1230   'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1231   'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1232   'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1233   'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1234   'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1235   'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1236   'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1237   'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1238   'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1239   'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1240   'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1241   'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1242   'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1243   'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1244   'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1245 );
1246
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented characters to replacements from the 7-bit
 * ASCII range. It contains upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
1256 global $UTF8_UPPER_ACCENTS;
1257 if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
1258   'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1259   'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1260   'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1261   'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1262   'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1263   'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1264   'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1265   'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1266   'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1267   'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1268   'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1269   'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1270   'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1271   'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1272   'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1273 );
1274
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of non-alphanum
 * characters in UTF-8. It's not perfect but should match most cases of special
 * chars.
 *
 * Every entry is a Unicode code point (not a UTF-8 byte sequence).
 *
 * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
  0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
  0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
          0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
  0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
  0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
  0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
  0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
  0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
  0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
  0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
  0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
  0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
  0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
  0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
  0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
  0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
  0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
  0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
  0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
  0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
  0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
  0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
  0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
  0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
  0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
  0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
  0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
  0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
  0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
  0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
  0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
  0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
  0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
  0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
  0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
  0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
  0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
  0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
  0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
  0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
  0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
  0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
  0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
  0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
  0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
  0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
  0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
  0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
  0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
  0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
  0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
  0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
          0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
  0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
  0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
  0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
  0xffeb, 0xffec, 0xffed, 0xffee,
  0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
  0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
  0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
  0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
  // non-breaking and invisible spaces: figure space, narrow no-break space,
  // word joiner, zero width no-break space (no-break space 0x00a0 is listed above).
  // These used to be written as UTF-8 byte sequences (0xe28087, ...), which
  // are not code points.
  0x2007, 0x202f, 0x2060, 0xfeff,
);
1356
// The same special characters as the code point list above, as one UTF-8 encoded string
1358 global $UTF8_SPECIAL_CHARS2;
1359 if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1360     "\x1A".'\e\x1c\x1d\x1e\x1f !"#$%&\'()+,/;<=>?@[\]^`{|}~\x7f�'.
1361     '� ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½�'.
1362     '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1363     '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1364     '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1365     '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1366     '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1367     '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1368     '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1369     '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1370     '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1371     '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1372     '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1373     '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1374     '➷➸➹➺➻➼➽➾'.
1375     '　、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1376     '�'.
1377     '�ﹼﹽ'.
1378     '！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？＠［＼］＾｀｛｜｝～'.
1379     '｟｠｡｢｣､･￠￡￢￣￤￥￦￨￩￪￫￬￭￮'.
1380     '𝛼𝛽𝛾𝛿𝜀𝜁𝜂𝜃𝜄𝜅𝜆𝜇𝜈𝜉𝜊𝜋𝜌𝜍𝜎𝜏𝜐𝜑𝜒𝜓𝜔𝜕𝜖𝜗𝜘𝜙𝜚𝜛'.
1381     '   ⁠';
1382
/**
 * Romanization lookup table
 *
 * This lookup table provides a way to transform strings written in a language
 * that is not based upon latin letters into plain ASCII.
 *
 * Please note: this is not a scientific transliteration table. It only works
 * one way, from non-latin to ASCII, and it works by simple character replacement
 * only. Specialities of each language are not supported.
 *
 * Keys are UTF-8 encoded source sequences (one or more characters), values are
 * their replacements; an empty value drops the character. A few keys are
 * listed more than once (e.g. 'ウェー'); PHP keeps the value of the last
 * occurrence, so the later row is the effective one.
 *
 * The table is only built when $UTF8_ROMANIZATION is still empty, so a table
 * that was defined earlier is left untouched.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Vitaly Blokhin <vitinfo@vitn.com>
 * @link   http://www.uconv.com/translit.htm
 * @author Bisqwit <bisqwit@iki.fi>
 * @link   http://kanjidict.stc.cx/hiragana.php?src=2
 * @link   http://www.translatum.gr/converter/greek-transliteration.htm
 * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
 * @link   http://www.btranslations.com/resources/romanization/korean.asp
 * @author Arthit Suriyawongkul <arthit@gmail.com>
 * @author Denis Scheither <amorphis@uni-bremen.de>
 * @author Eivind Morland <eivind.morland@gmail.com>
 */
global $UTF8_ROMANIZATION;
if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
  // scandinavian - differs from what we do in deaccent
  'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',

  //russian cyrillic
  'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
  'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
  'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
  'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
  'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
  'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
  'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
  'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
  'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
  // Ukrainian cyrillic
  'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
  // Georgian
  'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
  'ი'=>'i','კ'=>'k','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
  'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
  'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
  'ჰ'=>'xh',
  //Sanskrit
  'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
  'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
  'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
  'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
  'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
  'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
  'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
  //Sanskrit diacritics
  'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
  'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
  'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
  'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
  //Hebrew
  'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
  'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
  'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
  'ש'=>'sh','ת'=>'t',
  //Arabic
  'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
  'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
  'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
  'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',

  // Japanese characters  (last update: 2008-05-09)

  // Japanese hiragana

  // 3 character syllables, っ doubles the consonant after
  'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
  'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
  'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
  'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
  // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
  'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
  'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
  'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
  'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
  'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
  'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',

  // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
  'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
  'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',

  // 2 character syllables - normal
  'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
  'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
  'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
  'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
  'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
  'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
  'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
  'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
  'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
  'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
  'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
  'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
  'うぇ'=>'we','うぃ'=>'wi',
  'いぇ'=>'ye',

  // 2 character syllables, っ doubles the consonant after
  'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
  'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
  'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
  'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
  'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
  'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
  'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
  'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
  'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
  'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
  'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',

  // 1 character syllables
  'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
  'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
  'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
  'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
  'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
  'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
  'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
  'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
  'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
  'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
  'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
  'わ'=>'wa','を'=>'wo',
  'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
  'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
  // old characters
  'ゑ'=>'we','ゐ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
  // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',

  // never seen one of those (disabled for the moment)
  // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
  // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
  // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
  // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
  // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
  // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
  // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
  // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
  // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
  // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
  // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
  // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
  // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
  // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',

  // 'spare' characters from other romanization systems
  // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
  // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
  // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
  // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
  //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
  //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
  //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
  //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
  //'し'=>'ci','い'=>'yi','ぢ'=>'dzi',
  //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',


  // Japanese katakana

  // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
  'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
  'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
  'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
  'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
  'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
  'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
  'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
  'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
  'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
  'ッティー'=>'ttii',
  'ッヂィー'=>'ddii',

  // 3 character syllables - doubled vowels
  'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
  // フェー/フィー are repeated with the same values as the row above, mirroring
  // the 2 character rows below, so the later duplicate no longer overrides them
  'フャー'=>'fyaa','フェー'=>'fee','フィー'=>'fii','フョー'=>'fyoo','フュー'=>'fyuu',
  'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
  'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
  'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
  'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
  'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
  'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
  'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
  'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
  'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
  'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
  'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
  'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
  'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
  'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
  'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
  'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
  'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
  'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
  'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
  'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
  'ウェー'=>'wee','ウィー'=>'wii',
  'イェー'=>'yee',
  'ティー'=>'tii',
  'ヂィー'=>'dii',

  // 3 character syllables - doubled consonants
  'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
  'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
  'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
  'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
  'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
  'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
  'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
  'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
  'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
  'ッティ'=>'tti',
  'ッヂィ'=>'ddi',

  // 3 character syllables - doubled vowel and consonants
  'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
  'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
  'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
  'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
  'ッマー'=>'mmaa','ッメー'=>'mmee','ッミー'=>'mmii','ッモー'=>'mmoo','ッムー'=>'mmuu',
  'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
  'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
  'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
  'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
  // ッチー repeats the key from the 4 character section; the value must match it
  'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'cchii','ットー'=>'ttoo','ッツー'=>'ttsuu',
  'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',

  // 2 character syllables - normal
  'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
  // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
  'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
  'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
  'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
  'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
  'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
  'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
  'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
  'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
  'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
  'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
  'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
  'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
  'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
  'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
  // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
  'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
  'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
  'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
  'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
  'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
  'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
  'ウェ'=>'we','ウィ'=>'wi',
  'イェ'=>'ye',
  'ティ'=>'ti',
  'ヂィ'=>'di',

  // 2 character syllables - doubled vowel
  'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
  'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
  'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
  'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
  'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
  'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
  'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
  'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
  'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
  'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
  'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
  'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
  'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
  'ワー'=>'waa','ヲー'=>'woo',
  'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
  'ヵー'=>'kaa','ヶー'=>'kee',
  // old characters
  'ヱー'=>'wee','ヰー'=>'wii',

  // separate katakana 'n'
  'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
  'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',

  // 2 character syllables - doubled consonants
  'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
  'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
  'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
  'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
  'ッマ'=>'mma','ッメ'=>'mme','ッミ'=>'mmi','ッモ'=>'mmo','ッム'=>'mmu',
  'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
  'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
  'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
  'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
  'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
  'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',

  // 1 character syllables
  'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
  'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
  'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
  'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
  'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
  'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
  'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
  'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
  'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
  'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
  'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
  'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
  'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
  'ワ'=>'wa','ヲ'=>'wo',
  'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
  'ヵ'=>'ka','ヶ'=>'ke',
  // old characters
  'ヱ'=>'we','ヰ'=>'wi',

  //  convert what's left (probably only kicks in when something's missing above)
  'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
  'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',

  // special characters
  '・'=>'_','、'=>'_',
  'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise

  // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
  // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
  //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
  // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
  // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
  //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
  //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
  // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
  // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
  //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
  //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
  //'シ'=>'ci','フ'=>'hu','シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',

  // "Greeklish"
  'Γ'=>'G','Δ'=>'D','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
  'γ'=>'g','δ'=>'d','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',

  // Thai
  'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
  'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
  'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
  'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
  'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
  'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
  'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
  // sara i / sara ii => i, sara ue / sara uee => ue
  'ิ'=>'i','ี'=>'i','ึ'=>'ue','ื'=>'ue','ุ'=>'u','ู'=>'u',
  'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
  'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
  'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
  'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
  'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
  '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
  '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
  'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
  '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
  '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',

  // Korean
  'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
  'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
  'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
  'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
  'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'yo',
  'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
);
1762
1763