Release 2015-08-10 "Detritus"
[dokuwiki.git] / inc / utf8.php
blob2b6a0c4989752575155ad6565fdd47a821e87d4a
1 <?php
2 /**
3 * UTF8 helper functions
5 * @license LGPL 2.1 (http://www.gnu.org/copyleft/lesser.html)
6 * @author Andreas Gohr <andi@splitbrain.org>
7 */
/**
 * mbstring detection
 *
 * UTF8_MBSTRING becomes 1 when mb_substr() is available and the opt-out
 * constant UTF8_NOMBSTRING has not been defined, otherwise 0. An already
 * existing definition of UTF8_MBSTRING is left alone.
 */
if (!defined('UTF8_MBSTRING')) {
    define('UTF8_MBSTRING', (function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')) ? 1 : 0);
}

/**
 * PCRE UTF-8 detection
 *
 * True when a pattern using the /u modifier matches a single multi-byte
 * character. Many functions in this file rely on /u patterns.
 */
if (!defined('UTF8_PREGSUPPORT')) {
    define('UTF8_PREGSUPPORT', (@preg_match('/^.$/u', 'ñ')) ? true : false);
}

/**
 * PCRE Unicode property detection
 *
 * True when \pL is understood in /u mode. Not needed by the functions in
 * this file, but useful for UTF-8 aware applications.
 */
if (!defined('UTF8_PROPERTYSUPPORT')) {
    define('UTF8_PROPERTYSUPPORT', (@preg_match('/^\pL$/u', 'ñ')) ? true : false);
}

// make the mb_* functions operate on UTF-8 by default
if (UTF8_MBSTRING) {
    mb_internal_encoding('UTF-8');
}
if(!function_exists('utf8_isASCII')){
    /**
     * Tell whether a string consists of 7bit ASCII bytes only
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str
     * @return bool true when no byte outside 0x00-0x7F was found
     */
    function utf8_isASCII($str){
        // preg_match() yields 1 only when a high byte is present
        $foundHighByte = preg_match('/(?:[^\x00-\x7F])/', $str);
        return $foundHighByte !== 1;
    }
}
if(!function_exists('utf8_strip')){
    /**
     * Strips all highbyte chars
     *
     * Every byte with a value of 128 or above is dropped, so the result is a
     * pure ASCII7 string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string
     */
    function utf8_strip($str){
        // Byte-wise pattern (no /u modifier): removes each single byte >= 0x80.
        // Replaces the former loop over $str{$i}, a syntax that no longer
        // parses on PHP 8.
        return preg_replace('/[^\x00-\x7F]/', '', (string) $str);
    }
}
if(!function_exists('utf8_check')){
    /**
     * Structural check whether a string looks like UTF-8
     *
     * Only the lead byte / continuation byte layout is verified (1 to 6 byte
     * forms); overlong forms or invalid code points are not rejected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://www.php.net/manual/en/function.utf8-encode.php
     *
     * @param string $Str
     * @return bool
     */
    function utf8_check($Str) {
        $total = strlen($Str);
        $pos   = 0;

        while ($pos < $total) {
            $lead = ord($Str[$pos]);

            if ($lead < 0x80) {                      // 0bbbbbbb
                $pos++;
                continue;
            }

            // number of continuation bytes announced by the lead byte
            if (($lead & 0xE0) == 0xC0) {            // 110bbbbb
                $trailing = 1;
            } elseif (($lead & 0xF0) == 0xE0) {      // 1110bbbb
                $trailing = 2;
            } elseif (($lead & 0xF8) == 0xF0) {      // 11110bbb
                $trailing = 3;
            } elseif (($lead & 0xFC) == 0xF8) {      // 111110bb
                $trailing = 4;
            } elseif (($lead & 0xFE) == 0xFC) {      // 1111110b
                $trailing = 5;
            } else {
                return false;                        // not a valid lead byte
            }

            // each announced byte must exist and look like 10bbbbbb
            for ($seen = 0; $seen < $trailing; $seen++) {
                $pos++;
                if ($pos == $total) return false;
                if ((ord($Str[$pos]) & 0xC0) != 0x80) return false;
            }

            $pos++;
        }

        return true;
    }
}
if(!function_exists('utf8_basename')){
    /**
     * basename() replacement that does not depend on the locale
     *
     * Works purely on bytes and treats both '/' and '\' as separators, which
     * avoids the locale related problem of PHP's own basename().
     *
     * @see basename()
     * @link   https://bugs.php.net/bug.php?id=37738
     *
     * @param string $path     A path
     * @param string $suffix   If the name component ends in suffix this will also be cut off
     * @return string
     */
    function utf8_basename($path, $suffix=''){
        // leading and trailing separators never belong to the name
        $trimmed = trim($path,'\\/');

        // position of the last separator of either kind
        $lastSlash     = strrpos($trimmed, '/');
        $lastBackslash = strrpos($trimmed, '\\');
        $cut           = max($lastSlash, $lastBackslash);

        $name = $cut ? substr($trimmed, $cut+1) : $trimmed;

        $suffixLength = strlen($suffix);
        if (!$suffixLength) return $name;

        if (substr($name, -$suffixLength) == $suffix) {
            return substr($name, 0, -$suffixLength);
        }
        return $name;
    }
}
if(!function_exists('utf8_strlen')){
    /**
     * Character counting replacement for strlen()
     *
     * utf8_decode() turns every character into exactly one byte (characters
     * outside ISO-8859-1 become '?'), so the byte length of its result is the
     * number of characters in the input.
     *
     * @author <chernyshevsky at hotmail dot com>
     * @see    strlen()
     * @see    utf8_decode()
     *
     * @param string $string
     * @return int
     */
    function utf8_strlen($string){
        $oneBytePerChar = utf8_decode($string);
        return strlen($oneBytePerChar);
    }
}
if(!function_exists('utf8_substr')){
    /**
     * Character based alternative to substr()
     *
     * Returns the part of $str selected by a character offset and an optional
     * character length. Uses mb_substr() when UTF8_MBSTRING is set, otherwise
     * a PCRE pattern with the 'u' modifier is built.
     *
     * Notes on the PCRE fallback:
     *  - PCRE repetitions must be below 65536, so large offsets/lengths are
     *    expressed as repeated groups of 65535 characters plus a remainder
     *  - like mb_substr() an empty string is returned where substr() might
     *    return false
     *  - counting the characters is comparatively expensive, so it is only
     *    done for negative offsets or when a length was given
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @author Chris Smith <chris@jalakai.co.uk>
     *
     * @param string $str
     * @param int    $offset  number of UTF-8 characters offset (from left)
     * @param int    $length  (optional) length in UTF-8 characters from offset
     * @return string
     */
    function utf8_substr($str, $offset, $length = null) {
        if (UTF8_MBSTRING) {
            return ($length === null)
                ? mb_substr($str, $offset)
                : mb_substr($str, $offset, $length);
        }

        // normalise the parameter types once
        $str    = (string) $str;
        $offset = (int) $offset;
        if ($length !== null) $length = (int) $length;

        // trivial cases
        if ($length === 0) return '';
        if ($offset < 0 && $length < 0 && $length < $offset) return '';

        // pattern fragment matching exactly $count characters
        $chars = function ($count) {
            $blocks    = (int) ($count / 65535);
            $remainder = $count % 65535;
            $prefix    = $blocks ? '(?:.{65535}){' . $blocks . '}' : '';
            return $prefix . '.{' . $remainder . '}';
        };

        // turn a negative offset into one counted from the left
        $charCount = null;
        if ($offset < 0) {
            $charCount = strlen(utf8_decode($str));
            $offset    = max(0, $charCount + $offset);
        }

        // skip $offset characters without capturing them
        $head = ($offset > 0) ? '^(?:' . $chars($offset) . ')' : '^';

        if ($length === null) {
            // everything up to the end of the string
            $tail = '(.*)$';
        } else {
            if ($charCount === null) $charCount = strlen(utf8_decode($str));
            if ($offset > $charCount) return '';

            if ($length > 0) {
                // never ask for more characters than are left
                $length = min($charCount - $offset, $length);
                $tail   = '(' . $chars($length) . ')';
            } else {
                // negative length: capture all but the last -$length characters
                if ($length < ($offset - $charCount)) return '';
                $tail = '(.*)(?:' . $chars(-$length) . ')$';
            }
        }

        if (!preg_match('#' . $head . $tail . '#us', $str, $match)) return '';
        return $match[1];
    }
}
if(!function_exists('utf8_substr_replace')){
    /**
     * Character based replacement for substr_replace()
     *
     * The result is built from the first $start characters of $string (only
     * when $start is positive), the replacement, and whatever follows
     * character position $start+$length.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    substr_replace()
     *
     * @param string $string      input string
     * @param string $replacement the replacement
     * @param int    $start       the replacing will begin at the start'th offset into string.
     * @param int    $length      number of characters to replace; with zero the
     *                            replacement is inserted at the start offset
     * @return string
     */
    function utf8_substr_replace($string, $replacement, $start , $length=0 ){
        $before = ($start > 0) ? utf8_substr($string, 0, $start) : '';
        $after  = utf8_substr($string, $start + $length);

        return $before . $replacement . $after;
    }
}
if(!function_exists('utf8_ltrim')){
    /**
     * Unicode aware replacement for ltrim()
     *
     * Without a charlist this simply calls ltrim(). With a charlist every
     * leading character contained in it is removed; the characters are taken
     * literally (no ranges).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    ltrim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_ltrim($str,$charlist=''){
        if($charlist == '') return ltrim($str);

        // Quote the charlist for use in a character class. preg_quote() also
        // escapes '^', which the former hand written quoting left untouched:
        // a charlist starting with '^' produced a negated or broken class.
        $class = preg_quote($charlist, '/');

        return preg_replace('/^['.$class.']+/u','',$str);
    }
}
if(!function_exists('utf8_rtrim')){
    /**
     * Unicode aware replacement for rtrim()
     *
     * Without a charlist this simply calls rtrim(). With a charlist every
     * trailing character contained in it is removed; the characters are taken
     * literally (no ranges).
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    rtrim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_rtrim($str,$charlist=''){
        if($charlist == '') return rtrim($str);

        // Quote the charlist for use in a character class. preg_quote() also
        // escapes '^', which the former hand written quoting left untouched:
        // a charlist starting with '^' produced a negated or broken class.
        $class = preg_quote($charlist, '/');

        return preg_replace('/['.$class.']+$/u','',$str);
    }
}
if(!function_exists('utf8_trim')){
    /**
     * Unicode aware replacement for trim()
     *
     * Falls back to trim() when no charlist is given, otherwise strips the
     * listed characters from the end and then from the start of the string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @see    trim()
     *
     * @param  string $str
     * @param  string $charlist
     * @return string
     */
    function utf8_trim($str,$charlist='') {
        if ($charlist == '') {
            return trim($str);
        }

        $rightTrimmed = utf8_rtrim($str, $charlist);
        return utf8_ltrim($rightTrimmed, $charlist);
    }
}
if(!function_exists('utf8_strtolower')){
    /**
     * Unicode aware replacement for strtolower()
     *
     * With mbstring the string is lowercased by mb_strtolower() and, when the
     * Normalizer class is already loaded, normalized afterwards. Without
     * mbstring the global $UTF8_UPPER_TO_LOWER lookup table is applied.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtolower()
     * @see    utf8_strtoupper()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtolower($string){
        if (!UTF8_MBSTRING) {
            global $UTF8_UPPER_TO_LOWER;
            return strtr($string, $UTF8_UPPER_TO_LOWER);
        }

        $lowered = mb_strtolower($string, 'utf-8');

        // second argument false: do not trigger the autoloader
        if (class_exists("Normalizer", false)) {
            return Normalizer::normalize($lowered);
        }
        return $lowered;
    }
}
if(!function_exists('utf8_strtoupper')){
    /**
     * Unicode aware replacement for strtoupper()
     *
     * Delegates to mb_strtoupper() when mbstring is available, otherwise the
     * global $UTF8_LOWER_TO_UPPER lookup table is applied.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strtoupper()
     * @see    utf8_strtolower()
     *
     * @param string $string
     * @return string
     */
    function utf8_strtoupper($string){
        if (UTF8_MBSTRING) {
            return mb_strtoupper($string, 'utf-8');
        }

        global $UTF8_LOWER_TO_UPPER;
        $uppered = strtr($string, $UTF8_LOWER_TO_UPPER);
        return $uppered;
    }
}
if(!function_exists('utf8_ucfirst')){
    /**
     * UTF-8 aware alternative to ucfirst
     *
     * Uppercases the first character of a string and leaves the rest as is.
     *
     * @author Harry Fuecks
     *
     * @param string $str
     * @return string with first character as upper case (if applicable)
     */
    function utf8_ucfirst($str){
        $charCount = utf8_strlen($str);

        if ($charCount == 0) {
            return '';
        }
        if ($charCount == 1) {
            return utf8_strtoupper($str);
        }

        // split into the first character and the remainder
        preg_match('/^(.{1})(.*)$/us', $str, $matches);
        $first = $matches[1];
        $rest  = $matches[2];
        return utf8_strtoupper($first) . $rest;
    }
}
if(!function_exists('utf8_ucwords')){
    /**
     * UTF-8 aware alternative to ucwords
     *
     * Uppercases the first character of every word. Words are separated by
     * form feed, horizontal tab, vertical tab, line feed, carriage return or
     * space, following the word definition at http://www.php.net/ucwords
     *
     * @author Harry Fuecks
     * @see http://www.php.net/ucwords
     *
     * @param string $str
     * @return string with first char of each word uppercase
     */
    function utf8_ucwords($str) {
        // group 2: leading whitespace, group 3: first character of the word
        $wordPattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';

        $result = preg_replace_callback($wordPattern, 'utf8_ucwords_callback', $str);
        return $result;
    }

    /**
     * Callback used by utf8_ucwords() for each matched word
     *
     * Not meant to be called directly.
     *
     * @author Harry Fuecks
     * @see utf8_ucwords
     * @see utf8_strtoupper
     *
     * @param  array $matches matches corresponding to a single word
     * @return string with first char of the word in uppercase
     */
    function utf8_ucwords_callback($matches) {
        $whitespace = $matches[2];
        $word       = ltrim($matches[0]);
        $initial    = utf8_strtoupper($matches[3]);

        // swap the first character of the word for its uppercase form
        return $whitespace . utf8_substr_replace($word, $initial, 0, 1);
    }
}
if(!function_exists('utf8_deaccent')){
    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * $case selects which lookup tables are applied: -1 lower case letters
     * only, 1 upper case letters only, 0 (default) both.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int    $case
     * @return string
     */
    function utf8_deaccent($string,$case=0){
        $doLower = ($case <= 0);
        $doUpper = ($case >= 0);

        if ($doLower) {
            global $UTF8_LOWER_ACCENTS;
            $string = strtr($string, $UTF8_LOWER_ACCENTS);
        }

        if ($doUpper) {
            global $UTF8_UPPER_ACCENTS;
            $string = strtr($string, $UTF8_UPPER_ACCENTS);
        }

        return $string;
    }
}
if(!function_exists('utf8_romanize')){
    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched, everything else is run through
     * the global $UTF8_ROMANIZATION lookup table.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    function utf8_romanize($string){
        if (!utf8_isASCII($string)) {
            global $UTF8_ROMANIZATION;
            return strtr($string, $UTF8_ROMANIZATION);
        }

        // nothing to do for plain ASCII
        return $string;
    }
}
if(!function_exists('utf8_stripspecials')){
    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * Besides the characters from the global $UTF8_SPECIAL_CHARS2 the control
     * characters 0x00 to 0x19 are replaced as well. The quoted special chars
     * are computed on the first call and cached for later calls.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param  string $string     The UTF8 string to strip of special chars
     * @param  string $repl       Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class)
     * @return string
     */
    function utf8_stripspecials($string,$repl='',$additional=''){
        global $UTF8_SPECIAL_CHARS2;
        static $specials = null;

        if ($specials === null) {
            $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
        }

        // $additional goes into the character class verbatim
        $pattern = '/[' . $additional . '\x00-\x19' . $specials . ']/u';
        return preg_replace($pattern, $repl, $string);
    }
}
if(!function_exists('utf8_strpos')){
    /**
     * Unicode aware replacement for strpos()
     *
     * Searches with the byte based strpos() and converts the hit into a
     * character position. As long as that position lies before the requested
     * character offset, the search is repeated further to the right.
     *
     * @author Leo Feyer <leo@typolight.org>
     * @see    strpos()
     *
     * @param  string  $haystack
     * @param  string  $needle
     * @param  integer $offset
     * @return integer|false character position or false when not found
     */
    function utf8_strpos($haystack, $needle, $offset=0){
        // difference between byte and character position of the last hit
        $byteSurplus = 0;

        do {
            $bytePos = strpos($haystack, $needle, $offset + $byteSurplus);
            if ($bytePos === false) {
                return false;
            }

            $charPos = utf8_strlen(substr($haystack, 0, $bytePos));

            if ($charPos < $offset) {
                $byteSurplus = $bytePos - $charPos;
            }
        } while ($charPos < $offset);

        return $charPos;
    }
}
if(!function_exists('utf8_tohtml')){
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * ASCII stays as it is, code points below 0x100 become decimal entities
     * and everything above becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://www.php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @return string
     */
    function utf8_tohtml ($str) {
        $html       = '';
        $codepoints = utf8_to_unicode($str);

        foreach ($codepoints as $codepoint) {
            if ($codepoint >= 0x100) {
                $html .= '&#x' . dechex($codepoint) . ';';
            } elseif ($codepoint >= 0x80) {
                $html .= '&#' . $codepoint . ';';
            } else {
                $html .= chr($codepoint);
            }
        }

        return $html;
    }
}
if(!function_exists('utf8_unhtml')){
    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default ($entities is null) only numeric entities (&#..; and
     * &#x..;) are decoded. With any other value for $entities, named
     * entities such as &amp; or &lt; are decoded in the same pass. Doing both
     * in one pass avoids the double decoding that occurs when the two steps
     * are chained, e.g. for "&amp;#38;&#38;amp;#38;":
     *
     *   unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
     *   utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     *   what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string  $str      UTF-8 encoded string
     * @param  boolean $entities Flag controlling decoding of named entities.
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    function utf8_unhtml($str, $entities=null) {
        // the decoder with its lookup table is created once and reused
        static $decoder = null;
        if ($decoder === null) {
            $decoder = new utf8_entity_decoder();
        }

        if ($entities !== null) {
            // numeric and named entities
            return preg_replace_callback(
                '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                array(&$decoder, 'decode'),
                $str
            );
        }

        // numeric entities only
        return preg_replace_callback(
            '/(&#([Xx])?([0-9A-Za-z]+);)/m',
            'utf8_decode_numeric',
            $str
        );
    }
}
if(!function_exists('utf8_decode_numeric')){
    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Used as a preg_replace_callback() handler: index 2 of the match holds
     * the optional hex marker ('x' or 'X'), index 3 the digits.
     *
     * @param $ent string A numeric entity
     * @return string|false
     */
    function utf8_decode_numeric($ent) {
        $isHex = ($ent[2] == 'X' || $ent[2] == 'x');

        if ($isHex) {
            $codepoint = hexdec($ent[3]);
        } else {
            $codepoint = intval($ent[3]);
        }

        return unicode_to_utf8(array($codepoint));
    }
}
if(!class_exists('utf8_entity_decoder')){
    /**
     * Encapsulate HTML entity decoding tables
     */
    class utf8_entity_decoder {
        /** @var array maps a named entity like '&auml;' to its UTF-8 character */
        public $table;

        /**
         * Initializes the decoding tables
         *
         * The translation table is requested explicitly in UTF-8 and flipped
         * so that entities become the keys. Earlier code relied on the
         * default encoding of get_html_translation_table() and passed each
         * character through ord(); with UTF-8 being the default encoding that
         * only looked at the first byte of a multi-byte character and decoded
         * every non-ASCII named entity to the wrong character.
         */
        function __construct() {
            $charToEntity = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
            $this->table  = array_flip($charToEntity);
        }

        /**
         * Wrapper around unicode_to_utf8()
         *
         * Converts a single byte character (code point below 256) to UTF-8.
         * No longer used by the constructor, kept for callers of the class.
         *
         * @param string $c
         * @return string|false
         */
        function makeutf8($c) {
            return unicode_to_utf8(array(ord($c)));
        }

        /**
         * Decodes any HTML entity to it's correct UTF-8 char equivalent
         *
         * Expects a match array as produced by the pattern in utf8_unhtml():
         * index 0 is the whole entity, index 1 is '#' for numeric entities.
         * Unknown named entities are returned unchanged.
         *
         * @param string $ent An entity
         * @return string|false
         */
        function decode($ent) {
            if ($ent[1] == '#') {
                return utf8_decode_numeric($ent);
            }
            if (array_key_exists($ent[0], $this->table)) {
                return $this->table[$ent[0]];
            }
            return $ent[0];
        }
    }
}
if(!function_exists('utf8_to_unicode')){
    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. In non-strict mode bad lead bytes and unexpected
     * non-continuation bytes are skipped, and illegal code points (overlong
     * forms, surrogates, values above 0x10FFFF) are still added to the output.
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    unicode_to_utf8
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     *
     * @param  string  $str UTF-8 encoded string
     * @param  boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     */
    function utf8_to_unicode($str,$strict=false) {
        $mState = 0;     // expected number of octets after the current octet
                         // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0;     // Unicode character being assembled
        $mBytes = 1;     // expected number of octets in the current sequence

        $out = array();

        $len = strlen($str);

        for($i = 0; $i < $len; $i++) {

            // square bracket offset: the curly brace form no longer parses on PHP 8
            $in = ord($str[$i]);

            if ( $mState == 0) {

                // When mState is zero we expect either a US-ASCII character or a
                // multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;

                } else if (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;

                } else if (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;

                } else if (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;

                } else if (0xF8 == (0xFC & ($in))) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;

                } else if (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;

                } elseif($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                            'utf8_to_unicode: Illegal sequence identifier '.
                                'in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );
                    return false;

                }

            } else {

                // When mState is non-zero, we expect a continuation of the multi-octet
                // sequence
                if (0x80 == (0xC0 & ($in))) {

                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    /**
                     * End of the multi-octet sequence. mUcs4 now contains the final
                     * Unicode codepoint to be output
                     */
                    if (0 == --$mState) {

                        /*
                         * Check for illegal sequences and codepoints.
                         */
                        // From Unicode 3.1, non-shortest form is illegal
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters are illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF)) {

                            if($strict){
                                trigger_error(
                                        'utf8_to_unicode: Illegal sequence or codepoint '.
                                            'in UTF-8 at byte '.$i,
                                        E_USER_WARNING
                                    );

                                return false;
                            }

                        }

                        if (0xFEFF != $mUcs4) {
                            // BOM is legal but we don't want to output it
                            $out[] = $mUcs4;
                        }

                        //initialize UTF8 cache
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                    }

                } elseif($strict) {
                    /**
                     * A byte that is not a continuation byte arrived while a
                     * multi-octet sequence was still incomplete.
                     */
                    trigger_error(
                            'utf8_to_unicode: Incomplete multi-octet '.
                            '   sequence in UTF-8 at byte '.$i,
                            E_USER_WARNING
                        );

                    return false;
                }
            }
        }
        return $out;
    }
}
if(!function_exists('unicode_to_utf8')){
    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range and raises a PHP error at level E_USER_WARNING. In
     * non-strict mode such values are silently skipped.
     *
     * The result is assembled by plain string concatenation. An earlier
     * version echoed into an output buffer and left that buffer open whenever
     * strict mode returned false.
     *
     * @param  array $arr of unicode code points representing a string
     * @param  boolean $strict Check for invalid sequences?
     * @return string|false UTF-8 string or false if array contains invalid code points
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see    utf8_to_unicode
     * @link   http://hsivonen.iki.fi/php-utf8/
     * @link   http://sourceforge.net/projects/phputf8/
     */
    function unicode_to_utf8($arr,$strict=false) {
        if (!is_array($arr)) return '';

        $result = '';

        foreach ($arr as $k => $cp) {

            if ( ($cp >= 0) && ($cp <= 0x007f) ) {
                // ASCII range (including control chars)
                $result .= chr($cp);

            } else if ($cp <= 0x07ff) {
                // 2 byte sequence
                $result .= chr(0xc0 | ($cp >> 6));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp == 0xFEFF) {
                // Byte order mark: skipped on purpose

            } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {
                // surrogates are illegal in UTF-8
                if ($strict) {
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate '.
                            'at index: '.$k.', value: '.$cp,
                        E_USER_WARNING
                    );
                    return false;
                }

            } else if ($cp <= 0xffff) {
                // 3 byte sequence
                $result .= chr(0xe0 | ($cp >> 12));
                $result .= chr(0x80 | (($cp >> 6) & 0x003f));
                $result .= chr(0x80 | ($cp & 0x003f));

            } else if ($cp <= 0x10ffff) {
                // 4 byte sequence
                $result .= chr(0xf0 | ($cp >> 18));
                $result .= chr(0x80 | (($cp >> 12) & 0x3f));
                $result .= chr(0x80 | (($cp >> 6) & 0x3f));
                $result .= chr(0x80 | ($cp & 0x3f));

            } elseif ($strict) {
                // out of range
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range '.
                        'at index: '.$k.', value: '.$cp,
                    E_USER_WARNING
                );
                return false;
            }
        }

        return $result;
    }
}
if(!function_exists('utf8_to_utf16be')){
    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when mbstring is available. The fallback
     * decodes the string with utf8_to_unicode() and writes code points above
     * U+FFFF as surrogate pairs (they used to be truncated to 16 bit).
     *
     * @param string $str
     * @param bool $bom prepend the big endian byte order mark?
     * @return string
     */
    function utf8_to_utf16be(&$str, $bom = false) {
        $out = $bom ? "\xFE\xFF" : '';
        if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');

        $uni = utf8_to_unicode($str);
        foreach($uni as $cp){
            if ($cp > 0xFFFF && $cp <= 0x10FFFF) {
                // supplementary plane: split into high and low surrogate
                $offset = $cp - 0x10000;
                $out .= pack('n', 0xD800 | ($offset >> 10));
                $out .= pack('n', 0xDC00 | ($offset & 0x3FF));
            } else {
                $out .= pack('n',$cp);
            }
        }
        return $out;
    }
}
if(!function_exists('utf16be_to_utf8')){
    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The string is split into 16 bit big endian units. A high surrogate
     * directly followed by a low surrogate is combined into one code point
     * above U+FFFF; unpaired surrogates are handed on unchanged and are
     * therefore dropped by the non-strict unicode_to_utf8().
     *
     * @param string $str
     * @return false|string
     */
    function utf16be_to_utf8(&$str) {
        $units = unpack('n*',$str);
        if (!is_array($units)) return '';

        $uni  = array();
        $high = null;   // pending high surrogate, if any

        foreach ($units as $unit) {
            if ($high !== null) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $uni[] = 0x10000 + (($high - 0xD800) << 10) + ($unit - 0xDC00);
                    $high  = null;
                    continue;
                }
                // high surrogate without partner
                $uni[] = $high;
                $high  = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $high = $unit;
                continue;
            }

            $uni[] = $unit;
        }
        if ($high !== null) $uni[] = $high;

        return unicode_to_utf8($uni);
    }
}
if(!function_exists('utf8_bad_replace')){
    /**
     * Replace bad bytes with an alternative character
     *
     * The string is consumed piece by piece: every well formed UTF-8
     * character is copied, every other single byte is swapped for $replace.
     * An ASCII character is recommended as replacement.
     *
     * The pattern comes from the W3 FAQ: Multilingual Forms and was modified
     * to include the full ASCII range including control chars.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e.
     *                        bad bytes are removed) - use ASCII
     * @return string
     */
    function utf8_bad_replace($str, $replace = '') {
        // group 1: one well formed character or, via group 2, one stray byte
        $pattern = '/'.
            '([\x00-\x7F]'.                          # ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
            '|(.{1}))'.                              # invalid byte
            '/S';

        $clean = '';
        while (preg_match($pattern, $str, $piece)) {
            // group 2 only takes part in the match for an invalid byte
            $clean .= isset($piece[2]) ? $replace : $piece[0];

            // continue behind the piece just handled
            $str = substr($str, strlen($piece[0]));
        }

        return $clean;
    }
}
if(!function_exists('utf8_correctIdx')){
    /**
     * Move a byte index to the nearest UTF-8 character boundary
     *
     * An index pointing at a continuation byte (10bbbbbb) is moved until it
     * no longer does. Indexes at or below 0 yield 0, indexes at or beyond the
     * end of the string yield the string length.
     *
     * @param string $str   utf8 character string
     * @param int    $i     byte index into $str
     * @param bool   $next  direction to search for boundary,
     *                           false = towards the start (current character)
     *                           true = towards the end (next character)
     *
     * @return int byte index into $str now pointing to a utf8 character boundary
     *
     * @author       chris smith <chris@jalakai.co.uk>
     */
    function utf8_correctIdx(&$str,$i,$next=false) {
        if ($i <= 0) return 0;

        $byteLength = strlen($str);
        if ($i >= $byteLength) return $byteLength;

        $step = $next ? 1 : -1;

        // walk while still inside the string and on a continuation byte
        while ($i > 0 && $i < $byteLength && (ord($str[$i]) & 0xC0) == 0x80) {
            $i += $step;
        }

        return $i;
    }
}
1058 // only needed if no mb_string available
1059 if(!UTF8_MBSTRING){
1061 * UTF-8 Case lookup table
1063 * This lookuptable defines the upper case letters to their correspponding
1064 * lower case letter in UTF-8
1066 * @author Andreas Gohr <andi@splitbrain.org>
1068 global $UTF8_LOWER_TO_UPPER;
1069 if(empty($UTF8_LOWER_TO_UPPER)) $UTF8_LOWER_TO_UPPER = array(
1070 "z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T","s"=>"S","r"=>"R","q"=>"Q",
1071 "p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J","i"=>"I","h"=>"H","g"=>"G",
1072 "f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
1073 "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
1074 "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
1075 "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
1076 "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
1077 "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
1078 "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
1079 "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
1080 "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
1081 "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
1082 "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
1083 "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
1084 "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
1085 "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
1086 "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
1087 "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
1088 "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
1089 "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
1090 "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
1091 "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
1092 "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
1093 "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
1094 "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
1095 "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
1096 "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
1097 "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
1098 "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
1099 "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
1100 "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
1101 "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
1102 "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
1103 "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
1104 "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
1105 "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
1106 "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
1107 "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
1108 "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
1109 "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
1110 "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
1111 "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
1112 "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
1113 "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
1114 "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
1115 "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
1116 "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
1117 "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
1118 "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
1119 "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
1120 "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","dz"=>"Dz","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
1121 "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
1122 "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","nj"=>"Nj","lj"=>"Lj","dž"=>"Dž","ƿ"=>"Ƿ",
1123 "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
1124 "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
1125 "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
1126 "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
1127 "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
1128 "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ij"=>"IJ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
1129 "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
1130 "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
1131 "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
1132 "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
1133 "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
1134 "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
1135 "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
1136 "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
/**
 * UTF-8 case lookup table (upper case to lower case)
 *
 * This lookup table maps upper case letters to their corresponding
 * lower case letters in UTF-8.
 *
 * Some upper case letters occur more than once as a key because several
 * lower case variants share one capital. PHP keeps only the last value given
 * for a repeated key, so the regular lower case form has to be the last entry
 * for that key.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
global $UTF8_UPPER_TO_LOWER;
if(empty($UTF8_UPPER_TO_LOWER)) $UTF8_UPPER_TO_LOWER = array (
    "Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t","S"=>"s","R"=>"r","Q"=>"q",
    "P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j","I"=>"i","H"=>"h","G"=>"g",
    "F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
    "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
    "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
    "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
    "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
    "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
    "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
    "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
    "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
    "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
    "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
    "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
    "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
    "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
    "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
    "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
    "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
    "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
    "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
    "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
    "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
    "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
    "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
    "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
    "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
    "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
    "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
    "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
    "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
    "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
    "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
    "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
    "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
    "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
    "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
    "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
    "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
    "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
    "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
    "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
    "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
    // capital sigma maps to the regular small sigma; no final-sigma entry may follow it
    "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Ρ"=>"ρ","Π"=>"π",
    "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
    "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
    "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
    "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
    "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
    "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
    "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","Dz"=>"dz","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
    "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
    "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","Nj"=>"nj","Lj"=>"lj","Dž"=>"dž","Ƿ"=>"ƿ",
    "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
    "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
    "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
    "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
    "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
    "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","IJ"=>"ij","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
    "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
    "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
    "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
    "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
    "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
    // no micro sign entry here: capital mu must keep its mapping to small mu given above
    "Á"=>"á","À"=>"à","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t",
    "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
    "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
);
1217 }; // end of case lookup tables
/**
 * UTF-8 lookup table for lower case accented letters
 *
 * This lookup table maps accented lower case characters to replacements
 * from the ASCII-7 range. It holds lower case letters only. Some characters
 * are replaced by two ASCII letters (for example 'ß' => 'ss').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_LOWER_ACCENTS;
if(empty($UTF8_LOWER_ACCENTS)) $UTF8_LOWER_ACCENTS = array(
    'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
    'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
    'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
    'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
    'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
    'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
    'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
    'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
    'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
    'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
    'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
    'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
    'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
    'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
    'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
);
/**
 * UTF-8 lookup table for upper case accented letters
 *
 * This lookup table maps accented upper case characters to replacements
 * from the ASCII-7 range. It holds upper case letters only. Where a character
 * is replaced by two ASCII letters, only the first one is capitalised
 * (for example 'Ä' => 'Ae').
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_deaccent()
 */
global $UTF8_UPPER_ACCENTS;
if(empty($UTF8_UPPER_ACCENTS)) $UTF8_UPPER_ACCENTS = array(
    'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
    'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
    'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
    'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
    'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
    'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
    'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
    'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
    'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
    'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
    'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
    'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
    'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
    'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
    'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
);
/**
 * UTF-8 array of common special characters
 *
 * This array should contain all special characters (not a letter or digit)
 * defined in the various local charsets - it's not a complete list of
 * non-alphanumeric characters in UTF-8. It's not perfect but should match
 * most cases of special chars.
 *
 * Every entry is a Unicode code point, not a UTF-8 byte sequence.
 *
 * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
 * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    utf8_stripspecials()
 */
global $UTF8_SPECIAL_CHARS;
if(empty($UTF8_SPECIAL_CHARS)) $UTF8_SPECIAL_CHARS = array(
    0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
    0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002b, 0x002c,
    0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
    0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
    0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
    0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
    0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
    0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
    0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
    0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
    0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
    0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
    0x0385, 0x0387, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
    0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
    0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
    0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
    0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
    0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
    0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
    0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
    0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
    0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
    0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
    0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
    0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
    0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
    0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
    0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
    0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
    0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
    0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
    0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
    0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
    0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
    0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
    0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
    0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
    0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
    0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
    0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
    0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
    0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
    0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
    0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
    0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
    0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
    0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
    0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
    0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
    0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
    0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
    0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
    0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
    0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
    0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c,
    0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
    0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
    0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
    0xffeb, 0xffec, 0xffed, 0xffee,
    0x01d6fc, 0x01d6fd, 0x01d6fe, 0x01d6ff, 0x01d700, 0x01d701, 0x01d702, 0x01d703,
    0x01d704, 0x01d705, 0x01d706, 0x01d707, 0x01d708, 0x01d709, 0x01d70a, 0x01d70b,
    0x01d70c, 0x01d70d, 0x01d70e, 0x01d70f, 0x01d710, 0x01d711, 0x01d712, 0x01d713,
    0x01d714, 0x01d715, 0x01d716, 0x01d717, 0x01d718, 0x01d719, 0x01d71a, 0x01d71b,
    // Extra spaces and joiners, given as code points. They were formerly listed
    // as raw UTF-8 byte values (e.g. 0xe28087 for U+2007), which are not valid
    // code points. U+00A0 (bytes 0xc2a0) is already part of the list above.
    // U+2007 figure space, U+202F narrow no-break space, U+2060 word joiner,
    // U+FEFF zero width no-break space (byte order mark)
    0x2007, 0x202f, 0x2060, 0xfeff,
);
1357 // utf8 version of above data
1358 global $UTF8_SPECIAL_CHARS2;
1359 if(empty($UTF8_SPECIAL_CHARS2)) $UTF8_SPECIAL_CHARS2 =
1360 "\x1A".'\e\x1c\x1d\x1e\x1f !"#$%&\'()+,/;<=>?@[\]^`{|}~\x7f€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•�'.
1361 '�—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½�'.
1362 '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·ϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1363 '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1364 '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1365 '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1366 '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1367 '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1368 '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1369 '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1370 '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1371 '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1372 '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1373 '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1374 '➷➸➹➺➻➼➽➾'.
1375 ' 、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1376 '�'.
1377 '�ﹼﹽ'.
1378 '!"#$%&'()*+,-./:;<=>?@[\]^`{|}~'.
1379 '⦅⦆。「」、・¢£¬ ̄¦¥₩│←↑→↓■○'.
1380 '𝛼𝛽𝛾𝛿𝜀𝜁𝜂𝜃𝜄𝜅𝜆𝜇𝜈𝜉𝜊𝜋𝜌𝜍𝜎𝜏𝜐𝜑𝜒𝜓𝜔𝜕𝜖𝜗𝜘𝜙𝜚𝜛'.
1381 '   ⁠';
1384 * Romanization lookup table
1386 * This lookup table provides a way to transform strings written in languages
1387 * that are not based on Latin letters into plain ASCII.
1389 * Please note: this is not a scientific transliteration table. It only works
1390 * one way, from non-Latin to ASCII, and it works by simple character replacement
1391 * only. Special rules of individual languages are not supported.
1393 * @author Andreas Gohr <andi@splitbrain.org>
1394 * @author Vitaly Blokhin <vitinfo@vitn.com>
1395 * @link http://www.uconv.com/translit.htm
1396 * @author Bisqwit <bisqwit@iki.fi>
1397 * @link http://kanjidict.stc.cx/hiragana.php?src=2
1398 * @link http://www.translatum.gr/converter/greek-transliteration.htm
1399 * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1400 * @link http://www.btranslations.com/resources/romanization/korean.asp
1401 * @author Arthit Suriyawongkul <arthit@gmail.com>
1402 * @author Denis Scheither <amorphis@uni-bremen.de>
1403 * @author Eivind Morland <eivind.morland@gmail.com>
1405 global $UTF8_ROMANIZATION;
1406 if(empty($UTF8_ROMANIZATION)) $UTF8_ROMANIZATION = array(
1407 // scandinavian - differs from what we do in deaccent
1408 'å'=>'a','Å'=>'A','ä'=>'a','Ä'=>'A','ö'=>'o','Ö'=>'O',
1410 //russian cyrillic
1411 'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1412 'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1413 'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1414 'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1415 'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1416 'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1417 'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1418 'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1419 'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1420 // Ukrainian cyrillic
1421 'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1422 // Georgian
1423 'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1424 'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1425 'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1426 'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1427 'ჰ'=>'xh',
1428 //Sanskrit
1429 'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1430 'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1431 'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1432 'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1433 'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1434 'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1435 'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1436 //Sanskrit diacritics
1437 'Ā'=>'A','Ī'=>'I','Ū'=>'U','Ṛ'=>'R','Ṝ'=>'R','Ṅ'=>'N','Ñ'=>'N','Ṭ'=>'T',
1438 'Ḍ'=>'D','Ṇ'=>'N','Ś'=>'S','Ṣ'=>'S','Ṁ'=>'M','Ṃ'=>'M','Ḥ'=>'H','Ḷ'=>'L','Ḹ'=>'L',
1439 'ā'=>'a','ī'=>'i','ū'=>'u','ṛ'=>'r','ṝ'=>'r','ṅ'=>'n','ñ'=>'n','ṭ'=>'t',
1440 'ḍ'=>'d','ṇ'=>'n','ś'=>'s','ṣ'=>'s','ṁ'=>'m','ṃ'=>'m','ḥ'=>'h','ḷ'=>'l','ḹ'=>'l',
1441 //Hebrew
1442 'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1443 'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1444 'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1445 'ש'=>'sh','ת'=>'t',
1446 //Arabic
1447 'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1448 'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1449 'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1450 'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1452 // Japanese characters (last update: 2008-05-09)
1454 // Japanese hiragana
1456 // 3 character syllables, っ doubles the consonant after
1457 'っちゃ'=>'ccha','っちぇ'=>'cche','っちょ'=>'ccho','っちゅ'=>'cchu',
1458 'っびゃ'=>'bbya','っびぇ'=>'bbye','っびぃ'=>'bbyi','っびょ'=>'bbyo','っびゅ'=>'bbyu',
1459 'っぴゃ'=>'ppya','っぴぇ'=>'ppye','っぴぃ'=>'ppyi','っぴょ'=>'ppyo','っぴゅ'=>'ppyu',
1460 'っちゃ'=>'ccha','っちぇ'=>'cche','っち'=>'cchi','っちょ'=>'ccho','っちゅ'=>'cchu',
1461 // 'っひゃ'=>'hya','っひぇ'=>'hye','っひぃ'=>'hyi','っひょ'=>'hyo','っひゅ'=>'hyu',
1462 'っきゃ'=>'kkya','っきぇ'=>'kkye','っきぃ'=>'kkyi','っきょ'=>'kkyo','っきゅ'=>'kkyu',
1463 'っぎゃ'=>'ggya','っぎぇ'=>'ggye','っぎぃ'=>'ggyi','っぎょ'=>'ggyo','っぎゅ'=>'ggyu',
1464 'っみゃ'=>'mmya','っみぇ'=>'mmye','っみぃ'=>'mmyi','っみょ'=>'mmyo','っみゅ'=>'mmyu',
1465 'っにゃ'=>'nnya','っにぇ'=>'nnye','っにぃ'=>'nnyi','っにょ'=>'nnyo','っにゅ'=>'nnyu',
1466 'っりゃ'=>'rrya','っりぇ'=>'rrye','っりぃ'=>'rryi','っりょ'=>'rryo','っりゅ'=>'rryu',
1467 'っしゃ'=>'ssha','っしぇ'=>'sshe','っし'=>'sshi','っしょ'=>'ssho','っしゅ'=>'sshu',
1469 // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the apostrophe would be converted to _ anyway)
1470 'んあ'=>'n_a','んえ'=>'n_e','んい'=>'n_i','んお'=>'n_o','んう'=>'n_u',
1471 'んや'=>'n_ya','んよ'=>'n_yo','んゆ'=>'n_yu',
1473 // 2 character syllables - normal
1474 'ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1475 'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu',
1476 'ひゃ'=>'hya','ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu',
1477 'びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1478 'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu',
1479 'きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi','きょ'=>'kyo','きゅ'=>'kyu',
1480 'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu',
1481 'みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1482 'にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1483 'りゃ'=>'rya','りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu',
1484 'しゃ'=>'sha','しぇ'=>'she','し'=>'shi','しょ'=>'sho','しゅ'=>'shu',
1485 'じゃ'=>'ja','じぇ'=>'je','じょ'=>'jo','じゅ'=>'ju',
1486 'うぇ'=>'we','うぃ'=>'wi',
1487 'いぇ'=>'ye',
1489 // 2 character syllables, っ doubles the consonant after
1490 'っば'=>'bba','っべ'=>'bbe','っび'=>'bbi','っぼ'=>'bbo','っぶ'=>'bbu',
1491 'っぱ'=>'ppa','っぺ'=>'ppe','っぴ'=>'ppi','っぽ'=>'ppo','っぷ'=>'ppu',
1492 'った'=>'tta','って'=>'tte','っち'=>'cchi','っと'=>'tto','っつ'=>'ttsu',
1493 'っだ'=>'dda','っで'=>'dde','っぢ'=>'ddi','っど'=>'ddo','っづ'=>'ddu',
1494 'っが'=>'gga','っげ'=>'gge','っぎ'=>'ggi','っご'=>'ggo','っぐ'=>'ggu',
1495 'っか'=>'kka','っけ'=>'kke','っき'=>'kki','っこ'=>'kko','っく'=>'kku',
1496 'っま'=>'mma','っめ'=>'mme','っみ'=>'mmi','っも'=>'mmo','っむ'=>'mmu',
1497 'っな'=>'nna','っね'=>'nne','っに'=>'nni','っの'=>'nno','っぬ'=>'nnu',
1498 'っら'=>'rra','っれ'=>'rre','っり'=>'rri','っろ'=>'rro','っる'=>'rru',
1499 'っさ'=>'ssa','っせ'=>'sse','っし'=>'sshi','っそ'=>'sso','っす'=>'ssu',
1500 'っざ'=>'zza','っぜ'=>'zze','っじ'=>'jji','っぞ'=>'zzo','っず'=>'zzu',
1502 // 1 character syllables
1503 'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ん'=>'n',
1504 'は'=>'ha','へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'fu',
1505 'ば'=>'ba','べ'=>'be','び'=>'bi','ぼ'=>'bo','ぶ'=>'bu',
1506 'ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po','ぷ'=>'pu',
1507 'た'=>'ta','て'=>'te','ち'=>'chi','と'=>'to','つ'=>'tsu',
1508 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1509 'が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu',
1510 'か'=>'ka','け'=>'ke','き'=>'ki','こ'=>'ko','く'=>'ku',
1511 'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu',
1512 'な'=>'na','ね'=>'ne','に'=>'ni','の'=>'no','ぬ'=>'nu',
1513 'ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru',
1514 'さ'=>'sa','せ'=>'se','し'=>'shi','そ'=>'so','す'=>'su',
1515 'わ'=>'wa','を'=>'wo',
1516 'ざ'=>'za','ぜ'=>'ze','じ'=>'ji','ぞ'=>'zo','ず'=>'zu',
1517 'や'=>'ya','よ'=>'yo','ゆ'=>'yu',
1518 // old characters
1519 'ゑ'=>'we','ゐ'=>'wi',
1521 // convert what's left (probably only kicks in when something's missing above)
1522 // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
1523 // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
1525 // never seen one of those (disabled for the moment)
1526 // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
1527 // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
1528 // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
1529 // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
1530 // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
1531 // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1532 // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
1533 // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1534 // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
1535 // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
1536 // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
1537 // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
1538 // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
1539 // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
1541 // 'spare' characters from other romanization systems
1542 // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
1543 // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1544 // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
1545 // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
1546 //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
1547 //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
1548 //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
1549 //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
1550 //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
1551 //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
1554 // Japanese katakana
1556 // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before (usually written with macron, but we don't want that in our URLs)
1557 'ッビャー'=>'bbyaa','ッビェー'=>'bbyee','ッビィー'=>'bbyii','ッビョー'=>'bbyoo','ッビュー'=>'bbyuu',
1558 'ッピャー'=>'ppyaa','ッピェー'=>'ppyee','ッピィー'=>'ppyii','ッピョー'=>'ppyoo','ッピュー'=>'ppyuu',
1559 'ッキャー'=>'kkyaa','ッキェー'=>'kkyee','ッキィー'=>'kkyii','ッキョー'=>'kkyoo','ッキュー'=>'kkyuu',
1560 'ッギャー'=>'ggyaa','ッギェー'=>'ggyee','ッギィー'=>'ggyii','ッギョー'=>'ggyoo','ッギュー'=>'ggyuu',
1561 'ッミャー'=>'mmyaa','ッミェー'=>'mmyee','ッミィー'=>'mmyii','ッミョー'=>'mmyoo','ッミュー'=>'mmyuu',
1562 'ッニャー'=>'nnyaa','ッニェー'=>'nnyee','ッニィー'=>'nnyii','ッニョー'=>'nnyoo','ッニュー'=>'nnyuu',
1563 'ッリャー'=>'rryaa','ッリェー'=>'rryee','ッリィー'=>'rryii','ッリョー'=>'rryoo','ッリュー'=>'rryuu',
1564 'ッシャー'=>'sshaa','ッシェー'=>'sshee','ッシー'=>'sshii','ッショー'=>'sshoo','ッシュー'=>'sshuu',
1565 'ッチャー'=>'cchaa','ッチェー'=>'cchee','ッチー'=>'cchii','ッチョー'=>'cchoo','ッチュー'=>'cchuu',
1566 'ッティー'=>'ttii',
1567 'ッヂィー'=>'ddii',
1569 // 3 character syllables - doubled vowels
1570 'ファー'=>'faa','フェー'=>'fee','フィー'=>'fii','フォー'=>'foo',
1571 'フャー'=>'fyaa','フェー'=>'fyee','フィー'=>'fyii','フョー'=>'fyoo','フュー'=>'fyuu',
1572 'ヒャー'=>'hyaa','ヒェー'=>'hyee','ヒィー'=>'hyii','ヒョー'=>'hyoo','ヒュー'=>'hyuu',
1573 'ビャー'=>'byaa','ビェー'=>'byee','ビィー'=>'byii','ビョー'=>'byoo','ビュー'=>'byuu',
1574 'ピャー'=>'pyaa','ピェー'=>'pyee','ピィー'=>'pyii','ピョー'=>'pyoo','ピュー'=>'pyuu',
1575 'キャー'=>'kyaa','キェー'=>'kyee','キィー'=>'kyii','キョー'=>'kyoo','キュー'=>'kyuu',
1576 'ギャー'=>'gyaa','ギェー'=>'gyee','ギィー'=>'gyii','ギョー'=>'gyoo','ギュー'=>'gyuu',
1577 'ミャー'=>'myaa','ミェー'=>'myee','ミィー'=>'myii','ミョー'=>'myoo','ミュー'=>'myuu',
1578 'ニャー'=>'nyaa','ニェー'=>'nyee','ニィー'=>'nyii','ニョー'=>'nyoo','ニュー'=>'nyuu',
1579 'リャー'=>'ryaa','リェー'=>'ryee','リィー'=>'ryii','リョー'=>'ryoo','リュー'=>'ryuu',
1580 'シャー'=>'shaa','シェー'=>'shee','シー'=>'shii','ショー'=>'shoo','シュー'=>'shuu',
1581 'ジャー'=>'jaa','ジェー'=>'jee','ジー'=>'jii','ジョー'=>'joo','ジュー'=>'juu',
1582 'スァー'=>'swaa','スェー'=>'swee','スィー'=>'swii','スォー'=>'swoo','スゥー'=>'swuu',
1583 'デァー'=>'daa','デェー'=>'dee','ディー'=>'dii','デォー'=>'doo','デゥー'=>'duu',
1584 'チャー'=>'chaa','チェー'=>'chee','チー'=>'chii','チョー'=>'choo','チュー'=>'chuu',
1585 'ヂャー'=>'dyaa','ヂェー'=>'dyee','ヂィー'=>'dyii','ヂョー'=>'dyoo','ヂュー'=>'dyuu',
1586 'ツャー'=>'tsaa','ツェー'=>'tsee','ツィー'=>'tsii','ツョー'=>'tsoo','ツー'=>'tsuu',
1587 'トァー'=>'twaa','トェー'=>'twee','トィー'=>'twii','トォー'=>'twoo','トゥー'=>'twuu',
1588 'ドァー'=>'dwaa','ドェー'=>'dwee','ドィー'=>'dwii','ドォー'=>'dwoo','ドゥー'=>'dwuu',
1589 'ウァー'=>'whaa','ウェー'=>'whee','ウィー'=>'whii','ウォー'=>'whoo','ウゥー'=>'whuu',
1590 'ヴャー'=>'vyaa','ヴェー'=>'vyee','ヴィー'=>'vyii','ヴョー'=>'vyoo','ヴュー'=>'vyuu',
1591 'ヴァー'=>'vaa','ヴェー'=>'vee','ヴィー'=>'vii','ヴォー'=>'voo','ヴー'=>'vuu',
1592 'ウェー'=>'wee','ウィー'=>'wii',
1593 'イェー'=>'yee',
1594 'ティー'=>'tii',
1595 'ヂィー'=>'dii',
1597 // 3 character syllables - doubled consonants
1598 'ッビャ'=>'bbya','ッビェ'=>'bbye','ッビィ'=>'bbyi','ッビョ'=>'bbyo','ッビュ'=>'bbyu',
1599 'ッピャ'=>'ppya','ッピェ'=>'ppye','ッピィ'=>'ppyi','ッピョ'=>'ppyo','ッピュ'=>'ppyu',
1600 'ッキャ'=>'kkya','ッキェ'=>'kkye','ッキィ'=>'kkyi','ッキョ'=>'kkyo','ッキュ'=>'kkyu',
1601 'ッギャ'=>'ggya','ッギェ'=>'ggye','ッギィ'=>'ggyi','ッギョ'=>'ggyo','ッギュ'=>'ggyu',
1602 'ッミャ'=>'mmya','ッミェ'=>'mmye','ッミィ'=>'mmyi','ッミョ'=>'mmyo','ッミュ'=>'mmyu',
1603 'ッニャ'=>'nnya','ッニェ'=>'nnye','ッニィ'=>'nnyi','ッニョ'=>'nnyo','ッニュ'=>'nnyu',
1604 'ッリャ'=>'rrya','ッリェ'=>'rrye','ッリィ'=>'rryi','ッリョ'=>'rryo','ッリュ'=>'rryu',
1605 'ッシャ'=>'ssha','ッシェ'=>'sshe','ッシ'=>'sshi','ッショ'=>'ssho','ッシュ'=>'sshu',
1606 'ッチャ'=>'ccha','ッチェ'=>'cche','ッチ'=>'cchi','ッチョ'=>'ccho','ッチュ'=>'cchu',
1607 'ッティ'=>'tti',
1608 'ッヂィ'=>'ddi',
1610 // 3 character syllables - doubled vowel and consonants
1611 'ッバー'=>'bbaa','ッベー'=>'bbee','ッビー'=>'bbii','ッボー'=>'bboo','ッブー'=>'bbuu',
1612 'ッパー'=>'ppaa','ッペー'=>'ppee','ッピー'=>'ppii','ッポー'=>'ppoo','ップー'=>'ppuu',
1613 'ッケー'=>'kkee','ッキー'=>'kkii','ッコー'=>'kkoo','ックー'=>'kkuu','ッカー'=>'kkaa',
1614 'ッガー'=>'ggaa','ッゲー'=>'ggee','ッギー'=>'ggii','ッゴー'=>'ggoo','ッグー'=>'gguu',
1615 'ッマー'=>'maa','ッメー'=>'mee','ッミー'=>'mii','ッモー'=>'moo','ッムー'=>'muu',
1616 'ッナー'=>'nnaa','ッネー'=>'nnee','ッニー'=>'nnii','ッノー'=>'nnoo','ッヌー'=>'nnuu',
1617 'ッラー'=>'rraa','ッレー'=>'rree','ッリー'=>'rrii','ッロー'=>'rroo','ッルー'=>'rruu',
1618 'ッサー'=>'ssaa','ッセー'=>'ssee','ッシー'=>'sshii','ッソー'=>'ssoo','ッスー'=>'ssuu',
1619 'ッザー'=>'zzaa','ッゼー'=>'zzee','ッジー'=>'jjii','ッゾー'=>'zzoo','ッズー'=>'zzuu',
1620 'ッター'=>'ttaa','ッテー'=>'ttee','ッチー'=>'chii','ットー'=>'ttoo','ッツー'=>'ttsuu',
1621 'ッダー'=>'ddaa','ッデー'=>'ddee','ッヂー'=>'ddii','ッドー'=>'ddoo','ッヅー'=>'dduu',
1623 // 2 character syllables - normal
1624 'ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フゥ'=>'fu',
1625 // 'フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo','フュ'=>'fyu',
1626 'フャ'=>'fa','フェ'=>'fe','フィ'=>'fi','フョ'=>'fo','フュ'=>'fu',
1627 'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu',
1628 'ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo','ビュ'=>'byu',
1629 'ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1630 'キャ'=>'kya','キェ'=>'kye','キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu',
1631 'ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1632 'ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo','ミュ'=>'myu',
1633 'ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo','ニュ'=>'nyu',
1634 'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu',
1635 'シャ'=>'sha','シェ'=>'she','ショ'=>'sho','シュ'=>'shu',
1636 'ジャ'=>'ja','ジェ'=>'je','ジョ'=>'jo','ジュ'=>'ju',
1637 'スァ'=>'swa','スェ'=>'swe','スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu',
1638 'デァ'=>'da','デェ'=>'de','ディ'=>'di','デォ'=>'do','デゥ'=>'du',
1639 'チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1640 // 'ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi','ヂョ'=>'dyo','ヂュ'=>'dyu',
1641 'ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1642 'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu',
1643 'ドァ'=>'dwa','ドェ'=>'dwe','ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu',
1644 'ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi','ウォ'=>'who','ウゥ'=>'whu',
1645 'ヴャ'=>'vya','ヴェ'=>'vye','ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu',
1646 'ヴァ'=>'va','ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu',
1647 'ウェ'=>'we','ウィ'=>'wi',
1648 'イェ'=>'ye',
1649 'ティ'=>'ti',
1650 'ヂィ'=>'di',
1652 // 2 character syllables - doubled vowel
1653 'アー'=>'aa','エー'=>'ee','イー'=>'ii','オー'=>'oo','ウー'=>'uu',
1654 'ダー'=>'daa','デー'=>'dee','ヂー'=>'dii','ドー'=>'doo','ヅー'=>'duu',
1655 'ハー'=>'haa','ヘー'=>'hee','ヒー'=>'hii','ホー'=>'hoo','フー'=>'fuu',
1656 'バー'=>'baa','ベー'=>'bee','ビー'=>'bii','ボー'=>'boo','ブー'=>'buu',
1657 'パー'=>'paa','ペー'=>'pee','ピー'=>'pii','ポー'=>'poo','プー'=>'puu',
1658 'ケー'=>'kee','キー'=>'kii','コー'=>'koo','クー'=>'kuu','カー'=>'kaa',
1659 'ガー'=>'gaa','ゲー'=>'gee','ギー'=>'gii','ゴー'=>'goo','グー'=>'guu',
1660 'マー'=>'maa','メー'=>'mee','ミー'=>'mii','モー'=>'moo','ムー'=>'muu',
1661 'ナー'=>'naa','ネー'=>'nee','ニー'=>'nii','ノー'=>'noo','ヌー'=>'nuu',
1662 'ラー'=>'raa','レー'=>'ree','リー'=>'rii','ロー'=>'roo','ルー'=>'ruu',
1663 'サー'=>'saa','セー'=>'see','シー'=>'shii','ソー'=>'soo','スー'=>'suu',
1664 'ザー'=>'zaa','ゼー'=>'zee','ジー'=>'jii','ゾー'=>'zoo','ズー'=>'zuu',
1665 'ター'=>'taa','テー'=>'tee','チー'=>'chii','トー'=>'too','ツー'=>'tsuu',
1666 'ワー'=>'waa','ヲー'=>'woo',
1667 'ヤー'=>'yaa','ヨー'=>'yoo','ユー'=>'yuu',
1668 'ヵー'=>'kaa','ヶー'=>'kee',
1669 // old characters
1670 'ヱー'=>'wee','ヰー'=>'wii',
1672 // separate katakana 'n'
1673 'ンア'=>'n_a','ンエ'=>'n_e','ンイ'=>'n_i','ンオ'=>'n_o','ンウ'=>'n_u',
1674 'ンヤ'=>'n_ya','ンヨ'=>'n_yo','ンユ'=>'n_yu',
1676 // 2 character syllables - doubled consonants
1677 'ッバ'=>'bba','ッベ'=>'bbe','ッビ'=>'bbi','ッボ'=>'bbo','ッブ'=>'bbu',
1678 'ッパ'=>'ppa','ッペ'=>'ppe','ッピ'=>'ppi','ッポ'=>'ppo','ップ'=>'ppu',
1679 'ッケ'=>'kke','ッキ'=>'kki','ッコ'=>'kko','ック'=>'kku','ッカ'=>'kka',
1680 'ッガ'=>'gga','ッゲ'=>'gge','ッギ'=>'ggi','ッゴ'=>'ggo','ッグ'=>'ggu',
1681 'ッマ'=>'ma','ッメ'=>'me','ッミ'=>'mi','ッモ'=>'mo','ッム'=>'mu',
1682 'ッナ'=>'nna','ッネ'=>'nne','ッニ'=>'nni','ッノ'=>'nno','ッヌ'=>'nnu',
1683 'ッラ'=>'rra','ッレ'=>'rre','ッリ'=>'rri','ッロ'=>'rro','ッル'=>'rru',
1684 'ッサ'=>'ssa','ッセ'=>'sse','ッシ'=>'sshi','ッソ'=>'sso','ッス'=>'ssu',
1685 'ッザ'=>'zza','ッゼ'=>'zze','ッジ'=>'jji','ッゾ'=>'zzo','ッズ'=>'zzu',
1686 'ッタ'=>'tta','ッテ'=>'tte','ッチ'=>'cchi','ット'=>'tto','ッツ'=>'ttsu',
1687 'ッダ'=>'dda','ッデ'=>'dde','ッヂ'=>'ddi','ッド'=>'ddo','ッヅ'=>'ddu',
1689 // 1 character syllables
1690 'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','ン'=>'n',
1691 'ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho','フ'=>'fu',
1692 'バ'=>'ba','ベ'=>'be','ビ'=>'bi','ボ'=>'bo','ブ'=>'bu',
1693 'パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1694 'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','カ'=>'ka',
1695 'ガ'=>'ga','ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu',
1696 'マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu',
1697 'ナ'=>'na','ネ'=>'ne','ニ'=>'ni','ノ'=>'no','ヌ'=>'nu',
1698 'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru',
1699 'サ'=>'sa','セ'=>'se','シ'=>'shi','ソ'=>'so','ス'=>'su',
1700 'ザ'=>'za','ゼ'=>'ze','ジ'=>'ji','ゾ'=>'zo','ズ'=>'zu',
1701 'タ'=>'ta','テ'=>'te','チ'=>'chi','ト'=>'to','ツ'=>'tsu',
1702 'ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do','ヅ'=>'du',
1703 'ワ'=>'wa','ヲ'=>'wo',
1704 'ヤ'=>'ya','ヨ'=>'yo','ユ'=>'yu',
1705 'ヵ'=>'ka','ヶ'=>'ke',
1706 // old characters
1707 'ヱ'=>'we','ヰ'=>'wi',
1709 // convert what's left (probably only kicks in when something's missing above)
1710 'ァ'=>'a','ェ'=>'e','ィ'=>'i','ォ'=>'o','ゥ'=>'u',
1711 'ャ'=>'ya','ョ'=>'yo','ュ'=>'yu',
1713 // special characters
1714 '・'=>'_','、'=>'_',
1715 'ー'=>'_', // when used with hiragana (seldom), this character would not be converted otherwise
1717 // 'ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo','ル'=>'lu',
1718 // 'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu',
1719 //'デャ'=>'dha','デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu',
1720 // 'リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi','リョ'=>'lyo','リュ'=>'lyu',
1721 // 'テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho','テュ'=>'thu',
1722 //'ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi','フォ'=>'fwo','フゥ'=>'fwu',
1723 //'チャ'=>'tya','チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu',
1724 // 'ジャ'=>'jya','ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu',
1725 // 'ジャ'=>'zha','ジェ'=>'zhe','ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu',
1726 //'ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi','ジョ'=>'zyo','ジュ'=>'zyu',
1727 //'シャ'=>'sya','シェ'=>'sye','シィ'=>'syi','ショ'=>'syo','シュ'=>'syu',
1728 //'シ'=>'ci','フ'=>'hu',シ'=>'si','チ'=>'ti','ツ'=>'tu','イ'=>'yi','ヂ'=>'dzi',
1730 // "Greeklish"
1731 'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1732 'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1734 // Thai
1735 'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1736 'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1737 'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1738 'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1739 'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1740 'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1741 'ะ'=>'a','ั'=>'a','รร'=>'a','า'=>'a','ๅ'=>'a','ำ'=>'am','ํา'=>'am',
1742 'ิ'=>'i','ี'=>'i','ึ'=>'ue','ี'=>'ue','ุ'=>'u','ู'=>'u',
1743 'เ'=>'e','แ'=>'ae','โ'=>'o','อ'=>'o',
1744 'ียะ'=>'ia','ีย'=>'ia','ือะ'=>'uea','ือ'=>'uea','ัวะ'=>'ua','ัว'=>'ua',
1745 'ใ'=>'ai','ไ'=>'ai','ัย'=>'ai','าย'=>'ai','าว'=>'ao',
1746 'ุย'=>'ui','อย'=>'oi','ือย'=>'ueai','วย'=>'uai',
1747 'ิว'=>'io','็ว'=>'eo','ียว'=>'iao',
1748 '่'=>'','้'=>'','๊'=>'','๋'=>'','็'=>'',
1749 '์'=>'','๎'=>'','ํ'=>'','ฺ'=>'',
1750 'ๆ'=>'2','๏'=>'o','ฯ'=>'-','๚'=>'-','๛'=>'-',
1751 '๐'=>'0','๑'=>'1','๒'=>'2','๓'=>'3','๔'=>'4',
1752 '๕'=>'5','๖'=>'6','๗'=>'7','๘'=>'8','๙'=>'9',
1754 // Korean
1755 'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1756 'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1757 'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1758 'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1759 'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1760 'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',