3 namespace dokuwiki\Utf8
;
6 * Methods to assess and clean UTF-8 strings
/**
 * Tell whether a string is made up of 7bit ASCII bytes only
 *
 * The string counts as ASCII unless the pattern search positively reports
 * a byte outside the 0x00-0x7F range.
 *
 * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
 *
 * @param string $str the string to inspect
 * @return bool false if a byte above 0x7F was found, true otherwise
 */
public static function isASCII($str)
{
    // one byte outside 0x00-0x7F is enough to disqualify the string
    $foundHighByte = preg_match('/(?:[^\x00-\x7F])/', $str) === 1;

    return !$foundHighByte;
}
24 * Tries to detect if a string is in UTF-8 encoding
26 * @author <bmorel@ssi.fr>
27 * @link http://php.net/manual/en/function.utf8-encode.php
32 public static function isUtf8($str)
35 for ($i = 0; $i < $len; $i++
) {
37 if ($b < 0x80) continue; # 0bbbbbbb
38 elseif (($b & 0xE0) === 0xC0) $n = 1; # 110bbbbb
39 elseif (($b & 0xF0) === 0xE0) $n = 2; # 1110bbbb
40 elseif (($b & 0xF8) === 0xF0) $n = 3; # 11110bbb
41 elseif (($b & 0xFC) === 0xF8) $n = 4; # 111110bb
42 elseif (($b & 0xFE) === 0xFC) $n = 5; # 1111110b
43 else return false; # Does not match any model
45 for ($j = 0; $j < $n; $j++
) { # n bytes matching 10bbbbbb follow ?
46 if ((++
$i === $len) ||
((ord($str[$i]) & 0xC0) !== 0x80))
54 * Strips all high byte chars
56 * Returns a pure ASCII7 string
58 * @author Andreas Gohr <andi@splitbrain.org>
63 public static function strip($str)
67 for ($i = 0; $i < $len; $i++
) {
68 if (ord($str[$i]) < 128) {
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * The character class that gets replaced is assembled from three parts:
 * the caller supplied $additional, the control chars 0x00 to 0x19 and the
 * (regexp quoted) value returned by Table::specialChars().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $string The UTF8 string to strip of special chars
 * @param string $repl Replace special with this string
 * @param string $additional Additional chars to strip (used in regexp char class)
 * @return string|null the result of the preg_replace() call
 */
public static function stripspecials($string, $repl = '', $additional = '')
{
    // the quoted special chars are computed on the first call and reused afterwards
    static $quotedSpecials = null;
    if (null === $quotedSpecials) {
        $quotedSpecials = preg_quote(Table::specialChars(), '/');
    }

    // $additional is inserted verbatim, so it may contain char class syntax
    $pattern = '/[' . $additional . '\x00-\x19' . $quotedSpecials . ']/u';

    return preg_replace($pattern, $repl, $string);
}
99 * Replace bad bytes with an alternative character
101 * ASCII character is recommended for replacement char
103 * PCRE Pattern to locate bad bytes in a UTF-8 string
104 * Comes from W3 FAQ: Multilingual Forms
105 * Note: modified to include full ASCII range including control chars
107 * @author Harry Fuecks <hfuecks@gmail.com>
108 * @see http://www.w3.org/International/questions/qa-forms-utf-8
110 * @param string $str to search
111 * @param string $replace to replace bad bytes with (defaults to an empty string) - use ASCII
114 public static function replaceBadBytes($str, $replace = '')
117 '([\x00-\x7F]' . # ASCII (including control chars)
118 '|[\xC2-\xDF][\x80-\xBF]' . # non-overlong 2-byte
119 '|\xE0[\xA0-\xBF][\x80-\xBF]' . # excluding overlongs
120 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' . # straight 3-byte
121 '|\xED[\x80-\x9F][\x80-\xBF]' . # excluding surrogates
122 '|\xF0[\x90-\xBF][\x80-\xBF]{2}' . # planes 1-3
123 '|[\xF1-\xF3][\x80-\xBF]{3}' . # planes 4-15
124 '|\xF4[\x80-\x8F][\x80-\xBF]{2}' . # plane 16
125 '|(.{1}))'; # invalid byte
127 while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) {
128 if (!isset($matches[2])) {
133 $str = substr($str, strlen($matches[0]));
135 return ob_get_clean();
140 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
142 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
143 * letters. Default is to deaccent both cases ($case = 0)
145 * @author Andreas Gohr <andi@splitbrain.org>
147 * @param string $string
151 public static function deaccent($string, $case = 0)
154 $string = strtr($string, Table
::lowerAccents());
157 $string = strtr($string, Table
::upperAccents());
/**
 * Romanize a non-latin string
 *
 * Strings that pass the isASCII() check are handed back unchanged; all
 * others are translated with strtr() using Table::romanization().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $string
 * @return string
 */
public static function romanize($string)
{
    // pure ASCII input needs no translation
    return self::isASCII($string)
        ? $string
        : strtr($string, Table::romanization());
}
178 * adjust a byte index into a utf8 string to a utf8 character boundary
180 * @author chris smith <chris@jalakai.co.uk>
182 * @param string $str utf8 character string
183 * @param int $i byte index into $str
184 * @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
185 * @return int byte index into $str now pointing to a utf8 character boundary
187 public static function correctIdx($str, $i, $next = false)
190 if ($i <= 0) return 0;
192 $limit = strlen($str);
193 if ($i >= $limit) return $limit;
196 while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) $i++
;
198 while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) $i--;