Merge pull request #4111 from dokuwiki-translate/lang_update_739_1700675130
[dokuwiki.git] / inc / Utf8 / Clean.php
blob434da7043c580391bf7849f4c5757f6ba434b0e3
1 <?php
3 namespace dokuwiki\Utf8;
5 /**
6 * Methods to assess and clean UTF-8 strings
7 */
8 class Clean
/**
 * Tell whether a string consists solely of 7-bit ASCII bytes
 *
 * An empty string counts as ASCII.
 *
 * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
 *
 * @param string $str the string to inspect
 * @return bool false if at least one byte outside 0x00-0x7F was found, true otherwise
 */
public static function isASCII($str)
{
    // a single byte outside the ASCII range is enough to disqualify the string
    $hasHighByte = (preg_match('/(?:[^\x00-\x7F])/', $str) === 1);

    return !$hasHighByte;
}
/**
 * Tries to detect if a string is in Unicode (UTF-8) encoding
 *
 * This is a structural check only: every lead byte has to be followed by the
 * number of 10bbbbbb continuation bytes it announces. Lead bytes announcing
 * up to five continuation bytes are accepted, and no check for overlong
 * forms or code point ranges is made.
 *
 * @author <bmorel@ssi.fr>
 * @link   http://php.net/manual/en/function.utf8-encode.php
 *
 * @param string $str
 * @return bool true if all byte sequences are well-formed, false otherwise
 */
public static function isUtf8($str)
{
    $total = strlen($str);
    $pos = 0;

    while ($pos < $total) {
        $lead = ord($str[$pos]);
        $pos++;

        // 0bbbbbbb - plain ASCII, nothing follows
        if ($lead < 0x80) {
            continue;
        }

        // work out how many continuation bytes the lead byte announces
        if (($lead & 0xE0) === 0xC0) {          // 110bbbbb
            $pending = 1;
        } elseif (($lead & 0xF0) === 0xE0) {    // 1110bbbb
            $pending = 2;
        } elseif (($lead & 0xF8) === 0xF0) {    // 11110bbb
            $pending = 3;
        } elseif (($lead & 0xFC) === 0xF8) {    // 111110bb
            $pending = 4;
        } elseif (($lead & 0xFE) === 0xFC) {    // 1111110b
            $pending = 5;
        } else {
            // stray continuation byte or 0xFE/0xFF: matches no model
            return false;
        }

        // each announced byte has to exist and to look like 10bbbbbb
        while ($pending > 0) {
            if ($pos === $total || (ord($str[$pos]) & 0xC0) !== 0x80) {
                return false;
            }
            $pos++;
            $pending--;
        }
    }

    return true;
}
/**
 * Strips all high byte chars
 *
 * Every byte with a value of 128 or above is dropped, so the result is a
 * pure ASCII7 string. Multi-byte characters vanish completely.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $str
 * @return string the input with only its 7-bit bytes left, in original order
 */
public static function strip($str)
{
    $kept = '';
    $total = strlen($str);

    for ($pos = 0; $pos < $total; $pos++) {
        $byte = $str[$pos];
        if (ord($byte) >= 128) {
            continue; // high byte, skip it
        }
        $kept .= $byte;
    }

    return $kept;
}
/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * On top of the characters supplied by Table::specialChars() the bytes
 * 0x00 to 0x19 are stripped as well.
 * NOTE(review): the range ends at 0x19, so 0x1A-0x1F are left alone - confirm
 * whether the full control range up to 0x1F was intended.
 *
 * $additional is placed into the regular expression's character class as is,
 * so the caller is responsible for escaping it.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $string     The UTF8 string to strip of special chars
 * @param string $repl       Replace special with this string
 * @param string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
public static function stripspecials($string, $repl = '', $additional = '')
{
    // the quoted special char list never changes, so build it only once
    static $quotedSpecials = null;
    if (null === $quotedSpecials) {
        $quotedSpecials = preg_quote(Table::specialChars(), '/');
    }

    $pattern = '/[' . $additional . '\x00-\x19' . $quotedSpecials . ']/u';

    return preg_replace($pattern, $repl, $string);
}
/**
 * Replace bad bytes with an alternative character
 *
 * ASCII character is recommended for replacement char
 *
 * PCRE Pattern to locate bad bytes in a UTF-8 string
 * Comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 *
 * The string is tokenised in a single pass: every token is either one
 * well-formed UTF-8 sequence, which is kept, or a single invalid byte, which
 * is swapped for $replace.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 *
 * @param string $str     to search
 * @param string $replace to replace bad bytes with (defaults to '', which drops them) - use ASCII
 * @return string the cleaned string; empty if the regular expression engine reported an error
 */
public static function replaceBadBytes($str, $replace = '')
{
    $UTF8_BAD =
        '([\x00-\x7F]' .                         // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]' .              // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]' .          // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .   // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]' .          // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .       // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}' .           // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .       // plane 16
        '|(.{1}))';                              // invalid byte

    // Every byte is matched by some alternative, so consecutive matches cover
    // the whole string. Doing this in one call avoids re-scanning and copying
    // the remainder for each token and needs no output buffering.
    $cleaned = preg_replace_callback(
        '/' . $UTF8_BAD . '/S',
        static function ($matches) use ($replace) {
            // group 2 only takes part in the match when the fallback
            // "invalid byte" alternative was used
            return isset($matches[2]) ? $replace : $matches[0];
        },
        $str
    );

    // preg_replace_callback() yields null when PCRE fails
    return ($cleaned === null) ? '' : $cleaned;
}
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * By default both lower and upper case letters are handled ($case = 0).
 * Pass a negative value to deaccent lower case letters only, or a positive
 * value to deaccent upper case letters only.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $string
 * @param int    $case   below 0: lower case only, above 0: upper case only, 0: both
 * @return string
 */
public static function deaccent($string, $case = 0)
{
    $doLower = ($case <= 0);
    $doUpper = ($case >= 0);

    if ($doLower) {
        $string = strtr($string, Table::lowerAccents());
    }

    if ($doUpper) {
        $string = strtr($string, Table::upperAccents());
    }

    return $string;
}
/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is handed back untouched; anything else is run through
 * the replacement table supplied by Table::romanization().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $string
 * @return string
 */
public static function romanize($string)
{
    if (!self::isASCII($string)) {
        $string = strtr($string, Table::romanization());
    }

    return $string;
}
/**
 * Adjust a byte index into a utf8 string to a utf8 character boundary
 *
 * An index pointing at a continuation byte (10bbbbbb) is moved until it no
 * longer does. Indexes at or below 0 give 0, indexes at or beyond the end of
 * the string give the string's byte length.
 *
 * @author chris smith <chris@jalakai.co.uk>
 *
 * @param string $str  utf8 character string
 * @param int    $i    byte index into $str
 * @param bool   $next direction to search for boundary:
 *                     false = move back to the start of the current character,
 *                     true = move forward to the start of the next character
 * @return int byte index into $str now pointing to a utf8 character boundary
 */
public static function correctIdx($str, $i, $next = false)
{
    if ($i <= 0) {
        return 0;
    }

    $limit = strlen($str);
    if ($i >= $limit) {
        return $limit;
    }

    if (!$next) {
        // walk backwards over continuation bytes to reach the lead byte
        while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
            $i--;
        }
        return $i;
    }

    // skip forward over the rest of the current character
    while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
        $i++;
    }
    return $i;
}