inc/Utf8/Clean.php

   1 <?php
   2
   3 namespace dokuwiki\Utf8;
   4
   5 /**
   6  * Methods to assess and clean UTF-8 strings
   7  */
   8 class Clean
   9 {
  10     /**
  11      * Checks if a string contains 7bit ASCII only
  12      *
  13      * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
  14      *
  15      * @param string $str
  16      * @return bool
  17      */
  18     public static function isASCII($str)
  19     {
  20         return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
  21     }
  22
  23     /**
  24      * Tries to detect if a string is in Unicode encoding
  25      *
  26      * @author <bmorel@ssi.fr>
  27      * @link   http://php.net/manual/en/function.utf8-encode.php
  28      *
  29      * @param string $str
  30      * @return bool
  31      */
  32     public static function isUtf8($str)
  33     {
  34         $len = strlen($str);
  35         for ($i = 0; $i < $len; $i++) {
  36             $b = ord($str[$i]);
  37             if ($b < 0x80) continue; # 0bbbbbbb
  38             elseif (($b & 0xE0) === 0xC0) $n = 1; # 110bbbbb
  39             elseif (($b & 0xF0) === 0xE0) $n = 2; # 1110bbbb
  40             elseif (($b & 0xF8) === 0xF0) $n = 3; # 11110bbb
  41             elseif (($b & 0xFC) === 0xF8) $n = 4; # 111110bb
  42             elseif (($b & 0xFE) === 0xFC) $n = 5; # 1111110b
  43             else return false; # Does not match any model
  44
  45             for ($j = 0; $j < $n; $j++) { # n bytes matching 10bbbbbb follow ?
  46                 if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80))
  47                     return false;
  48             }
  49         }
  50         return true;
  51     }
  52
  53     /**
  54      * Strips all high byte chars
  55      *
  56      * Returns a pure ASCII7 string
  57      *
  58      * @author Andreas Gohr <andi@splitbrain.org>
  59      *
  60      * @param string $str
  61      * @return string
  62      */
  63     public static function strip($str)
  64     {
  65         $ascii = '';
  66         $len = strlen($str);
  67         for ($i = 0; $i < $len; $i++) {
  68             if (ord($str[$i]) < 128) {
  69                 $ascii .= $str[$i];
  70             }
  71         }
  72         return $ascii;
  73     }
  74
  75     /**
  76      * Removes special characters (nonalphanumeric) from a UTF-8 string
  77      *
  78      * This function adds the controlchars 0x00 to 0x19 to the array of
  79      * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
  80      *
  81      * @author Andreas Gohr <andi@splitbrain.org>
  82      *
  83      * @param  string $string The UTF8 string to strip of special chars
  84      * @param  string $repl Replace special with this string
  85      * @param  string $additional Additional chars to strip (used in regexp char class)
  86      * @return string
  87      */
  88     public static function stripspecials($string, $repl = '', $additional = '')
  89     {
  90         static $specials = null;
  91         if ($specials === null) {
  92             $specials = preg_quote(Table::specialChars(), '/');
  93         }
  94
  95         return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
  96     }
  97
  98     /**
  99      * Replace bad bytes with an alternative character
 100      *
 101      * ASCII character is recommended for replacement char
 102      *
 103      * PCRE Pattern to locate bad bytes in a UTF-8 string
 104      * Comes from W3 FAQ: Multilingual Forms
 105      * Note: modified to include full ASCII range including control chars
 106      *
 107      * @author Harry Fuecks <hfuecks@gmail.com>
 108      * @see http://www.w3.org/International/questions/qa-forms-utf-8
 109      *
 110      * @param string $str to search
 111      * @param string $replace to replace bad bytes with (defaults to '?') - use ASCII
 112      * @return string
 113      */
 114     public static function replaceBadBytes($str, $replace = '')
 115     {
 116         $UTF8_BAD =
 117             '([\x00-\x7F]' .                          # ASCII (including control chars)
 118             '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
 119             '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
 120             '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
 121             '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
 122             '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
 123             '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
 124             '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
 125             '|(.{1}))';                               # invalid byte
 126         ob_start();
 127         while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) {
 128             if (!isset($matches[2])) {
 129                 echo $matches[0];
 130             } else {
 131                 echo $replace;
 132             }
 133             $str = substr($str, strlen($matches[0]));
 134         }
 135         return ob_get_clean();
 136     }
 137
 138
 139     /**
 140      * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 141      *
 142      * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 143      * letters. Default is to deaccent both cases ($case = 0)
 144      *
 145      * @author Andreas Gohr <andi@splitbrain.org>
 146      *
 147      * @param string $string
 148      * @param int $case
 149      * @return string
 150      */
 151     public static function deaccent($string, $case = 0)
 152     {
 153         if ($case <= 0) {
 154             $string = strtr($string, Table::lowerAccents());
 155         }
 156         if ($case >= 0) {
 157             $string = strtr($string, Table::upperAccents());
 158         }
 159         return $string;
 160     }
 161
 162     /**
 163      * Romanize a non-latin string
 164      *
 165      * @author Andreas Gohr <andi@splitbrain.org>
 166      *
 167      * @param string $string
 168      * @return string
 169      */
 170     public static function romanize($string)
 171     {
 172         if (self::isASCII($string)) return $string; //nothing to do
 173
 174         return strtr($string, Table::romanization());
 175     }
 176
 177     /**
 178      * adjust a byte index into a utf8 string to a utf8 character boundary
 179      *
 180      * @author       chris smith <chris@jalakai.co.uk>
 181      *
 182      * @param string $str utf8 character string
 183      * @param int $i byte index into $str
 184      * @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
 185      * @return int byte index into $str now pointing to a utf8 character boundary
 186      */
 187     public static function correctIdx($str, $i, $next = false)
 188     {
 189
 190         if ($i <= 0) return 0;
 191
 192         $limit = strlen($str);
 193         if ($i >= $limit) return $limit;
 194
 195         if ($next) {
 196             while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) $i++;
 197         } else {
 198             while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) $i--;
 199         }
 200
 201         return $i;
 202     }
 203 }