replaced iconv with mb_convert_encoding in comments
[kohana-core.git] / classes / kohana / utf8.php
blob72fba8966145558166f2eb5a063c1531d3aa6fff
1 <?php defined('SYSPATH') or die('No direct script access.');
2 /**
3 * A port of [phputf8](http://phputf8.sourceforge.net/) to a unified set
4 * of files. Provides multi-byte aware replacement string functions.
6 * For UTF-8 support to work correctly, the following requirements must be met:
8 * - PCRE needs to be compiled with UTF-8 support (--enable-utf8)
9 * - Support for [Unicode properties](http://php.net/manual/reference.pcre.pattern.modifiers.php)
10 * is highly recommended (--enable-unicode-properties)
11 * - UTF-8 conversion will be much more reliable if the
12 * [iconv extension](http://php.net/iconv) is loaded
13 * - The [mbstring extension](http://php.net/mbstring) is highly recommended,
14 * but must not be overloading string functions
16 * [!!] This file is licensed differently from the rest of Kohana. As a port of
17 * [phputf8](http://phputf8.sourceforge.net/), this file is released under the LGPL.
19 * @package Kohana
20 * @category Base
21 * @author Kohana Team
22 * @copyright (c) 2007-2012 Kohana Team
23 * @copyright (c) 2005 Harry Fuecks
24 * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt
26 class Kohana_UTF8 {
28 /**
29 * @var boolean Does the server support UTF-8 natively?
31 public static $server_utf8 = NULL;
33 /**
34 * @var array List of called methods that have had their required file included.
36 public static $called = array();
/**
 * Recursively cleans arrays, objects, and strings. Removes ASCII control
 * codes and re-encodes text in the requested charset, silently discarding
 * any byte sequences that are not valid in that charset.
 *
 *     UTF8::clean($_GET); // Clean GET data
 *
 * [!!] This method requires the [mbstring extension](http://php.net/mbstring).
 *
 * Keys are cleaned as well as values. When cleaning changes a key, the
 * cleaned value is stored under the cleaned key; the entry under the
 * original key is not removed. Objects are modified in place.
 *
 * @param   mixed   $var      variable to clean
 * @param   string  $charset  character set, defaults to Kohana::$charset
 * @return  mixed   the cleaned variable; values that are not arrays, objects or non-empty strings are returned unchanged
 * @uses    UTF8::strip_ascii_ctrl
 * @uses    UTF8::is_ascii
 */
public static function clean($var, $charset = NULL)
{
	if ( ! $charset)
	{
		// Use the application character set
		$charset = Kohana::$charset;
	}

	if (is_array($var) OR is_object($var))
	{
		// Array syntax only works on arrays and ArrayAccess objects; any
		// other object has to be written through its properties
		$as_property = (is_object($var) AND ! ($var instanceof ArrayAccess));

		foreach ($var as $key => $val)
		{
			// Recursion! The charset is passed along so nested data is
			// cleaned with the requested charset, not the application default
			$clean_key = UTF8::clean($key, $charset);
			$clean_val = UTF8::clean($val, $charset);

			if ($as_property)
			{
				$var->{$clean_key} = $clean_val;
			}
			else
			{
				$var[$clean_key] = $clean_val;
			}
		}
	}
	elseif (is_string($var) AND $var !== '')
	{
		// Remove control characters
		$var = UTF8::strip_ascii_ctrl($var);

		// mb_convert_encoding is expensive, so it is only used when the
		// string actually contains non-ASCII bytes
		if ( ! UTF8::is_ascii($var))
		{
			// Remember the current substitute character so it can be restored
			$mb_substitute_character = mb_substitute_character();

			// Drop illegal characters instead of replacing them with '?'
			mb_substitute_character('none');

			// Converting from the charset to itself strips invalid sequences
			$var = mb_convert_encoding($var, $charset, $charset);

			// Reset mb_substitute_character() back to the original setting
			mb_substitute_character($mb_substitute_character);
		}
	}

	return $var;
}
/**
 * Checks that the input consists solely of 7-bit ASCII bytes. Callers use
 * this to decide between native string functions and UTF-8 aware ones.
 *
 *     $ascii = UTF8::is_ascii($str);
 *
 * @param   mixed    $str  string or array of strings to check
 * @return  boolean  TRUE when no byte above 0x7F is present
 */
public static function is_ascii($str)
{
	// Arrays are joined so that one regex pass covers every element
	$subject = is_array($str) ? implode($str) : $str;

	// A single byte outside 0x00-0x7F is enough to fail the check
	$has_high_byte = preg_match('/[^\x00-\x7F]/S', $subject);

	return ! $has_high_byte;
}
/**
 * Removes device control codes in the ASCII range from a string. Tab
 * (0x09), line feed (0x0A) and carriage return (0x0D) are kept.
 *
 *     $str = UTF8::strip_ascii_ctrl($str);
 *
 * @param   string  $str  string to clean
 * @return  string
 */
public static function strip_ascii_ctrl($str)
{
	// Control bytes 0x00-0x1F except \t, \n and \r, plus DEL (0x7F)
	$control_codes = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S';

	return preg_replace($control_codes, '', $str);
}
/**
 * Removes every byte that is not 7-bit ASCII from a string.
 *
 *     $str = UTF8::strip_non_ascii($str);
 *
 * @param   string  $str  string to clean
 * @return  string
 */
public static function strip_non_ascii($str)
{
	// Runs of bytes above 0x7F are deleted in one replacement
	$high_bytes = '/[^\x00-\x7F]+/S';

	return preg_replace($high_bytes, '', $str);
}
/**
 * Replaces special/accented UTF-8 characters by ASCII-7 "equivalents".
 * The work is delegated to `_transliterate_to_ascii()`, whose file is
 * located with Kohana::find_file() and loaded on the first call.
 *
 *     $ascii = UTF8::transliterate_to_ascii($utf8);
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 * @param   string   $str   string to transliterate
 * @param   integer  $case  -1 lowercase only, +1 uppercase only, 0 both cases
 * @return  string
 */
public static function transliterate_to_ascii($str, $case = 0)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _transliterate_to_ascii($str, $case);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _transliterate_to_ascii($str, $case);
}
/**
 * Returns the length of the given string. This is a UTF8-aware version
 * of [strlen](http://php.net/strlen).
 *
 *     $length = UTF8::strlen($str);
 *
 * When UTF8::$server_utf8 is set, mb_strlen() is used with Kohana::$charset;
 * otherwise the lazily loaded `_strlen()` implementation is called.
 *
 * @param   string   $str  string being measured for length
 * @return  integer
 * @uses    UTF8::$server_utf8
 * @uses    Kohana::$charset
 */
public static function strlen($str)
{
	if ( ! UTF8::$server_utf8)
	{
		if ( ! isset(UTF8::$called[__FUNCTION__]))
		{
			// First call: load the fallback implementation and remember it
			require Kohana::find_file('utf8', __FUNCTION__);
			UTF8::$called[__FUNCTION__] = TRUE;
		}

		return _strlen($str);
	}

	// Native multi-byte support is available
	return mb_strlen($str, Kohana::$charset);
}
/**
 * Finds position of first occurrence of a UTF-8 string. This is a
 * UTF8-aware version of [strpos](http://php.net/strpos).
 *
 *     $position = UTF8::strpos($str, $search);
 *
 * When UTF8::$server_utf8 is set, mb_strpos() is used with Kohana::$charset;
 * otherwise the lazily loaded `_strpos()` implementation is called.
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string   $str     haystack
 * @param   string   $search  needle
 * @param   integer  $offset  offset from which character in haystack to start searching
 * @return  integer  position of needle
 * @return  boolean  FALSE if the needle is not found
 * @uses    UTF8::$server_utf8
 * @uses    Kohana::$charset
 */
public static function strpos($str, $search, $offset = 0)
{
	if ( ! UTF8::$server_utf8)
	{
		if ( ! isset(UTF8::$called[__FUNCTION__]))
		{
			// First call: load the fallback implementation and remember it
			require Kohana::find_file('utf8', __FUNCTION__);
			UTF8::$called[__FUNCTION__] = TRUE;
		}

		return _strpos($str, $search, $offset);
	}

	// Native multi-byte support is available
	return mb_strpos($str, $search, $offset, Kohana::$charset);
}
/**
 * Finds position of last occurrence of a char in a UTF-8 string. This is
 * a UTF8-aware version of [strrpos](http://php.net/strrpos).
 *
 *     $position = UTF8::strrpos($str, $search);
 *
 * When UTF8::$server_utf8 is set, mb_strrpos() is used with Kohana::$charset;
 * otherwise the lazily loaded `_strrpos()` implementation is called.
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string   $str     haystack
 * @param   string   $search  needle
 * @param   integer  $offset  offset from which character in haystack to start searching
 * @return  integer  position of needle
 * @return  boolean  FALSE if the needle is not found
 * @uses    UTF8::$server_utf8
 * @uses    Kohana::$charset
 */
public static function strrpos($str, $search, $offset = 0)
{
	if ( ! UTF8::$server_utf8)
	{
		if ( ! isset(UTF8::$called[__FUNCTION__]))
		{
			// First call: load the fallback implementation and remember it
			require Kohana::find_file('utf8', __FUNCTION__);
			UTF8::$called[__FUNCTION__] = TRUE;
		}

		return _strrpos($str, $search, $offset);
	}

	// Native multi-byte support is available
	return mb_strrpos($str, $search, $offset, Kohana::$charset);
}
/**
 * Returns part of a UTF-8 string. This is a UTF8-aware version
 * of [substr](http://php.net/substr).
 *
 *     $sub = UTF8::substr($str, $offset);
 *
 * When UTF8::$server_utf8 is set, mb_substr() is used with Kohana::$charset;
 * otherwise the lazily loaded `_substr()` implementation is called.
 *
 * @author  Chris Smith <chris@jalakai.co.uk>
 * @param   string   $str     input string
 * @param   integer  $offset  offset
 * @param   integer  $length  length limit, NULL for "up to the end of the string"
 * @return  string
 * @uses    UTF8::$server_utf8
 * @uses    Kohana::$charset
 */
public static function substr($str, $offset, $length = NULL)
{
	if (UTF8::$server_utf8)
	{
		if ($length === NULL)
		{
			// Measure the string in the same charset that is used to slice
			// it. Without the charset argument mb_strlen() counts in the
			// mbstring internal encoding, which may disagree with
			// Kohana::$charset and yield a length that is too short.
			$length = mb_strlen($str, Kohana::$charset);
		}

		return mb_substr($str, $offset, $length, Kohana::$charset);
	}

	if ( ! isset(UTF8::$called[__FUNCTION__]))
	{
		require Kohana::find_file('utf8', __FUNCTION__);

		// Function has been called
		UTF8::$called[__FUNCTION__] = TRUE;
	}

	return _substr($str, $offset, $length);
}
/**
 * Replaces text within a portion of a UTF-8 string. This is a UTF8-aware
 * version of [substr_replace](http://php.net/substr_replace). The work is
 * delegated to `_substr_replace()`, loaded on the first call.
 *
 *     $str = UTF8::substr_replace($str, $replacement, $offset);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string   $str          input string
 * @param   string   $replacement  replacement string
 * @param   integer  $offset       offset
 * @param   integer  $length       passed through to `_substr_replace()`, defaults to NULL
 * @return  string
 */
public static function substr_replace($str, $replacement, $offset, $length = NULL)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _substr_replace($str, $replacement, $offset, $length);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _substr_replace($str, $replacement, $offset, $length);
}
/**
 * Makes a UTF-8 string lowercase. This is a UTF8-aware version
 * of [strtolower](http://php.net/strtolower).
 *
 *     $str = UTF8::strtolower($str);
 *
 * When UTF8::$server_utf8 is set, mb_strtolower() is used with
 * Kohana::$charset; otherwise the lazily loaded `_strtolower()` is called.
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 * @param   string  $str  mixed case string
 * @return  string
 * @uses    UTF8::$server_utf8
 * @uses    Kohana::$charset
 */
public static function strtolower($str)
{
	if ( ! UTF8::$server_utf8)
	{
		if ( ! isset(UTF8::$called[__FUNCTION__]))
		{
			// First call: load the fallback implementation and remember it
			require Kohana::find_file('utf8', __FUNCTION__);
			UTF8::$called[__FUNCTION__] = TRUE;
		}

		return _strtolower($str);
	}

	// Native multi-byte support is available
	return mb_strtolower($str, Kohana::$charset);
}
/**
 * Makes a UTF-8 string uppercase. This is a UTF8-aware version
 * of [strtoupper](http://php.net/strtoupper).
 *
 *     $str = UTF8::strtoupper($str);
 *
 * When UTF8::$server_utf8 is set, mb_strtoupper() is used with
 * Kohana::$charset; otherwise the lazily loaded `_strtoupper()` is called.
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 * @param   string  $str  mixed case string
 * @return  string
 * @uses    UTF8::$server_utf8
 * @uses    Kohana::$charset
 */
public static function strtoupper($str)
{
	if ( ! UTF8::$server_utf8)
	{
		if ( ! isset(UTF8::$called[__FUNCTION__]))
		{
			// First call: load the fallback implementation and remember it
			require Kohana::find_file('utf8', __FUNCTION__);
			UTF8::$called[__FUNCTION__] = TRUE;
		}

		return _strtoupper($str);
	}

	// Native multi-byte support is available
	return mb_strtoupper($str, Kohana::$charset);
}
/**
 * Makes a UTF-8 string's first character uppercase. This is a UTF8-aware
 * version of [ucfirst](http://php.net/ucfirst). The work is delegated to
 * `_ucfirst()`, loaded on the first call.
 *
 *     $str = UTF8::ucfirst($str);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string  $str  mixed case string
 * @return  string
 */
public static function ucfirst($str)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _ucfirst($str);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _ucfirst($str);
}
/**
 * Makes the first character of every word in a UTF-8 string uppercase.
 * This is a UTF8-aware version of [ucwords](http://php.net/ucwords). The
 * work is delegated to `_ucwords()`, loaded on the first call.
 *
 *     $str = UTF8::ucwords($str);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string  $str  mixed case string
 * @return  string
 * @uses    UTF8::$called
 */
public static function ucwords($str)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _ucwords($str);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _ucwords($str);
}
/**
 * Case-insensitive UTF-8 string comparison. This is a UTF8-aware version
 * of [strcasecmp](http://php.net/strcasecmp). The work is delegated to
 * `_strcasecmp()`, loaded on the first call.
 *
 *     $compare = UTF8::strcasecmp($str1, $str2);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string   $str1  string to compare
 * @param   string   $str2  string to compare
 * @return  integer  less than 0 if str1 is less than str2
 * @return  integer  greater than 0 if str1 is greater than str2
 * @return  integer  0 if they are equal
 */
public static function strcasecmp($str1, $str2)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _strcasecmp($str1, $str2);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _strcasecmp($str1, $str2);
}
/**
 * Returns a string or an array with all occurrences of search in subject
 * (ignoring case) and replaced with the given replace value. This is a
 * UTF8-aware version of [str_ireplace](http://php.net/str_ireplace). The
 * work is delegated to `_str_ireplace()`, loaded on the first call.
 *
 * [!!] This function is very slow compared to the native version. Avoid
 * using it when possible.
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string|array  $search   text to replace
 * @param   string|array  $replace  replacement text
 * @param   string|array  $str      subject text
 * @param   integer       $count    number of matched and replaced needles will be returned via this parameter which is passed by reference
 * @return  string  if the input was a string
 * @return  array   if the input was an array
 */
public static function str_ireplace($search, $replace, $str, & $count = NULL)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _str_ireplace($search, $replace, $str, $count);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _str_ireplace($search, $replace, $str, $count);
}
/**
 * Case-insensitive UTF-8 version of strstr. Returns all of input string
 * from the first occurrence of needle to the end. This is a UTF8-aware
 * version of [stristr](http://php.net/stristr). The work is delegated to
 * `_stristr()`, loaded on the first call.
 *
 *     $found = UTF8::stristr($str, $search);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string  $str     input string
 * @param   string  $search  needle
 * @return  string  matched substring if found
 * @return  FALSE   if the substring was not found
 */
public static function stristr($str, $search)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _stristr($str, $search);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _stristr($str, $search);
}
/**
 * Finds the length of the initial segment matching mask. This is a
 * UTF8-aware version of [strspn](http://php.net/strspn). The work is
 * delegated to `_strspn()`, loaded on the first call.
 *
 *     $found = UTF8::strspn($str, $mask);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string   $str     input string
 * @param   string   $mask    mask for search
 * @param   integer  $offset  start position of the string to examine
 * @param   integer  $length  length of the string to examine
 * @return  integer  length of the initial segment that contains characters in the mask
 */
public static function strspn($str, $mask, $offset = NULL, $length = NULL)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _strspn($str, $mask, $offset, $length);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _strspn($str, $mask, $offset, $length);
}
/**
 * Finds the length of the initial segment not matching mask. This is a
 * UTF8-aware version of [strcspn](http://php.net/strcspn). The work is
 * delegated to `_strcspn()`, loaded on the first call.
 *
 *     $found = UTF8::strcspn($str, $mask);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string   $str     input string
 * @param   string   $mask    mask for search
 * @param   integer  $offset  start position of the string to examine
 * @param   integer  $length  length of the string to examine
 * @return  integer  length of the initial segment that contains characters not in the mask
 */
public static function strcspn($str, $mask, $offset = NULL, $length = NULL)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _strcspn($str, $mask, $offset, $length);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _strcspn($str, $mask, $offset, $length);
}
/**
 * Pads a UTF-8 string to a certain length with another string. This is a
 * UTF8-aware version of [str_pad](http://php.net/str_pad). The work is
 * delegated to `_str_pad()`, loaded on the first call.
 *
 *     $str = UTF8::str_pad($str, $length);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string   $str               input string
 * @param   integer  $final_str_length  desired string length after padding
 * @param   string   $pad_str           string to use as padding
 * @param   string   $pad_type          padding type: STR_PAD_RIGHT, STR_PAD_LEFT, or STR_PAD_BOTH
 * @return  string
 */
public static function str_pad($str, $final_str_length, $pad_str = ' ', $pad_type = STR_PAD_RIGHT)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _str_pad($str, $final_str_length, $pad_str, $pad_type);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _str_pad($str, $final_str_length, $pad_str, $pad_type);
}
/**
 * Converts a UTF-8 string to an array. This is a UTF8-aware version of
 * [str_split](http://php.net/str_split). The work is delegated to
 * `_str_split()`, loaded on the first call.
 *
 *     $array = UTF8::str_split($str);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string   $str           input string
 * @param   integer  $split_length  maximum length of each chunk
 * @return  array
 */
public static function str_split($str, $split_length = 1)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _str_split($str, $split_length);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _str_split($str, $split_length);
}
/**
 * Reverses a UTF-8 string. This is a UTF8-aware version of
 * [strrev](http://php.net/strrev). The work is delegated to `_strrev()`,
 * loaded on the first call.
 *
 *     $str = UTF8::strrev($str);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string  $str  string to be reversed
 * @return  string
 */
public static function strrev($str)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _strrev($str);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _strrev($str);
}
/**
 * Strips whitespace (or other UTF-8 characters) from the beginning and
 * end of a string. This is a UTF8-aware version of
 * [trim](http://php.net/trim). The work is delegated to `_trim()`, loaded
 * on the first call.
 *
 *     $str = UTF8::trim($str);
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 * @param   string  $str       input string
 * @param   string  $charlist  string of characters to remove
 * @return  string
 */
public static function trim($str, $charlist = NULL)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _trim($str, $charlist);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _trim($str, $charlist);
}
/**
 * Strips whitespace (or other UTF-8 characters) from the beginning of
 * a string. This is a UTF8-aware version of [ltrim](http://php.net/ltrim).
 * The work is delegated to `_ltrim()`, loaded on the first call.
 *
 *     $str = UTF8::ltrim($str);
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 * @param   string  $str       input string
 * @param   string  $charlist  string of characters to remove
 * @return  string
 */
public static function ltrim($str, $charlist = NULL)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _ltrim($str, $charlist);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _ltrim($str, $charlist);
}
/**
 * Strips whitespace (or other UTF-8 characters) from the end of a string.
 * This is a UTF8-aware version of [rtrim](http://php.net/rtrim). The work
 * is delegated to `_rtrim()`, loaded on the first call.
 *
 *     $str = UTF8::rtrim($str);
 *
 * @author  Andreas Gohr <andi@splitbrain.org>
 * @param   string  $str       input string
 * @param   string  $charlist  string of characters to remove
 * @return  string
 */
public static function rtrim($str, $charlist = NULL)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _rtrim($str, $charlist);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _rtrim($str, $charlist);
}
/**
 * Returns the unicode ordinal for a character. This is a UTF8-aware
 * version of [ord](http://php.net/ord). The work is delegated to `_ord()`,
 * loaded on the first call.
 *
 *     $digit = UTF8::ord($character);
 *
 * @author  Harry Fuecks <hfuecks@gmail.com>
 * @param   string   $chr  UTF-8 encoded character
 * @return  integer
 */
public static function ord($chr)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _ord($chr);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _ord($chr);
}
/**
 * Takes an UTF-8 string and returns an array of ints representing the Unicode characters.
 * Astral planes are supported i.e. the ints in the output can be > 0xFFFF.
 * Occurrences of the BOM are ignored. Surrogates are not allowed.
 * The work is delegated to `_to_unicode()`, loaded on the first call.
 *
 *     $array = UTF8::to_unicode($str);
 *
 * The Original Code is Mozilla Communicator client code.
 * The Initial Developer of the Original Code is Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
 * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see <http://hsivonen.iki.fi/php-utf8/>
 * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>
 *
 * @param   string  $str  UTF-8 encoded string
 * @return  array   unicode code points
 * @return  FALSE   if the string is invalid
 */
public static function to_unicode($str)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _to_unicode($str);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _to_unicode($str);
}
/**
 * Takes an array of ints representing the Unicode characters and returns a UTF-8 string.
 * Astral planes are supported i.e. the ints in the input can be > 0xFFFF.
 * Occurrences of the BOM are ignored. Surrogates are not allowed.
 * The work is delegated to `_from_unicode()`, loaded on the first call.
 *
 *     $str = UTF8::from_unicode($array);
 *
 * The Original Code is Mozilla Communicator client code.
 * The Initial Developer of the Original Code is Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998 the Initial Developer.
 * Ported to PHP by Henri Sivonen <hsivonen@iki.fi>, see <http://hsivonen.iki.fi/php-utf8/>
 * Slight modifications to fit with phputf8 library by Harry Fuecks <hfuecks@gmail.com>.
 *
 * @param   array    $arr  unicode code points representing a string
 * @return  string   utf8 string of characters
 * @return  boolean  FALSE if a code point cannot be found
 */
public static function from_unicode($arr)
{
	if (isset(UTF8::$called[__FUNCTION__]))
	{
		// Implementation was loaded by an earlier call
		return _from_unicode($arr);
	}

	// First call: load the implementation and remember that it is available
	require Kohana::find_file('utf8', __FUNCTION__);
	UTF8::$called[__FUNCTION__] = TRUE;

	return _from_unicode($arr);
}
766 } // End UTF8
// Only fill in the flag when nothing has assigned it yet, so a value set
// before this point is left alone
if (NULL === Kohana_UTF8::$server_utf8)
{
	// Native UTF-8 handling is considered available when mbstring is loaded
	Kohana_UTF8::$server_utf8 = extension_loaded('mbstring');
}