lib/classes/text.php

   1 <?php
   2 // This file is part of Moodle - http://moodle.org/
   3 //
   4 // Moodle is free software: you can redistribute it and/or modify
   5 // it under the terms of the GNU General Public License as published by
   6 // the Free Software Foundation, either version 3 of the License, or
   7 // (at your option) any later version.
   8 //
   9 // Moodle is distributed in the hope that it will be useful,
  10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 // GNU General Public License for more details.
  13 //
  14 // You should have received a copy of the GNU General Public License
  15 // along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
  16
  17 /**
  18  * Defines string apis
  19  *
  20  * @package    core
  21  * @copyright  (C) 2001-3001 Eloy Lafuente (stronk7) {@link http://contiento.com}
  22  * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  23  */
  24
  25 defined('MOODLE_INTERNAL') || die();
  26
  27 /**
  28  * defines string api's for manipulating strings
  29  *
  30  * This class is used to manipulate strings under Moodle 1.6 an later. As
  31  * utf-8 text become mandatory a pool of safe functions under this encoding
  32  * become necessary. The name of the methods is exactly the
  33  * same than their PHP originals.
  34  *
  35  * A big part of this class acts as a wrapper over the Typo3 charset library,
  36  * really a cool group of utilities to handle texts and encoding conversion.
  37  *
  38  * Take a look to its own copyright and license details.
  39  *
  40  * IMPORTANT Note: Typo3 libraries always expect lowercase charsets to use 100%
  41  * its capabilities so, don't forget to make the conversion
  42  * from every wrapper function!
  43  *
  44  * @package   core
  45  * @category  string
  46  * @copyright 1999 onwards Martin Dougiamas  {@link http://moodle.com}
  47  * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  48  */
  49 class core_text {
  50
  51     /**
  52      * @var string[] Array of strings representing Unicode non-characters
  53      */
  54     protected static $noncharacters;
  55
  56     /**
  57      * Return t3lib helper class, which is used for conversion between charsets
  58      *
  59      * @param bool $reset
  60      * @return t3lib_cs
  61      */
  62     protected static function typo3($reset = false) {
  63         static $typo3cs = null;
  64
  65         if ($reset) {
  66             $typo3cs = null;
  67             return null;
  68         }
  69
  70         if (isset($typo3cs)) {
  71             return $typo3cs;
  72         }
  73
  74         global $CFG;
  75
  76         // Required files
  77         require_once($CFG->libdir.'/typo3/class.t3lib_cs.php');
  78         require_once($CFG->libdir.'/typo3/class.t3lib_div.php');
  79         require_once($CFG->libdir.'/typo3/interface.t3lib_singleton.php');
  80         require_once($CFG->libdir.'/typo3/class.t3lib_l10n_locales.php');
  81
  82         // do not use mbstring or recode because it may return invalid results in some corner cases
  83         $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] = 'iconv';
  84         $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] = 'iconv';
  85
  86         // Tell Typo3 we are curl enabled always (mandatory since 2.0)
  87         $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] = '1';
  88
  89         // And this directory must exist to allow Typo to cache conversion
  90         // tables when using internal functions
  91         make_temp_directory('typo3temp/cs');
  92
  93         // Make sure typo is using our dir permissions
  94         $GLOBALS['TYPO3_CONF_VARS']['BE']['folderCreateMask'] = decoct($CFG->directorypermissions);
  95
  96         // Default mask for Typo
  97         $GLOBALS['TYPO3_CONF_VARS']['BE']['fileCreateMask'] = decoct($CFG->filepermissions);
  98
  99         // This full path constants must be defined too, transforming backslashes
 100         // to forward slashed because Typo3 requires it.
 101         if (!defined('PATH_t3lib')) {
 102             define('PATH_t3lib', str_replace('\\','/',$CFG->libdir.'/typo3/'));
 103             define('PATH_typo3', str_replace('\\','/',$CFG->libdir.'/typo3/'));
 104             define('PATH_site', str_replace('\\','/',$CFG->tempdir.'/'));
 105             define('TYPO3_OS', stristr(PHP_OS,'win')&&!stristr(PHP_OS,'darwin')?'WIN':'');
 106         }
 107
 108         $typo3cs = new t3lib_cs();
 109
 110         return $typo3cs;
 111     }
 112
 113     /**
 114      * Reset internal textlib caches.
 115      * @static
 116      */
 117     public static function reset_caches() {
 118         self::typo3(true);
 119     }
 120
 121     /**
 122      * Standardise charset name
 123      *
 124      * Please note it does not mean the returned charset is actually supported.
 125      *
 126      * @static
 127      * @param string $charset raw charset name
 128      * @return string normalised lowercase charset name
 129      */
 130     public static function parse_charset($charset) {
 131         $charset = strtolower($charset);
 132
 133         // shortcuts so that we do not have to load typo3 on every page
 134
 135         if ($charset === 'utf8' or $charset === 'utf-8') {
 136             return 'utf-8';
 137         }
 138
 139         if (preg_match('/^(cp|win|windows)-?(12[0-9]{2})$/', $charset, $matches)) {
 140             return 'windows-'.$matches[2];
 141         }
 142
 143         if (preg_match('/^iso-8859-[0-9]+$/', $charset, $matches)) {
 144             return $charset;
 145         }
 146
 147         if ($charset === 'euc-jp') {
 148             return 'euc-jp';
 149         }
 150         if ($charset === 'iso-2022-jp') {
 151             return 'iso-2022-jp';
 152         }
 153         if ($charset === 'shift-jis' or $charset === 'shift_jis') {
 154             return 'shift_jis';
 155         }
 156         if ($charset === 'gb2312') {
 157             return 'gb2312';
 158         }
 159         if ($charset === 'gb18030') {
 160             return 'gb18030';
 161         }
 162
 163         // fallback to typo3
 164         return self::typo3()->parse_charset($charset);
 165     }
 166
 167     /**
 168      * Converts the text between different encodings. It uses iconv extension with //TRANSLIT parameter,
 169      * falls back to typo3. If both source and target are utf-8 it tries to fix invalid characters only.
 170      *
 171      * @param string $text
 172      * @param string $fromCS source encoding
 173      * @param string $toCS result encoding
 174      * @return string|bool converted string or false on error
 175      */
 176     public static function convert($text, $fromCS, $toCS='utf-8') {
 177         $fromCS = self::parse_charset($fromCS);
 178         $toCS   = self::parse_charset($toCS);
 179
 180         $text = (string)$text; // we can work only with strings
 181
 182         if ($text === '') {
 183             return '';
 184         }
 185
 186         if ($fromCS === 'utf-8') {
 187             $text = fix_utf8($text);
 188             if ($toCS === 'utf-8') {
 189                 return $text;
 190             }
 191         }
 192
 193         if ($toCS === 'ascii') {
 194             // Try to normalize the conversion a bit.
 195             $text = self::specialtoascii($text, $fromCS);
 196         }
 197
 198         // Prevent any error notices, do not use //IGNORE so that we get
 199         // consistent result from Typo3 if iconv fails.
 200         $result = @iconv($fromCS, $toCS.'//TRANSLIT', $text);
 201
 202         if ($result === false or $result === '') {
 203             // note: iconv is prone to return empty string when invalid char encountered, or false if encoding unsupported
 204             $oldlevel = error_reporting(E_PARSE);
 205             $result = self::typo3()->conv((string)$text, $fromCS, $toCS);
 206             error_reporting($oldlevel);
 207         }
 208
 209         return $result;
 210     }
 211
 212     /**
 213      * Multibyte safe substr() function, uses mbstring or iconv for UTF-8, falls back to typo3.
 214      *
 215      * @param string $text string to truncate
 216      * @param int $start negative value means from end
 217      * @param int $len maximum length of characters beginning from start
 218      * @param string $charset encoding of the text
 219      * @return string portion of string specified by the $start and $len
 220      */
 221     public static function substr($text, $start, $len=null, $charset='utf-8') {
 222         $charset = self::parse_charset($charset);
 223
 224         if ($charset === 'utf-8') {
 225             if (function_exists('mb_substr')) {
 226                 // this is much faster than iconv - see MDL-31142
 227                 if ($len === null) {
 228                     $oldcharset = mb_internal_encoding();
 229                     mb_internal_encoding('UTF-8');
 230                     $result = mb_substr($text, $start);
 231                     mb_internal_encoding($oldcharset);
 232                     return $result;
 233                 } else {
 234                     return mb_substr($text, $start, $len, 'UTF-8');
 235                 }
 236
 237             } else {
 238                 if ($len === null) {
 239                     $len = iconv_strlen($text, 'UTF-8');
 240                 }
 241                 return iconv_substr($text, $start, $len, 'UTF-8');
 242             }
 243         }
 244
 245         $oldlevel = error_reporting(E_PARSE);
 246         if ($len === null) {
 247             $result = self::typo3()->substr($charset, (string)$text, $start);
 248         } else {
 249             $result = self::typo3()->substr($charset, (string)$text, $start, $len);
 250         }
 251         error_reporting($oldlevel);
 252
 253         return $result;
 254     }
 255
 256     /**
 257      * Truncates a string to no more than a certain number of bytes in a multi-byte safe manner.
 258      * UTF-8 only!
 259      *
 260      * Many of the other charsets we test for (like ISO-2022-JP and EUC-JP) are not supported
 261      * by typo3, and will give invalid results, so we are supporting UTF-8 only.
 262      *
 263      * @param string $string String to truncate
 264      * @param int $bytes Maximum length of bytes in the result
 265      * @return string Portion of string specified by $bytes
 266      * @since Moodle 3.1
 267      */
 268     public static function str_max_bytes($string, $bytes) {
 269         if (function_exists('mb_strcut')) {
 270             return mb_strcut($string, 0, $bytes, 'UTF-8');
 271         }
 272
 273         $oldlevel = error_reporting(E_PARSE);
 274         $result = self::typo3()->strtrunc('utf-8', $string, $bytes);
 275         error_reporting($oldlevel);
 276
 277         return $result;
 278     }
 279
 280     /**
 281      * Finds the last occurrence of a character in a string within another.
 282      * UTF-8 ONLY safe mb_strrchr().
 283      *
 284      * @param string $haystack The string from which to get the last occurrence of needle.
 285      * @param string $needle The string to find in haystack.
 286      * @param boolean $part If true, returns the portion before needle, else return the portion after (including needle).
 287      * @return string|false False when not found.
 288      * @since Moodle 2.4.6, 2.5.2, 2.6
 289      */
 290     public static function strrchr($haystack, $needle, $part = false) {
 291
 292         if (function_exists('mb_strrchr')) {
 293             return mb_strrchr($haystack, $needle, $part, 'UTF-8');
 294         }
 295
 296         $pos = self::strrpos($haystack, $needle);
 297         if ($pos === false) {
 298             return false;
 299         }
 300
 301         $length = null;
 302         if ($part) {
 303             $length = $pos;
 304             $pos = 0;
 305         }
 306
 307         return self::substr($haystack, $pos, $length, 'utf-8');
 308     }
 309
 310     /**
 311      * Multibyte safe strlen() function, uses mbstring or iconv for UTF-8, falls back to typo3.
 312      *
 313      * @param string $text input string
 314      * @param string $charset encoding of the text
 315      * @return int number of characters
 316      */
 317     public static function strlen($text, $charset='utf-8') {
 318         $charset = self::parse_charset($charset);
 319
 320         if ($charset === 'utf-8') {
 321             if (function_exists('mb_strlen')) {
 322                 return mb_strlen($text, 'UTF-8');
 323             } else {
 324                 return iconv_strlen($text, 'UTF-8');
 325             }
 326         }
 327
 328         $oldlevel = error_reporting(E_PARSE);
 329         $result = self::typo3()->strlen($charset, (string)$text);
 330         error_reporting($oldlevel);
 331
 332         return $result;
 333     }
 334
 335     /**
 336      * Multibyte safe strtolower() function, uses mbstring, falls back to typo3.
 337      *
 338      * @param string $text input string
 339      * @param string $charset encoding of the text (may not work for all encodings)
 340      * @return string lower case text
 341      */
 342     public static function strtolower($text, $charset='utf-8') {
 343         $charset = self::parse_charset($charset);
 344
 345         if ($charset === 'utf-8' and function_exists('mb_strtolower')) {
 346             return mb_strtolower($text, 'UTF-8');
 347         }
 348
 349         $oldlevel = error_reporting(E_PARSE);
 350         $result = self::typo3()->conv_case($charset, (string)$text, 'toLower');
 351         error_reporting($oldlevel);
 352
 353         return $result;
 354     }
 355
 356     /**
 357      * Multibyte safe strtoupper() function, uses mbstring, falls back to typo3.
 358      *
 359      * @param string $text input string
 360      * @param string $charset encoding of the text (may not work for all encodings)
 361      * @return string upper case text
 362      */
 363     public static function strtoupper($text, $charset='utf-8') {
 364         $charset = self::parse_charset($charset);
 365
 366         if ($charset === 'utf-8' and function_exists('mb_strtoupper')) {
 367             return mb_strtoupper($text, 'UTF-8');
 368         }
 369
 370         $oldlevel = error_reporting(E_PARSE);
 371         $result = self::typo3()->conv_case($charset, (string)$text, 'toUpper');
 372         error_reporting($oldlevel);
 373
 374         return $result;
 375     }
 376
 377     /**
 378      * Find the position of the first occurrence of a substring in a string.
 379      * UTF-8 ONLY safe strpos(), uses mbstring, falls back to iconv.
 380      *
 381      * @param string $haystack the string to search in
 382      * @param string $needle one or more charachters to search for
 383      * @param int $offset offset from begining of string
 384      * @return int the numeric position of the first occurrence of needle in haystack.
 385      */
 386     public static function strpos($haystack, $needle, $offset=0) {
 387         if (function_exists('mb_strpos')) {
 388             return mb_strpos($haystack, $needle, $offset, 'UTF-8');
 389         } else {
 390             return iconv_strpos($haystack, $needle, $offset, 'UTF-8');
 391         }
 392     }
 393
 394     /**
 395      * Find the position of the last occurrence of a substring in a string
 396      * UTF-8 ONLY safe strrpos(), uses mbstring, falls back to iconv.
 397      *
 398      * @param string $haystack the string to search in
 399      * @param string $needle one or more charachters to search for
 400      * @return int the numeric position of the last occurrence of needle in haystack
 401      */
 402     public static function strrpos($haystack, $needle) {
 403         if (function_exists('mb_strrpos')) {
 404             return mb_strrpos($haystack, $needle, null, 'UTF-8');
 405         } else {
 406             return iconv_strrpos($haystack, $needle, 'UTF-8');
 407         }
 408     }
 409
 410     /**
 411      * Reverse UTF-8 multibytes character sets (used for RTL languages)
 412      * (We only do this because there is no mb_strrev or iconv_strrev)
 413      *
 414      * @param string $str the multibyte string to reverse
 415      * @return string the reversed multi byte string
 416      */
 417     public static function strrev($str) {
 418         preg_match_all('/./us', $str, $ar);
 419         return join('', array_reverse($ar[0]));
 420     }
 421
 422     /**
 423      * Try to convert upper unicode characters to plain ascii,
 424      * the returned string may contain unconverted unicode characters.
 425      *
 426      * @param string $text input string
 427      * @param string $charset encoding of the text
 428      * @return string converted ascii string
 429      */
 430     public static function specialtoascii($text, $charset='utf-8') {
 431         $charset = self::parse_charset($charset);
 432         $oldlevel = error_reporting(E_PARSE);
 433         $result = self::typo3()->specCharsToASCII($charset, (string)$text);
 434         error_reporting($oldlevel);
 435         return $result;
 436     }
 437
 438     /**
 439      * Generate a correct base64 encoded header to be used in MIME mail messages.
 440      * This function seems to be 100% compliant with RFC1342. Credits go to:
 441      * paravoid (http://www.php.net/manual/en/function.mb-encode-mimeheader.php#60283).
 442      *
 443      * @param string $text input string
 444      * @param string $charset encoding of the text
 445      * @return string base64 encoded header
 446      */
 447     public static function encode_mimeheader($text, $charset='utf-8') {
 448         if (empty($text)) {
 449             return (string)$text;
 450         }
 451         // Normalize charset
 452         $charset = self::parse_charset($charset);
 453         // If the text is pure ASCII, we don't need to encode it
 454         if (self::convert($text, $charset, 'ascii') == $text) {
 455             return $text;
 456         }
 457         // Although RFC says that line feed should be \r\n, it seems that
 458         // some mailers double convert \r, so we are going to use \n alone
 459         $linefeed="\n";
 460         // Define start and end of every chunk
 461         $start = "=?$charset?B?";
 462         $end = "?=";
 463         // Accumulate results
 464         $encoded = '';
 465         // Max line length is 75 (including start and end)
 466         $length = 75 - strlen($start) - strlen($end);
 467         // Multi-byte ratio
 468         $multilength = self::strlen($text, $charset);
 469         // Detect if strlen and friends supported
 470         if ($multilength === false) {
 471             if ($charset == 'GB18030' or $charset == 'gb18030') {
 472                 while (strlen($text)) {
 473                     // try to encode first 22 chars - we expect most chars are two bytes long
 474                     if (preg_match('/^(([\x00-\x7f])|([\x81-\xfe][\x40-\x7e])|([\x81-\xfe][\x80-\xfe])|([\x81-\xfe][\x30-\x39]..)){1,22}/m', $text, $matches)) {
 475                         $chunk = $matches[0];
 476                         $encchunk = base64_encode($chunk);
 477                         if (strlen($encchunk) > $length) {
 478                             // find first 11 chars - each char in 4 bytes - worst case scenario
 479                             preg_match('/^(([\x00-\x7f])|([\x81-\xfe][\x40-\x7e])|([\x81-\xfe][\x80-\xfe])|([\x81-\xfe][\x30-\x39]..)){1,11}/m', $text, $matches);
 480                             $chunk = $matches[0];
 481                             $encchunk = base64_encode($chunk);
 482                         }
 483                         $text = substr($text, strlen($chunk));
 484                         $encoded .= ' '.$start.$encchunk.$end.$linefeed;
 485                     } else {
 486                         break;
 487                     }
 488                 }
 489                 $encoded = trim($encoded);
 490                 return $encoded;
 491             } else {
 492                 return false;
 493             }
 494         }
 495         $ratio = $multilength / strlen($text);
 496         // Base64 ratio
 497         $magic = $avglength = floor(3 * $length * $ratio / 4);
 498         // basic infinite loop protection
 499         $maxiterations = strlen($text)*2;
 500         $iteration = 0;
 501         // Iterate over the string in magic chunks
 502         for ($i=0; $i <= $multilength; $i+=$magic) {
 503             if ($iteration++ > $maxiterations) {
 504                 return false; // probably infinite loop
 505             }
 506             $magic = $avglength;
 507             $offset = 0;
 508             // Ensure the chunk fits in length, reducing magic if necessary
 509             do {
 510                 $magic -= $offset;
 511                 $chunk = self::substr($text, $i, $magic, $charset);
 512                 $chunk = base64_encode($chunk);
 513                 $offset++;
 514             } while (strlen($chunk) > $length);
 515             // This chunk doesn't break any multi-byte char. Use it.
 516             if ($chunk)
 517                 $encoded .= ' '.$start.$chunk.$end.$linefeed;
 518         }
 519         // Strip the first space and the last linefeed
 520         $encoded = substr($encoded, 1, -strlen($linefeed));
 521
 522         return $encoded;
 523     }
 524
 525     /**
 526      * Returns HTML entity transliteration table.
 527      * @return array with (html entity => utf-8) elements
 528      */
 529     protected static function get_entities_table() {
 530         static $trans_tbl = null;
 531
 532         // Generate/create $trans_tbl
 533         if (!isset($trans_tbl)) {
 534             if (version_compare(phpversion(), '5.3.4') < 0) {
 535                 $trans_tbl = array();
 536                 foreach (get_html_translation_table(HTML_ENTITIES) as $val=>$key) {
 537                     $trans_tbl[$key] = self::convert($val, 'ISO-8859-1', 'utf-8');
 538                 }
 539
 540             } else if (version_compare(phpversion(), '5.4.0') < 0) {
 541                 $trans_tbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
 542                 $trans_tbl = array_flip($trans_tbl);
 543
 544             } else {
 545                 $trans_tbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8');
 546                 $trans_tbl = array_flip($trans_tbl);
 547             }
 548         }
 549
 550         return $trans_tbl;
 551     }
 552
 553     /**
 554      * Converts all the numeric entities &#nnnn; or &#xnnn; to UTF-8
 555      * Original from laurynas dot butkus at gmail at:
 556      * http://php.net/manual/en/function.html-entity-decode.php#75153
 557      * with some custom mods to provide more functionality
 558      *
 559      * @param string $str input string
 560      * @param boolean $htmlent convert also html entities (defaults to true)
 561      * @return string encoded UTF-8 string
 562      */
 563     public static function entities_to_utf8($str, $htmlent=true) {
 564         static $callback1 = null ;
 565         static $callback2 = null ;
 566
 567         if (!$callback1 or !$callback2) {
 568             $callback1 = function($matches) {
 569                 return core_text::code2utf8(hexdec($matches[1]));
 570             };
 571             $callback2 = function($matches) {
 572                 return core_text::code2utf8($matches[1]);
 573             };
 574         }
 575
 576         $result = (string)$str;
 577         $result = preg_replace_callback('/&#x([0-9a-f]+);/i', $callback1, $result);
 578         $result = preg_replace_callback('/&#([0-9]+);/', $callback2, $result);
 579
 580         // Replace literal entities (if desired)
 581         if ($htmlent) {
 582             $trans_tbl = self::get_entities_table();
 583             // It should be safe to search for ascii strings and replace them with utf-8 here.
 584             $result = strtr($result, $trans_tbl);
 585         }
 586         // Return utf8-ised string
 587         return $result;
 588     }
 589
 590     /**
 591      * Converts all Unicode chars > 127 to numeric entities &#nnnn; or &#xnnn;.
 592      *
 593      * @param string $str input string
 594      * @param boolean $dec output decadic only number entities
 595      * @param boolean $nonnum remove all non-numeric entities
 596      * @return string converted string
 597      */
 598     public static function utf8_to_entities($str, $dec=false, $nonnum=false) {
 599         static $callback = null ;
 600
 601         if ($nonnum) {
 602             $str = self::entities_to_utf8($str, true);
 603         }
 604
 605         // Avoid some notices from Typo3 code
 606         $oldlevel = error_reporting(E_PARSE);
 607         $result = self::typo3()->utf8_to_entities((string)$str);
 608         error_reporting($oldlevel);
 609
 610         if ($dec) {
 611             if (!$callback) {
 612                 $callback = function($matches) {
 613                     return '&#' . hexdec($matches[1]) . ';';
 614                 };
 615             }
 616             $result = preg_replace_callback('/&#x([0-9a-f]+);/i', $callback, $result);
 617         }
 618
 619         return $result;
 620     }
 621
 622     /**
 623      * Removes the BOM from unicode string {@link http://unicode.org/faq/utf_bom.html}
 624      *
 625      * @param string $str input string
 626      * @return string
 627      */
 628     public static function trim_utf8_bom($str) {
 629         $bom = "\xef\xbb\xbf";
 630         if (strpos($str, $bom) === 0) {
 631             return substr($str, strlen($bom));
 632         }
 633         return $str;
 634     }
 635
 636     /**
 637      * There are a number of Unicode non-characters including the byte-order mark (which may appear
 638      * multiple times in a string) and also other ranges. These can cause problems for some
 639      * processing.
 640      *
 641      * This function removes the characters using string replace, so that the rest of the string
 642      * remains unchanged.
 643      *
 644      * @param string $value Input string
 645      * @return string Cleaned string value
 646      * @since Moodle 3.5
 647      */
 648     public static function remove_unicode_non_characters($value) {
 649         // Set up list of all Unicode non-characters for fast replacing.
 650         if (!self::$noncharacters) {
 651             self::$noncharacters = [];
 652             // This list of characters is based on the Unicode standard. It includes the last two
 653             // characters of each code planes 0-16 inclusive...
 654             for ($plane = 0; $plane <= 16; $plane++) {
 655                 $base = ($plane === 0 ? '' : dechex($plane));
 656                 self::$noncharacters[] = html_entity_decode('&#x' . $base . 'fffe;');
 657                 self::$noncharacters[] = html_entity_decode('&#x' . $base . 'ffff;');
 658             }
 659             // ...And the character range U+FDD0 to U+FDEF.
 660             for ($char = 0xfdd0; $char <= 0xfdef; $char++) {
 661                 self::$noncharacters[] = html_entity_decode('&#x' . dechex($char) . ';');
 662             }
 663         }
 664
 665         // Do character replacement.
 666         return str_replace(self::$noncharacters, '', $value);
 667     }
 668
 669     /**
 670      * Returns encoding options for select boxes, utf-8 and platform encoding first
 671      *
 672      * @return array encodings
 673      */
 674     public static function get_encodings() {
 675         $encodings = array();
 676         $encodings['UTF-8'] = 'UTF-8';
 677         $winenc = strtoupper(get_string('localewincharset', 'langconfig'));
 678         if ($winenc != '') {
 679             $encodings[$winenc] = $winenc;
 680         }
 681         $nixenc = strtoupper(get_string('oldcharset', 'langconfig'));
 682         $encodings[$nixenc] = $nixenc;
 683
 684         foreach (self::typo3()->synonyms as $enc) {
 685             $enc = strtoupper($enc);
 686             $encodings[$enc] = $enc;
 687         }
 688         return $encodings;
 689     }
 690
 691     /**
 692      * Returns the utf8 string corresponding to the unicode value
 693      * (from php.net, courtesy - romans@void.lv)
 694      *
 695      * @param  int    $num one unicode value
 696      * @return string the UTF-8 char corresponding to the unicode value
 697      */
 698     public static function code2utf8($num) {
 699         if ($num < 128) {
 700             return chr($num);
 701         }
 702         if ($num < 2048) {
 703             return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
 704         }
 705         if ($num < 65536) {
 706             return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
 707         }
 708         if ($num < 2097152) {
 709             return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
 710         }
 711         return '';
 712     }
 713
 714     /**
 715      * Returns the code of the given UTF-8 character
 716      *
 717      * @param  string $utf8char one UTF-8 character
 718      * @return int    the code of the given character
 719      */
 720     public static function utf8ord($utf8char) {
 721         if ($utf8char == '') {
 722             return 0;
 723         }
 724         $ord0 = ord($utf8char{0});
 725         if ($ord0 >= 0 && $ord0 <= 127) {
 726             return $ord0;
 727         }
 728         $ord1 = ord($utf8char{1});
 729         if ($ord0 >= 192 && $ord0 <= 223) {
 730             return ($ord0 - 192) * 64 + ($ord1 - 128);
 731         }
 732         $ord2 = ord($utf8char{2});
 733         if ($ord0 >= 224 && $ord0 <= 239) {
 734             return ($ord0 - 224) * 4096 + ($ord1 - 128) * 64 + ($ord2 - 128);
 735         }
 736         $ord3 = ord($utf8char{3});
 737         if ($ord0 >= 240 && $ord0 <= 247) {
 738             return ($ord0 - 240) * 262144 + ($ord1 - 128 )* 4096 + ($ord2 - 128) * 64 + ($ord3 - 128);
 739         }
 740         return false;
 741     }
 742
 743     /**
 744      * Makes first letter of each word capital - words must be separated by spaces.
 745      * Use with care, this function does not work properly in many locales!!!
 746      *
 747      * @param string $text input string
 748      * @return string
 749      */
 750     public static function strtotitle($text) {
 751         if (empty($text)) {
 752             return $text;
 753         }
 754
 755         if (function_exists('mb_convert_case')) {
 756             return mb_convert_case($text, MB_CASE_TITLE, 'UTF-8');
 757         }
 758
 759         $text = self::strtolower($text);
 760         $words = explode(' ', $text);
 761         foreach ($words as $i=>$word) {
 762             $length = self::strlen($word);
 763             if (!$length) {
 764                 continue;
 765
 766             } else if ($length == 1) {
 767                 $words[$i] = self::strtoupper($word);
 768
 769             } else {
 770                 $letter = self::substr($word, 0, 1);
 771                 $letter = self::strtoupper($letter);
 772                 $rest   = self::substr($word, 1);
 773                 $words[$i] = $letter.$rest;
 774             }
 775         }
 776         return implode(' ', $words);
 777     }
 778 }