Merge pull request #4038 from dokuwiki/create-pull-request/patch
[dokuwiki.git] / inc / Utf8 / Conversion.php
blobfad9cd0b1b0d3678fe343ded9ad65f85eb24fed4
1 <?php
3 namespace dokuwiki\Utf8;
/**
 * Methods to convert from and to UTF-8 strings
 *
 * All methods are static. Code point <-> UTF-8 conversion is delegated to the
 * Unicode class of the same namespace.
 */
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied through unchanged unless $all is set.
     * Any other code point below 0x100 is written as a decimal entity (&#233;),
     * everything from 0x100 upwards as a hexadecimal entity (&#x20ac;).
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @param bool $all Encode the ASCII range (code points below 0x80) as entities as well
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be -> "&#38;&amp#38;"
     *
     * Malformed numeric references (e.g. "&#foo;") and unknown named entities
     * are left in the text unchanged.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $str UTF-8 encoded string
     * @param boolean $entities decode named entities in addition to numeric ones
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // groups: 1 = whole entity, 2 = hex marker, 3 = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        // groups: 1 = '#' for numeric entities, 2 = hex marker, 3 = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric entities are handed to decodeNumericEntity(); named entities are
     * looked up in PHP's HTML 4.01 entity table. Unknown names are returned
     * unchanged.
     *
     * @param string[] $ent Match array as produced by the second pattern in fromHtml()
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table (entity => UTF-8 character) once
        static $table = null;
        if ($table === null) {
            // Request the table in UTF-8 explicitly: its keys are then complete
            // (possibly multi-byte) UTF-8 characters and can be used as they are.
            // Running ord() over them would only look at the first byte.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * Expects the hex marker in $ent[2] and the digits in $ent[3], which is
     * the layout both patterns in fromHtml() produce. References whose digits
     * are not valid for their base are returned unchanged instead of being
     * turned into a NUL or control character.
     *
     * @param string[] $ent Match array of a numeric entity
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];
        $isHex = ($ent[2] === 'x' || $ent[2] === 'X');
        $allowed = $isHex ? '0123456789abcdefABCDEF' : '0123456789';

        // the patterns accept any alphanumerics, so validate before converting
        if (strspn($digits, $allowed) !== strlen($digits)) {
            return $ent[0];
        }

        $cp = $isHex ? hexdec($digits) : (int) $digits;
        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is truthy. Otherwise the
     * string is converted code point by code point; code points above U+FFFF
     * are written as surrogate pairs.
     *
     * @param string $str
     * @param bool $bom Prepend the big endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // supplementary plane: split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big endian 16 bit units. A high surrogate that is
     * directly followed by a low surrogate is combined into one code point;
     * all other units are passed on as they are.
     *
     * @param string $str
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if (!is_array($units)) {
            // unpack() failed; hand the value on untouched
            return Unicode::toUtf8($units);
        }

        $units = array_values($units);
        $count = count($units);
        $codepoints = [];
        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];
            $isPair = $unit >= 0xD800 && $unit <= 0xDBFF
                && $i + 1 < $count
                && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF;

            if ($isPair) {
                $codepoints[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed
            } else {
                $codepoints[] = $unit;
            }
        }

        return Unicode::toUtf8($codepoints);
    }
}