2 /** @package verysimple::String */
5 * A set of utility functions for working with strings
7 * @package verysimple::String
9 * @copyright 1997-2008 VerySimple, Inc.
10 * @license http://www.gnu.org/licenses/lgpl.html LGPL
13 class VerySimpleStringUtil
15 /** @var the character set used when converting non ascii characters */
16 static $DEFAULT_CHARACTER_SET = 'UTF-8';
18 /** @var list of fancy/smart quote characters plus emdash w/ generic replacements */
19 static $SMART_QUOTE_CHARS;
21 /** @var list of xml reserved characters */
22 static $XML_SPECIAL_CHARS;
24 /** @var associative array containing the html translation for special characters with their numeric equivalent */
25 static $HTML_ENTITIES_TABLE;
27 /** @var common characters, especially on windows systems, that are technically not valid */
28 static $INVALID_CODE_CHARS;
30 /** @var characters used as control characters such as escape, backspace, etc */
31 static $CONTROL_CODE_CHARS;
/**
 * Replace only the first occurrence of a substring within a string.
 *
 * @param string $s
 *          the substring to look for
 * @param string $r
 *          the text to put in its place
 * @param string $str
 *          the string to search within
 * @return string $str with the first occurrence of $s replaced by $r, or
 *         $str unchanged when $s is empty or does not occur in it
 */
static function ReplaceFirst($s, $r, $str)
{
    // an empty needle can never be located (and older PHP versions warn on it)
    if ((string) $s === '') {
        return $str;
    }

    $pos = strpos($str, $s);

    // strpos() returns false when nothing matches; using that value as an
    // offset would silently overwrite the first strlen($s) bytes of $str
    if ($pos === false) {
        return $str;
    }

    return substr($str, 0, $pos) . $r . substr($str, $pos + strlen($s));
}
53 * VerySimpleStringUtil::InitStaticVars(); is called at the bottom of this file
// Populates the static lookup tables declared on this class. It is invoked by
// the VerySimpleStringUtil::InitStaticVars() call at the bottom of this file.
55 static function InitStaticVars()
// HTML_ENTITIES_TABLE starts empty and then receives one entry per row of
// PHP's HTML_ENTITIES / ENT_QUOTES translation table: the key is the named
// entity and the value is the numeric form '&#N;' where N is ord() of the
// table's character.
// NOTE(review): ord() only looks at the first byte of $char, so N is a real
// code point only when the table's characters are single-byte - confirm which
// charset get_html_translation_table() uses on the target PHP version.
57 self
::$HTML_ENTITIES_TABLE = array ();
58 foreach (get_html_translation_table(HTML_ENTITIES
, ENT_QUOTES
) as $char => $entity) {
59 self
::$HTML_ENTITIES_TABLE [$entity] = '&#' . ord($char) . ';';
// Replacement map applied by ReplaceSmartQuotes().
62 self
::$SMART_QUOTE_CHARS = array (
// Replacement map applied by ReplaceControlCodeChars().
74 self
::$CONTROL_CODE_CHARS = array (
// Replacement map applied by ReplaceInvalidCodeChars(): each key is the single
// byte produced by chr() for a value in the 128-158 range and each value is
// the printable character that byte is replaced with.
104 self
::$INVALID_CODE_CHARS = array (
105 chr(128) => '€',
106 chr(130) => '‚',
107 chr(131) => 'ƒ',
108 chr(132) => '„',
109 chr(133) => '…',
110 chr(134) => '†',
111 chr(135) => '‡',
112 chr(136) => 'ˆ',
113 chr(137) => '‰',
114 chr(138) => 'Š',
115 chr(139) => '‹',
116 chr(140) => 'Œ',
117 chr(142) => 'Ž',
118 chr(145) => '‘',
119 chr(146) => '’',
120 chr(147) => '“',
121 chr(148) => '”',
122 chr(149) => '•',
123 chr(150) => '–',
124 chr(151) => '—',
125 chr(152) => '˜',
126 chr(153) => '™',
127 chr(154) => 'š',
128 chr(155) => '›',
129 chr(156) => 'œ',
130 chr(158) => 'ž',
// Replacement map applied by ReplaceXMLSpecialChars().
134 self
::$XML_SPECIAL_CHARS = array (
/**
 * Takes the given text and converts any email address into a mailto link,
 * returning HTML content.
 *
 * @param string $text
 *          the text to scan for email addresses
 * @param bool $sanitize
 *          true to pass the text through Sanitize() before it is parsed, for
 *          display security (default false)
 * @return string HTML
 */
static function ConvertEmailToMailTo($text, $sanitize = false)
{
    if ($sanitize) {
        $text = self::Sanitize($text);
    }

    // local part, "@", one or more dot-terminated host labels, then the TLD.
    // Allowing several labels keeps addresses such as bob@example.co.uk in one
    // link instead of cutting the link off after the second label.
    $regex = '/([a-z0-9_\-\.]+)@((?:[a-z0-9-]{1,64}\.)+[a-z]{2,10})/i';

    return preg_replace($regex, '<a href="mailto:\\1@\\2">\\1@\\2</a>', $text);
}
/**
 * Takes the given text and converts any URLs into links,
 * returning HTML content.
 *
 * @param string $text
 *          the text to scan for URLs
 * @param bool $sanitize
 *          true to pass the text through Sanitize() before it is parsed, for
 *          display security (default false)
 * @return string HTML
 */
static function ConvertUrlToLink($text, $sanitize = false)
{
    if ($sanitize) {
        $text = self::Sanitize($text);
    }

    // "~" is used as the delimiter because the pattern itself contains "/";
    // with "/" as delimiter PCRE stops at "://" and rejects the pattern.
    // A double quote ends the URL so the match cannot break out of the href
    // attribute it is copied into.
    $regex = '~[[:alpha:]]+://[^<>"[:space:]]+[[:alnum:]/]~i';

    // plain double quotes: inside a single-quoted PHP string \" would be
    // emitted as a literal backslash followed by the quote
    return preg_replace($regex, '<a href="\\0">\\0</a>', $text);
}
/**
 * Sanitize any text so that it can be safely displayed as HTML without
 * allowing XSS or other injection attacks.
 *
 * The flags and charset are passed explicitly so the result does not depend
 * on the PHP version's defaults: both quote styles are always escaped and
 * invalid byte sequences are substituted rather than causing an empty result.
 *
 * @param string $text
 *          the raw text
 * @return string the text with & < > " ' converted to HTML entities
 */
static function Sanitize($text)
{
    return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, self::$DEFAULT_CHARACTER_SET);
}
/**
 * Encodes a string to HTML entities, delegating to UTF8ToHTML() or
 * UTFToNamedHTML() depending on $numericEncodingOnly.
 *
 * @param string $string
 *          the value to encode
 * @param bool $numericEncodingOnly
 *          true to emit numeric entities only (UTF8ToHTML); false to use
 *          UTFToNamedHTML, which may be slower (default true)
 * @param bool $encodeControlCharacters
 *          only forwarded when $numericEncodingOnly is false: false = wipe
 *          control chars, true = encode them (default false)
 * @return string the encoded string; "" for empty input
 */
static function EncodeToHTML($string, $numericEncodingOnly = true, $encodeControlCharacters = false)
{
    // nothing to encode
    if (strlen($string) == 0) {
        return "";
    }

    if ($numericEncodingOnly) {
        return self::UTF8ToHtml($string);
    }

    return self::UTFToNamedHTML($string, $encodeControlCharacters);
}
/**
 * Decode a string that has been encoded using EncodeToHTML.
 * Can be helpful in combination with utf8_decode.
 *
 * @TODO: warning, this function is BETA!
 *
 * The decoding is done by mb_convert_encoding() with 'HTML-ENTITIES' as the
 * source encoding, which lets the caller choose the destination charset.
 * NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated in recent
 * mbstring releases - confirm against the PHP versions this must run on.
 *
 * @param string $string
 *          the entity-encoded text
 * @param string $charset
 *          destination character set (default = $DEFAULT_CHARACTER_SET (UTF-8))
 * @return string the decoded text in the destination character set
 */
static function DecodeFromHTML($string, $charset = null)
{
    // loose comparison on purpose: any value equal to null selects the default
    $destination = ($charset == null) ? self::$DEFAULT_CHARACTER_SET : $charset;

    return mb_convert_encoding($string, $destination, 'HTML-ENTITIES');
}
/**
 * HTML-encodes special characters and returns an ascii safe version.
 * This extends EncodeToHTML by optionally replacing smart quotes and escaping
 * the characters that are disruptive in HTML or XML data before encoding.
 *
 * @param string $string
 *          string value to parse
 * @param bool $escapeQuotes
 *          true to additionally escape the ENT_QUOTES characters <>&"'
 *          (default = true). Existing entities are not double-encoded.
 * @param bool $numericEncodingOnly
 *          set to true to only use numeric html encoding. warning, setting to
 *          false may be slower performance (default true)
 * @param bool $replaceSmartQuotes
 *          true to replace "smart quotes" with standard ascii ones, can be
 *          useful for stripping out windows-only codes (default = false)
 * @return string the encoded string; "" for empty input
 */
static function EncodeSpecialCharacters($string, $escapeQuotes = true, $numericEncodingOnly = true, $replaceSmartQuotes = false)
{
    if (strlen($string) == 0) {
        return "";
    }

    $result = $string;

    // do this first before encoding
    if ($replaceSmartQuotes) {
        $result = self::ReplaceSmartQuotes($result);
    }

    // does not double-encode, and replaces single-quote with a numeric entity.
    // The charset is named explicitly: passing null leaves the choice to
    // PHP's detection rules and is rejected as deprecated by newer versions.
    if ($escapeQuotes) {
        $result = htmlspecialchars($result, ENT_QUOTES, self::$DEFAULT_CHARACTER_SET, false);
    }

    return self::EncodeToHTML($result, $numericEncodingOnly);
}
/**
 * Converts a string into a character array.
 *
 * The split uses an empty pattern without the "u" modifier, so the string is
 * cut between every byte; empty pieces are discarded.
 *
 * @param string $string
 *          the string to split
 * @return array one element per byte of $string (empty array for "")
 */
static function GetCharArray($string)
{
    $noLimit = -1;
    $pieces = preg_split("//", $string, $noLimit, PREG_SPLIT_NO_EMPTY);

    return $pieces;
}
/**
 * Replaces XML special characters using the $XML_SPECIAL_CHARS map that
 * InitStaticVars() fills in.
 *
 * @param string $string
 *          the text to process
 * @return string the text after the strtr() translation
 */
static function ReplaceXMLSpecialChars($string)
{
    $replacements = self::$XML_SPECIAL_CHARS;

    return strtr($string, $replacements);
}
/**
 * Replaces smart (fancy) quote characters using the $SMART_QUOTE_CHARS map
 * that InitStaticVars() fills in.
 *
 * @param string $string
 *          the text to process
 * @return string the text after the strtr() translation
 */
static function ReplaceSmartQuotes($string)
{
    $replacements = self::$SMART_QUOTE_CHARS;

    return strtr($string, $replacements);
}
/**
 * Replaces control characters using the $CONTROL_CODE_CHARS map that
 * InitStaticVars() fills in.
 *
 * @param string $string
 *          the text to process
 * @return string the text after the strtr() translation
 */
static function ReplaceControlCodeChars($string)
{
    $replacements = self::$CONTROL_CODE_CHARS;

    return strtr($string, $replacements);
}
/**
 * Replaces all non-numeric (named) html entities with their numeric
 * equivalent, using the $HTML_ENTITIES_TABLE map (named entity => '&#N;')
 * that InitStaticVars() builds.
 *
 * @param string $string
 *          the text to process
 * @return string the text after the strtr() translation
 */
static function ReplaceNonNumericEntities($string)
{
    $replacements = self::$HTML_ENTITIES_TABLE;

    return strtr($string, $replacements);
}
/**
 * Replaces the illegal code values listed in $INVALID_CODE_CHARS (single
 * bytes built with chr()) with the characters that map assigns to them.
 *
 * @param string $string
 *          the text to process
 * @return string the text after the strtr() translation
 */
static function ReplaceInvalidCodeChars($string)
{
    $replacements = self::$INVALID_CODE_CHARS;

    return strtr($string, $replacements);
}
/**
 * This is the same as UTF8ToHTML except that two-byte characters go through
 * htmlentities(), which returns the named HTML code when possible
 * (ie &pound; &sect;, etc). Three- and four-byte characters are always
 * written as numeric entities.
 * It is preferable in all cases to use UTF8ToHTML instead unless you
 * absolutely have to have named entities.
 *
 * A multi-byte sequence that is cut off by the end of the string is dropped
 * instead of being read past the end of the buffer.
 *
 * @param string $string
 *          UTF-8 text to encode
 * @param bool $encodeControlCharacters
 *          false = wipe control chars. true = encode control characters
 *          (default false)
 * @return string the encoded text
 */
static function UTFToNamedHTML($string, $encodeControlCharacters = false)
{
    $utf8 = (string) $string;
    $length = strlen($utf8);
    $result = '';

    for ($i = 0; $i < $length; $i++) {
        $char = $utf8[$i];
        $ascii = ord($char);

        if ($ascii < 128) {
            // one-byte character
            $result .= $char;
            continue;
        }

        if ($ascii < 192) {
            // non-utf8 character or not a start byte
            $result .= ($encodeControlCharacters) ? htmlentities($char) : '';
            continue;
        }

        // the lead byte tells how many continuation bytes follow
        if ($ascii < 224) {
            $trailing = 1;
        } else if ($ascii < 240) {
            $trailing = 2;
        } else if ($ascii < 248) { // (TODO: should this be 245 or 248 ??)
            $trailing = 3;
        } else {
            // not a usable lead byte: skipped, as before
            continue;
        }

        // truncated sequence: everything left belongs to it, so stop here
        // rather than indexing past the end of the string
        if ($i + $trailing >= $length) {
            break;
        }

        if ($trailing === 1) {
            // two-byte character
            $encoded = htmlentities(substr($utf8, $i, 2), ENT_QUOTES, 'UTF-8');

            // @hack if htmlentities didn't encode it, then we need to do a charset conversion
            if ($encoded != '' && substr($encoded, 0, 1) != '&') {
                $encoded = mb_convert_encoding($encoded, 'HTML-ENTITIES', self::$DEFAULT_CHARACTER_SET);
            }

            $result .= $encoded;
        } else {
            // three- and four-byte characters: low four bits of the lead byte,
            // then six payload bits from every continuation byte
            $unicode = 15 & $ascii;
            for ($k = 1; $k <= $trailing; $k++) {
                $unicode = $unicode * 64 + (63 & ord($utf8[$i + $k]));
            }
            $result .= "&#$unicode;";
        }

        // skip the continuation bytes that were just consumed
        $i += $trailing;
    }

    return $result;
}
/**
 * Converts UTF-8 text into html entities: the text is split into characters
 * with unicode_string_to_array() and every character is passed through
 * unicode_entity_replace().
 *
 * @link http://www.php.net/manual/en/function.htmlentities.php#92105
 * @param string $content
 *          the UTF-8 text to encode
 * @return string the encoded text
 */
static function UTF8ToHTML($content = "")
{
    $encoded = "";

    foreach (self::unicode_string_to_array($content) as $character) {
        $encoded .= self::unicode_entity_replace($character);
    }

    // not really necessary, but why not.
    return mb_convert_encoding($encoded, "UTF-8");
}
/**
 * Takes a unicode string and turns it into an array of UTF-8 characters.
 *
 * Well-formed input is split in a single pass. Only when the input is not
 * valid UTF-8 (the unicode-aware split then fails) is the slower
 * character-by-character mb_substr() walk used.
 *
 * @param string $string
 *          the UTF-8 text to split
 * @return array one element per character (empty array for "")
 */
static function unicode_string_to_array($string)
{
    $string = (string) $string;

    if ($string === '') {
        return array();
    }

    // single pass; the previous loop re-sliced and re-measured the whole
    // remaining string for every character, which is quadratic on long input
    $characters = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);
    if ($characters !== false) {
        return $characters;
    }

    // preg_split() returns false for malformed UTF-8: fall back to mbstring
    $characters = array();
    $remaining = mb_strlen($string, "UTF-8");
    while ($remaining) {
        $characters[] = mb_substr($string, 0, 1, "UTF-8");
        $string = mb_substr($string, 1, $remaining, "UTF-8");
        $remaining = mb_strlen($string, "UTF-8");
    }

    return $characters;
}
/**
 * Uses binary math to replace a single UTF-8 character with its numeric
 * html entity.
 *
 * Characters whose first byte is below 0xC2 (plain ascii, or bytes that
 * cannot start a multi-byte sequence) are returned unchanged. A sequence that
 * is shorter than its lead byte announces, or whose lead byte is above 0xF4,
 * yields an empty string.
 *
 * @param string $c
 *          one UTF-8 character (one to four bytes)
 * @return string the character itself, its '&#N;' entity, or ''
 */
static function unicode_entity_replace($c)
{
    $c = (string) $c;

    if ($c === '') {
        return $c;
    }

    // square brackets: the curly-brace offset syntax ($c{0}) is no longer
    // accepted by current PHP versions
    $h = ord($c[0]);

    if ($h < 0xC2) { // <= 0x7F is ascii; 0x80-0xC1 is never a valid lead byte
        return $c;
    }

    if ($h <= 0xDF) { // 0xDF = 223, two-byte sequence
        if (strlen($c) < 2) {
            return '';
        }
        $code = ($h & 0x1F) << 6
            | (ord($c[1]) & 0x3F);
    } else if ($h <= 0xEF) { // 0xEF = 239, three-byte sequence
        if (strlen($c) < 3) {
            return '';
        }
        $code = ($h & 0x0F) << 12
            | (ord($c[1]) & 0x3F) << 6
            | (ord($c[2]) & 0x3F);
    } else if ($h <= 0xF4) { // 0xF4 = 244, four-byte sequence
        if (strlen($c) < 4) {
            return '';
        }
        // a four-byte lead byte carries three payload bits
        $code = ($h & 0x07) << 18
            | (ord($c[1]) & 0x3F) << 12
            | (ord($c[2]) & 0x3F) << 6
            | (ord($c[3]) & 0x3F);
    } else {
        // nothing above 0xF4 starts a valid UTF-8 sequence
        return '';
    }

    return "&#" . $code . ";";
}
/**
 * Used for decoding entities that started as UTF-8:
 * converts a numeric character code into the UTF-8 byte sequence for it.
 *
 * Codes 128-159 are treated as Windows-1252 bytes and remapped to the
 * characters they stand for; the positions Windows-1252 leaves unassigned
 * (129, 141, 143, 144, 157) become a non-breaking space (160).
 *
 * Declared static because it is invoked as self::chr_utf8() by
 * html_entity_replace().
 *
 * @link http://www.php.net/manual/en/function.html-entity-decode.php#68491
 * @param int $code
 *          the character code (decimal)
 * @return string|bool the UTF-8 encoded character, or false for a negative code
 */
static function chr_utf8($code)
{
    // Windows-1252 byte value => unicode code point
    static $windows1252 = array(
        128 => 8364, 130 => 8218, 131 => 402,  132 => 8222, 133 => 8230,
        134 => 8224, 135 => 8225, 136 => 710,  137 => 8240, 138 => 352,
        139 => 8249, 140 => 338,  142 => 381,  145 => 8216, 146 => 8217,
        147 => 8220, 148 => 8221, 149 => 8226, 150 => 8211, 151 => 8212,
        152 => 732,  153 => 8482, 154 => 353,  155 => 8250, 156 => 339,
        158 => 382,  159 => 376,
    );

    // callers pass the digits captured by a regex, so normalise to an integer
    $code = (int) $code;

    if ($code < 0) {
        return false;
    }

    if ($code < 128) {
        return chr($code);
    }

    if ($code < 160) { // Remove Windows Illegal Chars
        $code = isset($windows1252[$code]) ? $windows1252[$code] : 160; // 160 = not affected
    }

    if ($code < 2048) {
        // two-byte sequence
        return chr(192 | ($code >> 6))
            . chr(128 | ($code & 63));
    }

    if ($code < 65536) {
        // three-byte sequence
        return chr(224 | ($code >> 12))
            . chr(128 | (($code >> 6) & 63))
            . chr(128 | ($code & 63));
    }

    // four-byte sequence
    return chr(240 | ($code >> 18))
        . chr(128 | (($code >> 12) & 63))
        . chr(128 | (($code >> 6) & 63))
        . chr(128 | ($code & 63));
}
/**
 * Callback for preg_replace_callback('~&(#(x?))?([^;]+);~', 'html_entity_replace', $str);
 * used internally by decode.
 *
 * Declared static because it calls self::chr_utf8() and is registered as the
 * static callback 'self::html_entity_replace'.
 *
 * @link http://www.php.net/manual/en/function.html-entity-decode.php#68491
 * @param array $matches
 *          the match groups: [1] is "#" or "#x" for numeric entities, [2] is
 *          "x" for hexadecimal ones, [3] is the entity body
 * @return string|bool the decoded character, or false when no group is usable
 */
static function html_entity_replace($matches)
{
    // &#xHHHH; - hexadecimal numeric entity
    if ($matches[2]) {
        return self::chr_utf8(hexdec($matches[3]));
    }

    // &#DDDD; - decimal numeric entity
    if ($matches[1]) {
        return self::chr_utf8($matches[3]);
    }

    // &name; - named entity. The charset is given explicitly so the result is
    // in the same encoding that chr_utf8() produces on every PHP version;
    // ENT_COMPAT keeps the quote handling the call had without arguments.
    if ($matches[3]) {
        return html_entity_decode('&' . $matches[3] . ';', ENT_COMPAT, self::$DEFAULT_CHARACTER_SET);
    }

    return false;
}
// Fill the class's static lookup tables when this file is loaded, so the
// Replace*() helpers have their maps available.
591 // this will be executed only once
592 VerySimpleStringUtil
::InitStaticVars();