Highway to PSR2
[openemr.git] / portal / patient / fwk / libs / verysimple / String / VerySimpleStringUtil.php
blob4d1f73d32100f462609321098fde42b93e4d6dbf
1 <?php
2 /** @package verysimple::String */
/**
 * A set of utility functions for working with strings.
 *
 * @package verysimple::String
 * @author Jason Hinkle
 * @copyright 1997-2008 VerySimple, Inc.
 * @license http://www.gnu.org/licenses/lgpl.html LGPL
 * @version 1.0
 */
13 class VerySimpleStringUtil
/** @var string character set used when converting non-ASCII characters */
public static $DEFAULT_CHARACTER_SET = 'UTF-8';

/** @var array fancy/smart quote characters (plus em dash) mapped to generic replacements; filled by InitStaticVars() */
public static $SMART_QUOTE_CHARS;

/** @var array XML reserved characters mapped to their entities; filled by InitStaticVars() */
public static $XML_SPECIAL_CHARS;

/** @var array named HTML entities mapped to their numeric equivalents; filled by InitStaticVars() */
public static $HTML_ENTITIES_TABLE;

/** @var array common characters, especially on Windows systems, that are technically not valid, mapped to numeric entities; filled by InitStaticVars() */
public static $INVALID_CODE_CHARS;

/** @var array control characters (escape, backspace, etc.) mapped to numeric entities; filled by InitStaticVars() */
public static $CONTROL_CODE_CHARS;
/**
 * Replace only the first occurrence of a substring within a string.
 *
 * @param string $s   needle to search for
 * @param string $r   replacement text
 * @param string $str haystack to search in
 * @return string the haystack with the first match replaced, or the haystack
 *                unchanged when the needle does not occur in it
 */
public static function ReplaceFirst($s, $r, $str)
{
    $position = strpos($str, $s);

    // strpos() returns false when there is no match. Using that value as an
    // offset (false == 0) would overwrite the beginning of the haystack.
    if ($position === false) {
        return $str;
    }

    return substr_replace($str, $r, $position, strlen($s));
}
/**
 * Populates the static lookup tables of this class.
 *
 * VerySimpleStringUtil::InitStaticVars(); is called at the bottom of this file.
 *
 * @return void
 */
public static function InitStaticVars()
{
    // Named entity => numeric entity, e.g. "&eacute;" => "&#233;".
    // The translation table is requested as UTF-8 and every character is
    // decoded to its full code point: ord() only looks at the first byte, so
    // it yields the wrong number for any multi-byte character.
    self::$HTML_ENTITIES_TABLE = array();
    foreach (get_html_translation_table(HTML_ENTITIES, ENT_QUOTES, 'UTF-8') as $char => $entity) {
        $codePoint = unpack('N', mb_convert_encoding($char, 'UCS-4BE', 'UTF-8'));
        self::$HTML_ENTITIES_TABLE[$entity] = '&#' . $codePoint[1] . ';';
    }

    // Smart quotes and em dash => plain ASCII. The keys are spelled as byte
    // codes / escapes instead of raw literal characters so the table does not
    // depend on the encoding this file happens to be saved in.
    // NOTE(review): the single-byte keys are Windows-1252 codes; the same byte
    // values also occur inside other UTF-8 sequences - confirm callers only
    // pass text where that is acceptable.
    self::$SMART_QUOTE_CHARS = array(
        "\xE2\x80\x98" => "'",  // UTF-8 left single quote
        "\xE2\x80\x99" => "'",  // UTF-8 right single quote
        "\xE2\x80\x9C" => "\"", // UTF-8 left double quote
        "\xE2\x80\x9D" => "\"", // UTF-8 right double quote
        "\xE2\x80\x94" => "-",  // UTF-8 em dash
        chr(145) => "'",
        chr(146) => "'",
        chr(147) => "\"",
        chr(148) => "\"",
        chr(151) => "-",
    );

    // Control characters 0-8 and 14-31 => numeric entity.
    // Codes 9-13 (tab, line feed, vertical tab, form feed, carriage return)
    // are not part of the table.
    self::$CONTROL_CODE_CHARS = array();
    foreach (array_merge(range(0, 8), range(14, 31)) as $code) {
        self::$CONTROL_CODE_CHARS[chr($code)] = '&#' . $code . ';';
    }

    // Single byte code => Unicode code point used for its numeric entity.
    $invalidCodes = array(
        128 => 8364, 130 => 8218, 131 => 402,  132 => 8222, 133 => 8230,
        134 => 8224, 135 => 8225, 136 => 710,  137 => 8240, 138 => 352,
        139 => 8249, 140 => 338,  142 => 381,  145 => 8216, 146 => 8217,
        147 => 8220, 148 => 8221, 149 => 8226, 150 => 8211, 151 => 8212,
        152 => 732,  153 => 8482, 154 => 353,  155 => 8250, 156 => 339,
        158 => 382,  159 => 376,
    );
    self::$INVALID_CODE_CHARS = array();
    foreach ($invalidCodes as $byte => $codePoint) {
        self::$INVALID_CODE_CHARS[chr($byte)] = '&#' . $codePoint . ';';
    }

    self::$XML_SPECIAL_CHARS = array(
        "&" => "&amp;",
        "<" => "&lt;",
        ">" => "&gt;",
        "\"" => "&quot;",
        "'" => "&apos;",
    );
}
/**
 * Takes the given text and converts any email address into mailto links,
 * returning HTML content.
 *
 * The domain part may consist of several dot-separated labels
 * (e.g. user@mail.example.co.uk); the whole address becomes the link.
 *
 * @param string $text
 * @param bool $sanitize true to sanitize the text before parsing for display security
 * @return string HTML
 */
public static function ConvertEmailToMailTo($text, $sanitize = false)
{
    if ($sanitize) {
        $text = self::Sanitize($text);
    }

    // group 1: local part
    // group 2: one or more "label." pieces followed by an alphabetic TLD
    $regex = '/([a-z0-9_\-\.]+)@((?:[a-z0-9-]{1,64}\.)+[a-z]{2,10})/i';

    return preg_replace($regex, '<a href="mailto:\\1@\\2">\\1@\\2</a>', $text);
}
/**
 * Takes the given text and converts any URLs into links,
 * returning HTML content.
 *
 * A URL is any "scheme://..." run that contains no whitespace, angle
 * brackets or double quotes and ends in an alphanumeric character or "/".
 * Trailing punctuation such as a sentence-ending period is therefore left
 * outside of the link.
 *
 * NOTE(review): every alphabetic scheme is linked, not only http/https;
 * restrict the pattern if the text can come from untrusted users.
 *
 * @param string $text
 * @param bool $sanitize true to sanitize the text before parsing for display security
 * @return string HTML
 */
public static function ConvertUrlToLink($text, $sanitize = false)
{
    if ($sanitize) {
        $text = self::Sanitize($text);
    }

    // "~" is used as the delimiter because the pattern itself contains "/".
    // With "/" as delimiter the first slash of "://" ends the pattern and
    // preg_replace() fails.
    $regex = '~[[:alpha:]]+://[^<>"[:space:]]+[[:alnum:]/]~i';

    // \0 is the whole match. The attribute quotes are plain double quotes;
    // a backslash in front of them would be emitted literally.
    return preg_replace($regex, '<a href="\\0">\\0</a>', $text);
}
/**
 * Sanitize any text so that it can be safely displayed as HTML without
 * allowing XSS or other injection attacks.
 *
 * The flags and character set are passed explicitly so the result does not
 * depend on the PHP version's defaults: both double and single quotes are
 * escaped (ENT_QUOTES) and invalid byte sequences are substituted instead of
 * turning the whole result into an empty string (ENT_SUBSTITUTE).
 *
 * @param string $text
 * @return string
 */
public static function Sanitize($text)
{
    return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, self::$DEFAULT_CHARACTER_SET);
}
/**
 * HTML-encodes a string, either through UTF8ToHTML() (numeric entities) or
 * through UTFToNamedHTML() (named entities where possible).
 *
 * @param string $string
 * @param bool $numericEncodingOnly
 *            true to only use numeric html encoding; false may be slower (default true)
 * @param bool $encodeControlCharacters
 *            only relevant if $numericEncodingOnly = false; handed on to UTFToNamedHTML() (default false)
 * @return string an empty string when the input has zero length
 */
public static function EncodeToHTML($string, $numericEncodingOnly = true, $encodeControlCharacters = false)
{
    // nothing to encode
    if (strlen($string) == 0) {
        return "";
    }

    if ($numericEncodingOnly) {
        return self::UTF8ToHTML($string);
    }

    return self::UTFToNamedHTML($string, $encodeControlCharacters);
}
/**
 * Decode a string that has been encoded using EncodeToHTML.
 * Used in combination with utf8_decode this can be helpful.
 *
 * @TODO: warning, this function is BETA!
 *
 * NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated as of
 * PHP 8.2 - confirm the supported PHP versions before relying on this.
 *
 * @param string $string
 * @param string|null $charset
 *            destination character set (default = $DEFAULT_CHARACTER_SET (UTF-8))
 * @return string
 */
public static function DecodeFromHTML($string, $charset = null)
{
    // Going through mbstring lets the caller pick the destination character
    // set; UTF-8 will work most of the time.
    // Loose comparison: null, '' and other empty values select the default.
    $target = ($charset == null) ? self::$DEFAULT_CHARACTER_SET : $charset;

    return mb_convert_encoding($string, $target, 'HTML-ENTITIES');
}
/**
 * HTML-encodes special characters and returns an ascii safe version.
 * Builds on EncodeToHTML and can additionally replace smart quotes and
 * escape the characters that are disruptive in HTML or XML data.
 *
 * @param string $string value to parse
 * @param bool $escapeQuotes
 *            true to additionally escape ENT_QUOTES characters <>&"' (default = true)
 * @param bool $numericEncodingOnly
 *            true to only use numeric html encoding; false may be slower (default true)
 * @param bool $replaceSmartQuotes
 *            true to replace "smart quotes" with standard ascii ones (default = false)
 * @return string an empty string when the input has zero length
 */
public static function EncodeSpecialCharacters($string, $escapeQuotes = true, $numericEncodingOnly = true, $replaceSmartQuotes = false)
{
    if (strlen($string) == 0) {
        return "";
    }

    // smart quotes have to be replaced before anything gets encoded
    $working = $replaceSmartQuotes ? self::ReplaceSmartQuotes($string) : $string;

    if ($escapeQuotes) {
        // the last argument (double_encode = false) leaves existing entities
        // untouched; the single quote becomes a numeric entity
        $working = htmlspecialchars($working, ENT_QUOTES, null, false);
    }

    return self::EncodeToHTML($working, $numericEncodingOnly);
}
/**
 * Converts a string into a character array.
 *
 * The split pattern has no "u" modifier, so the string is cut at every byte:
 * a multi-byte character ends up as several array elements.
 *
 * @param string $string
 * @return array one element per byte; empty array for an empty string
 */
public static function GetCharArray($string)
{
    $everyPosition = "//";
    $pieces = preg_split($everyPosition, $string, -1, PREG_SPLIT_NO_EMPTY);

    return $pieces;
}
/**
 * Replaces the XML special characters listed in $XML_SPECIAL_CHARS
 * with their entities.
 *
 * @param string $string
 * @return string
 */
public static function ReplaceXMLSpecialChars($string)
{
    $replacements = self::$XML_SPECIAL_CHARS;

    return strtr($string, $replacements);
}
/**
 * Replaces the smart (fancy) quote characters listed in $SMART_QUOTE_CHARS
 * with generic ascii versions.
 *
 * @param string $string
 * @return string
 */
public static function ReplaceSmartQuotes($string)
{
    $replacements = self::$SMART_QUOTE_CHARS;

    return strtr($string, $replacements);
}
/**
 * Replaces the control characters listed in $CONTROL_CODE_CHARS
 * with the replacement strings stored in that table.
 *
 * @param string $string
 * @return string
 */
public static function ReplaceControlCodeChars($string)
{
    $replacements = self::$CONTROL_CODE_CHARS;

    return strtr($string, $replacements);
}
/**
 * Replaces all non-numeric (named) html entities found in
 * $HTML_ENTITIES_TABLE with the numeric equivalent.
 *
 * @param string $string
 * @return string
 */
public static function ReplaceNonNumericEntities($string)
{
    $replacements = self::$HTML_ENTITIES_TABLE;

    return strtr($string, $replacements);
}
/**
 * Replaces the illegal ascii code values listed in $INVALID_CODE_CHARS
 * with the replacement strings stored in that table.
 *
 * @param string $string
 * @return string
 */
public static function ReplaceInvalidCodeChars($string)
{
    $replacements = self::$INVALID_CODE_CHARS;

    return strtr($string, $replacements);
}
/**
 * This is the same as UTF8ToHTML except it utilizes htmlentities, which will
 * return the named HTML code when possible (ie &pound; &sect;, etc).
 * It is preferable in all cases to use UTF8ToHTML instead unless you
 * absolutely have to have named entities.
 *
 * Handling per lead byte:
 *  - below 128: copied unchanged
 *  - 128-191 (not a valid start byte): dropped, or passed through
 *    htmlentities() when $encodeControlCharacters is true
 *  - two-byte sequence: named entity if htmlentities() knows one, numeric
 *    entity otherwise
 *  - three- and four-byte sequences: numeric entity
 *  - 248 and above: dropped
 * A multi-byte sequence that is cut off by the end of the string is dropped
 * instead of being read past the end of the input.
 *
 * @param string $string
 * @param bool $encodeControlCharacters
 *            false = drop bytes 128-191 found outside a multi-byte sequence,
 *            true = run them through htmlentities() (default false)
 * @return string
 */
public static function UTFToNamedHTML($string, $encodeControlCharacters = false)
{
    $utf8 = (string) $string;
    $length = strlen($utf8); // computed once, not on every iteration
    $result = '';

    for ($i = 0; $i < $length; $i++) {
        $char = $utf8[$i];
        $ascii = ord($char);

        if ($ascii < 128) {
            // one-byte character
            $result .= $char;
            continue;
        }

        if ($ascii < 192) {
            // non-utf8 character or not a start byte
            $result .= ($encodeControlCharacters) ? htmlentities($char) : '';
            continue;
        }

        // number of continuation bytes announced by the lead byte
        if ($ascii < 224) {
            $extra = 1;
        } elseif ($ascii < 240) {
            $extra = 2;
        } elseif ($ascii < 248) { // (TODO: should this be 245 or 248 ??)
            $extra = 3;
        } else {
            // not a lead byte of any supported sequence
            continue;
        }

        if ($i + $extra >= $length) {
            // truncated sequence: skip the lead byte, the remaining bytes
            // are handled by the next iterations
            continue;
        }

        if ($extra === 1) {
            // two-byte character
            $encoded = htmlentities(substr($utf8, $i, 2), ENT_QUOTES, 'UTF-8');

            // htmlentities returned the character itself, i.e. there is no
            // named entity for it: fall back to the numeric entity
            if ($encoded != '' && substr($encoded, 0, 1) != '&') {
                $unicode = ((31 & $ascii) << 6) | (63 & ord($utf8[$i + 1]));
                $encoded = "&#$unicode;";
            }

            $result .= $encoded;
        } elseif ($extra === 2) {
            // three-byte character
            $ascii1 = ord($utf8[$i + 1]);
            $ascii2 = ord($utf8[$i + 2]);
            $unicode = (15 & $ascii) * 4096 + (63 & $ascii1) * 64 + (63 & $ascii2);
            $result .= "&#$unicode;";
        } else {
            // four-byte character (only the low three bits of the lead byte
            // carry payload)
            $ascii1 = ord($utf8[$i + 1]);
            $ascii2 = ord($utf8[$i + 2]);
            $ascii3 = ord($utf8[$i + 3]);
            $unicode = (7 & $ascii) * 262144 + (63 & $ascii1) * 4096 + (63 & $ascii2) * 64 + (63 & $ascii3);
            $result .= "&#$unicode;";
        }

        // skip the continuation bytes that were just consumed
        $i += $extra;
    }

    return $result;
}
/**
 * Converts UTF-8 character set into html encoded goodness.
 *
 * The content is split with unicode_string_to_array() and every character
 * is run through unicode_entity_replace().
 *
 * @author montana
 * @link http://www.php.net/manual/en/function.htmlentities.php#92105
 * @param string $content
 * @return string
 */
public static function UTF8ToHTML($content = "")
{
    $encoded = "";

    foreach (self::unicode_string_to_array($content) as $character) {
        $encoded .= self::unicode_entity_replace($character);
    }

    return mb_convert_encoding($encoded, "UTF-8"); // not really necessary, but why not.
}
/**
 * Takes a unicode string and turns it into an array of UTF-8 characters.
 *
 * @author adjwilli
 * @param string $string
 * @return array one element per character; empty array for an empty string
 */
public static function unicode_string_to_array($string)
{
    $string = (string) $string;
    if ($string === '') {
        return array();
    }

    // Single pass over the string. With the "u" modifier the empty pattern
    // matches between characters, not between bytes.
    $characters = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);
    if ($characters !== false) {
        return $characters;
    }

    // preg_split() refuses input that is not valid UTF-8; peel characters off
    // one at a time in that case. The encoding is passed to every mb_* call
    // so counting and slicing agree with each other.
    $array = array();
    $strlen = mb_strlen($string, "UTF-8");
    while ($strlen) {
        $array[] = mb_substr($string, 0, 1, "UTF-8");
        $string = mb_substr($string, 1, $strlen, "UTF-8");
        $strlen = mb_strlen($string, "UTF-8");
    }

    return $array;
}
/**
 * Uses binary math to replace a single UTF-8 character with its numeric
 * html entity.
 *
 * Characters whose first byte is below 0xC2 (plain ASCII, or a byte that
 * cannot start a multi-byte sequence) are returned unchanged, and so is a
 * value that is shorter than its first byte announces. A first byte above
 * 0xF4 yields an empty string.
 *
 * @author m. perez
 * @param string $c a single (possibly multi-byte) character
 * @return string
 */
public static function unicode_entity_replace($c)
{
    $bytes = (string) $c;
    $available = strlen($bytes);
    if ($available === 0) {
        return $c;
    }

    // square brackets: the curly brace offset syntax no longer parses on PHP 8
    $h = ord($bytes[0]);
    if ($h < 0xC2) { // 0x00-0x7F is ASCII, 0x80-0xC1 cannot start a sequence
        return $c;
    }

    if ($h <= 0xDF) { // 0xDF = 223: two-byte sequence
        if ($available < 2) {
            return $c;
        }
        $codePoint = (($h & 0x1F) << 6) | (ord($bytes[1]) & 0x3F);
    } elseif ($h <= 0xEF) { // 0xEF = 239: three-byte sequence
        if ($available < 3) {
            return $c;
        }
        $codePoint = (($h & 0x0F) << 12) | ((ord($bytes[1]) & 0x3F) << 6) | (ord($bytes[2]) & 0x3F);
    } elseif ($h <= 0xF4) { // 0xF4 = 244 (TODO: should this be 244 or 247 ??)
        if ($available < 4) {
            return $c;
        }
        // only the low three bits of a four-byte lead byte carry payload
        $codePoint = (($h & 0x07) << 18) | ((ord($bytes[1]) & 0x3F) << 12) | ((ord($bytes[2]) & 0x3F) << 6) | (ord($bytes[3]) & 0x3F);
    } else {
        // no valid UTF-8 sequence starts with such a byte
        return '';
    }

    return "&#" . $codePoint . ";";
}
/**
 * Used for decoding entities that started as UTF-8.
 * Converts a character code that is likely non ascii into the correct UTF-8
 * char value.
 *
 * Codes 128-159 are first remapped through a fixed table (codes without an
 * entry of their own become 160). Declared static because the class calls
 * it as self::chr_utf8().
 *
 * @link http://www.php.net/manual/en/function.html-entity-decode.php#68491
 * @param int $code character code
 * @return string|false the encoded character, or false for a negative code
 */
public static function chr_utf8($code)
{
    if ($code < 0) {
        return false;
    }

    if ($code < 128) {
        return chr($code);
    }

    if ($code < 160) {
        // remove Windows illegal chars: map each code to its replacement
        $remapped = array(
            128 => 8364, 129 => 160,  130 => 8218, 131 => 402,
            132 => 8222, 133 => 8230, 134 => 8224, 135 => 8225,
            136 => 710,  137 => 8240, 138 => 352,  139 => 8249,
            140 => 338,  141 => 160,  142 => 381,  143 => 160,
            144 => 160,  145 => 8216, 146 => 8217, 147 => 8220,
            148 => 8221, 149 => 8226, 150 => 8211, 151 => 8212,
            152 => 732,  153 => 8482, 154 => 353,  155 => 8250,
            156 => 339,  157 => 160,  158 => 382,  159 => 376,
        );
        if (isset($remapped[$code])) {
            $code = $remapped[$code];
        }
    }

    if ($code < 2048) {
        // two bytes: 110xxxxx 10xxxxxx
        return chr(192 | ($code >> 6)) . chr(128 | ($code & 63));
    }

    if ($code < 65536) {
        // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        return chr(224 | ($code >> 12)) . chr(128 | (($code >> 6) & 63)) . chr(128 | ($code & 63));
    }

    // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr(240 | ($code >> 18)) . chr(128 | (($code >> 12) & 63)) . chr(128 | (($code >> 6) & 63)) . chr(128 | ($code & 63));
}
/**
 * Callback for preg_replace_callback('~&(#(x?))?([^;]+);~', 'html_entity_replace', $str);
 * used internally by decode.
 *
 * Declared static so it can be used as a class-level callback and can call
 * self::chr_utf8().
 *
 * @link http://www.php.net/manual/en/function.html-entity-decode.php#68491
 * @param array $matches
 *            regex match: [1] = "#" or "#x" for numeric entities,
 *            [2] = "x" for hexadecimal ones, [3] = the entity body
 * @return string|false the decoded character; false when no branch applies
 */
public static function html_entity_replace($matches)
{
    if ($matches[2]) {
        // hexadecimal numeric entity, e.g. &#xE9;
        return self::chr_utf8(hexdec($matches[3]));
    }

    if ($matches[1]) {
        // decimal numeric entity, e.g. &#233;
        return self::chr_utf8($matches[3]);
    }

    if ($matches[3]) {
        // named entity, e.g. &eacute;
        return html_entity_decode('&' . $matches[3] . ';');
    }

    return false;
}
// This statement runs when the file is loaded and fills the static lookup
// tables of VerySimpleStringUtil; it will be executed only once.
VerySimpleStringUtil::InitStaticVars();