Codemod asserts to assertxs in the runtime
[hiphop-php.git] / hphp / runtime / base / zend-string.cpp
blob2e7e52157c88d41b2124c93e5e1ebf2e8835bb79
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #include "hphp/runtime/base/zend-string.h"
19 #include "hphp/runtime/base/zend-printf.h"
20 #include "hphp/runtime/base/zend-math.h"
22 #include "hphp/util/lock.h"
23 #include "hphp/util/overflow.h"
24 #include <cmath>
26 #ifndef _MSC_VER
27 #include <monetary.h>
28 #endif
30 #include "hphp/util/bstring.h"
31 #include "hphp/runtime/base/exceptions.h"
32 #include "hphp/runtime/base/string-buffer.h"
33 #include "hphp/runtime/base/runtime-error.h"
34 #include "hphp/runtime/base/string-util.h"
35 #include "hphp/runtime/base/builtin-functions.h"
37 #include <folly/portability/String.h>
39 #define PHP_QPRINT_MAXL 75
41 namespace HPHP {
42 ///////////////////////////////////////////////////////////////////////////////
43 // helpers
45 void string_charmask(const char *sinput, int len, char *mask) {
46 const unsigned char *input = (unsigned char *)sinput;
47 const unsigned char *end;
48 unsigned char c;
50 memset(mask, 0, 256);
51 for (end = input+len; input < end; input++) {
52 c=*input;
53 if ((input+3 < end) && input[1] == '.' && input[2] == '.'
54 && input[3] >= c) {
55 memset(mask+c, 1, input[3] - c + 1);
56 input+=3;
57 } else if ((input+1 < end) && input[0] == '.' && input[1] == '.') {
58 /* Error, try to be as helpful as possible:
59 (a range ending/starting with '.' won't be captured here) */
60 if (end-len >= input) { /* there was no 'left' char */
61 throw_invalid_argument
62 ("charlist: Invalid '..'-range, missing left of '..'");
63 continue;
65 if (input+2 >= end) { /* there is no 'right' char */
66 throw_invalid_argument
67 ("charlist: Invalid '..'-range, missing right of '..'");
68 continue;
70 if (input[-1] > input[2]) { /* wrong order */
71 throw_invalid_argument
72 ("charlist: '..'-range needs to be incrementing");
73 continue;
75 /* FIXME: better error (a..b..c is the only left possibility?) */
76 throw_invalid_argument("charlist: Invalid '..'-range");
77 continue;
78 } else {
79 mask[c]=1;
/*
 * Bounded copy in the manner of strlcpy: copies at most siz - 1 bytes of the
 * NUL-terminated string src into dst and NUL-terminates dst whenever
 * siz != 0.
 *
 * Returns the length of src (terminator not counted), so a return value
 * >= siz means the copy was truncated.
 */
int string_copy(char *dst, const char *src, int siz) {
  char *out = dst;
  const char *in = src;
  size_t room = siz;

  /* Copy as many bytes as will fit, stopping early at the terminator. */
  if (room != 0) {
    while (--room != 0) {
      if ((*out++ = *in++) == 0) {
        break;
      }
    }
  }

  /* room hit zero: dst is full (or siz was 0). Terminate dst if it has any
     space at all, then walk the remainder of src to learn its length. */
  if (room == 0) {
    if (siz != 0) {
      *out = '\0';
    }
    while (*in++) {
    }
  }

  /* `in` is one past the terminator; the count does not include the NUL. */
  return (in - src - 1);
}
108 ///////////////////////////////////////////////////////////////////////////////
109 // comparisons
/*
 * Compare the first len bytes of s1 and s2 as plain `char` values.
 * Returns 1, -1 or 0. NUL bytes are not treated specially: exactly len
 * bytes are examined unless a difference is found earlier.
 */
int string_ncmp(const char *s1, const char *s2, int len) {
  const char *lhs = s1;
  const char *rhs = s2;
  for (int left = len; left > 0; --left, ++lhs, ++rhs) {
    if (*lhs != *rhs) {
      return (*lhs > *rhs) ? 1 : -1;
    }
  }
  return 0;
}
/*
 * Compare two right-aligned digit runs starting at *a and *b (bounded by
 * aend / bend). The longer run wins; for runs of equal length the first
 * differing digit decides. Both cursors are advanced past the digits that
 * were examined.
 *
 * Returns -1, 0 or +1.
 */
static int compare_right(char const **a, char const *aend,
                         char const **b, char const *bend) {
  /* Sign of the first digit difference seen. It only matters once both
     runs turn out to have the same length. */
  int bias = 0;

  for (;; (*a)++, (*b)++) {
    const bool a_done = (*a == aend) || !isdigit((int)(unsigned char)**a);
    const bool b_done = (*b == bend) || !isdigit((int)(unsigned char)**b);

    if (a_done && b_done) {
      return bias;
    }
    if (a_done) {
      return -1;
    }
    if (b_done) {
      return +1;
    }

    if (bias == 0) {
      if (**a < **b) {
        bias = -1;
      } else if (**a > **b) {
        bias = +1;
      }
    }
  }

  return 0;
}
/*
 * Compare two left-aligned digit runs starting at *a and *b (bounded by
 * aend / bend): the first differing digit decides, and a run that ends
 * first loses. Both cursors are advanced past the digits that were examined.
 *
 * Returns -1, 0 or +1.
 */
static int compare_left(char const **a, char const *aend,
                        char const **b, char const *bend) {
  for (;; (*a)++, (*b)++) {
    const bool a_done = (*a == aend) || !isdigit((int)(unsigned char)**a);
    const bool b_done = (*b == bend) || !isdigit((int)(unsigned char)**b);

    if (a_done && b_done) {
      return 0;
    }
    if (a_done) {
      return -1;
    }
    if (b_done) {
      return +1;
    }
    if (**a != **b) {
      return (**a < **b) ? -1 : +1;
    }
  }

  return 0;
}
170 int string_natural_cmp(char const *a, size_t a_len,
171 char const *b, size_t b_len, int fold_case) {
172 char ca, cb;
173 char const *ap, *bp;
174 char const *aend = a + a_len, *bend = b + b_len;
175 int fractional, result;
177 if (a_len == 0 || b_len == 0)
178 return a_len - b_len;
180 ap = a;
181 bp = b;
182 while (1) {
183 ca = *ap; cb = *bp;
185 /* skip over leading spaces or zeros */
186 while (isspace((int)(unsigned char)ca))
187 ca = *++ap;
189 while (isspace((int)(unsigned char)cb))
190 cb = *++bp;
192 /* process run of digits */
193 if (isdigit((int)(unsigned char)ca) && isdigit((int)(unsigned char)cb)) {
194 fractional = (ca == '0' || cb == '0');
196 if (fractional)
197 result = compare_left(&ap, aend, &bp, bend);
198 else
199 result = compare_right(&ap, aend, &bp, bend);
201 if (result != 0)
202 return result;
203 else if (ap == aend && bp == bend)
204 /* End of the strings. Let caller sort them out. */
205 return 0;
206 else {
207 /* Keep on comparing from the current point. */
208 ca = *ap; cb = *bp;
212 if (fold_case) {
213 ca = toupper((int)(unsigned char)ca);
214 cb = toupper((int)(unsigned char)cb);
217 if (ca < cb)
218 return -1;
219 else if (ca > cb)
220 return +1;
222 ++ap; ++bp;
223 if (ap >= aend && bp >= bend)
224 /* The strings compare the same. Perhaps the caller
225 will want to call strcmp to break the tie. */
226 return 0;
227 else if (ap >= aend)
228 return -1;
229 else if (bp >= bend)
230 return 1;
234 ///////////////////////////////////////////////////////////////////////////////
236 void string_to_case(String& s, int (*tocase)(int)) {
237 assertx(!s.isNull());
238 assertx(tocase);
239 auto data = s.mutableData();
240 auto len = s.size();
241 for (int i = 0; i < len; i++) {
242 data[i] = tocase(data[i]);
246 ///////////////////////////////////////////////////////////////////////////////
248 #define STR_PAD_LEFT 0
249 #define STR_PAD_RIGHT 1
250 #define STR_PAD_BOTH 2
252 String string_pad(const char *input, int len, int pad_length,
253 const char *pad_string, int pad_str_len,
254 int pad_type) {
255 assertx(input);
256 int num_pad_chars = pad_length - len;
258 /* If resulting string turns out to be shorter than input string,
259 we simply copy the input and return. */
260 if (pad_length < 0 || num_pad_chars < 0) {
261 return String(input, len, CopyString);
264 /* Setup the padding string values if specified. */
265 if (pad_str_len == 0) {
266 throw_invalid_argument("pad_string: (empty)");
267 return String();
270 String ret(pad_length, ReserveString);
271 char *result = ret.mutableData();
273 /* We need to figure out the left/right padding lengths. */
274 int left_pad, right_pad;
275 switch (pad_type) {
276 case STR_PAD_RIGHT:
277 left_pad = 0;
278 right_pad = num_pad_chars;
279 break;
280 case STR_PAD_LEFT:
281 left_pad = num_pad_chars;
282 right_pad = 0;
283 break;
284 case STR_PAD_BOTH:
285 left_pad = num_pad_chars / 2;
286 right_pad = num_pad_chars - left_pad;
287 break;
288 default:
289 throw_invalid_argument("pad_type: %d", pad_type);
290 return String();
293 /* First we pad on the left. */
294 int result_len = 0;
295 for (int i = 0; i < left_pad; i++) {
296 result[result_len++] = pad_string[i % pad_str_len];
299 /* Then we copy the input string. */
300 memcpy(result + result_len, input, len);
301 result_len += len;
303 /* Finally, we pad on the right. */
304 for (int i = 0; i < right_pad; i++) {
305 result[result_len++] = pad_string[i % pad_str_len];
307 ret.setSize(result_len);
308 return ret;
311 ///////////////////////////////////////////////////////////////////////////////
313 int string_find(const char *input, int len, char ch, int pos,
314 bool case_sensitive) {
315 assertx(input);
316 if (pos < 0 || pos > len) {
317 return -1;
319 const void *ptr;
320 if (case_sensitive) {
321 ptr = memchr(input + pos, ch, len - pos);
322 } else {
323 ptr = bstrcasechr(input + pos, ch, len - pos);
325 if (ptr != nullptr) {
326 return (int)((const char *)ptr - input);
328 return -1;
331 int string_rfind(const char *input, int len, char ch, int pos,
332 bool case_sensitive) {
333 assertx(input);
334 if (pos < -len || pos > len) {
335 return -1;
337 const void *ptr;
338 if (case_sensitive) {
339 if (pos >= 0) {
340 ptr = memrchr(input + pos, ch, len - pos);
341 } else {
342 ptr = memrchr(input, ch, len + pos + 1);
344 } else {
345 if (pos >= 0) {
346 ptr = bstrrcasechr(input + pos, ch, len - pos);
347 } else {
348 ptr = bstrrcasechr(input, ch, len + pos + 1);
351 if (ptr != nullptr) {
352 return (int)((const char *)ptr - input);
354 return -1;
357 int string_find(const char *input, int len, const char *s, int s_len,
358 int pos, bool case_sensitive) {
359 assertx(input);
360 assertx(s);
361 if (!s_len || pos < 0 || pos > len) {
362 return -1;
364 void *ptr;
365 if (case_sensitive) {
366 ptr = (void*)string_memnstr(input + pos, s, s_len, input + len);
367 } else {
368 ptr = bstrcasestr(input + pos, len - pos, s, s_len);
370 if (ptr != nullptr) {
371 return (int)((const char *)ptr - input);
373 return -1;
376 int string_rfind(const char *input, int len, const char *s, int s_len,
377 int pos, bool case_sensitive) {
378 assertx(input);
379 assertx(s);
380 if (!s_len || pos < -len || pos > len) {
381 return -1;
383 void *ptr;
384 if (case_sensitive) {
385 if (pos >= 0) {
386 ptr = bstrrstr(input + pos, len - pos, s, s_len);
387 } else {
388 ptr = bstrrstr(input, len + pos + s_len, s, s_len);
390 } else {
391 if (pos >= 0) {
392 ptr = bstrrcasestr(input + pos, len - pos, s, s_len);
393 } else {
394 ptr = bstrrcasestr(input, len + pos + s_len, s, s_len);
397 if (ptr != nullptr) {
398 return (int)((const char *)ptr - input);
400 return -1;
/*
 * Find the first occurrence of needle[0..needle_len) in [haystack, end).
 *
 * Returns a pointer to the start of the match, or nullptr when there is
 * none. An empty needle, or a needle longer than the haystack, never
 * matches.
 */
const char *string_memnstr(const char *haystack, const char *needle,
                           int needle_len, const char *end) {
  // Guard before touching needle[needle_len - 1] or computing
  // end - needle_len: both are invalid for an empty or oversized needle.
  if (needle_len <= 0 || end - haystack < needle_len) {
    return nullptr;
  }

  const char first = needle[0];
  const char last = needle[needle_len - 1];
  // Last position at which a full-length match can still begin.
  const char *const last_start = end - needle_len;

  for (const char *p = haystack; p <= last_start; p++) {
    // Jump to the next candidate: a byte equal to the needle's first byte.
    p = (const char *)memchr(p, first, last_start - p + 1);
    if (p == nullptr) {
      return nullptr;
    }
    // Cheap reject on the final byte, then confirm the rest.
    if (last == p[needle_len - 1] && !memcmp(needle, p, needle_len - 1)) {
      return p;
    }
  }
  return nullptr;
}
423 String string_replace(const char *s, int len, int start, int length,
424 const char *replacement, int len_repl) {
425 assertx(s);
426 assertx(replacement);
427 assertx(len >= 0);
429 // if "start" position is negative, count start position from the end
430 // of the string
431 if (start < 0) {
432 start = len + start;
433 if (start < 0) {
434 start = 0;
437 if (start > len) {
438 start = len;
440 // if "length" position is negative, set it to the length
441 // needed to stop that many chars from the end of the string
442 if (length < 0) {
443 length = (len - start) + length;
444 if (length < 0) {
445 length = 0;
448 // check if length is too large
449 if (length > len) {
450 length = len;
452 // check if the length is too large adjusting for non-zero start
453 // Write this way instead of start + length > len to avoid overflow
454 if (length > len - start) {
455 length = len - start;
458 String retString(len + len_repl - length, ReserveString);
459 char *ret = retString.mutableData();
461 int ret_len = 0;
462 if (start) {
463 memcpy(ret, s, start);
464 ret_len += start;
466 if (len_repl) {
467 memcpy(ret + ret_len, replacement, len_repl);
468 ret_len += len_repl;
470 len -= (start + length);
471 if (len) {
472 memcpy(ret + ret_len, s + start + length, len);
473 ret_len += len;
475 retString.setSize(ret_len);
476 return retString;
479 String string_replace(const char *input, int len,
480 const char *search, int len_search,
481 const char *replacement, int len_replace,
482 int &count, bool case_sensitive) {
483 assertx(input);
484 assertx(search && len_search);
485 assertx(len >= 0);
486 assertx(len_search >= 0);
487 assertx(len_replace >= 0);
489 if (len == 0) {
490 return String();
493 req::vector<int> founds;
494 founds.reserve(16);
495 if (len_search == 1) {
496 for (int pos = string_find(input, len, *search, 0, case_sensitive);
497 pos >= 0;
498 pos = string_find(input, len, *search, pos + len_search,
499 case_sensitive)) {
500 founds.push_back(pos);
502 } else {
503 for (int pos = string_find(input, len, search, len_search, 0,
504 case_sensitive);
505 pos >= 0;
506 pos = string_find(input, len, search, len_search,
507 pos + len_search, case_sensitive)) {
508 founds.push_back(pos);
512 count = founds.size();
513 if (count == 0) {
514 return String(); // not found
517 int reserve;
519 // Make sure the new size of the string wouldn't overflow int32_t. Don't
520 // bother if the replacement wouldn't make the string longer.
521 if (len_replace > len_search) {
522 auto raise = [&] { raise_error("String too large"); };
523 if (mul_overflow(len_replace - len_search, count)) {
524 raise();
526 int diff = (len_replace - len_search) * count;
527 if (add_overflow(len, diff)) {
528 raise();
530 reserve = len + diff;
531 } else {
532 reserve = len + (len_replace - len_search) * count;
535 String retString(reserve, ReserveString);
536 char *ret = retString.mutableData();
537 char *p = ret;
538 int pos = 0; // last position in input that hasn't been copied over yet
539 int n;
540 for (unsigned int i = 0; i < founds.size(); i++) {
541 n = founds[i];
542 if (n > pos) {
543 n -= pos;
544 memcpy(p, input, n);
545 p += n;
546 input += n;
547 pos += n;
549 if (len_replace) {
550 memcpy(p, replacement, len_replace);
551 p += len_replace;
553 input += len_search;
554 pos += len_search;
556 n = len;
557 if (n > pos) {
558 n -= pos;
559 memcpy(p, input, n);
560 p += n;
562 retString.setSize(p - ret);
563 return retString;
566 ///////////////////////////////////////////////////////////////////////////////
568 String string_chunk_split(const char *src, int srclen, const char *end,
569 int endlen, int chunklen) {
570 int chunks = srclen / chunklen; // complete chunks!
571 int restlen = srclen - chunks * chunklen; /* srclen % chunklen */
573 String ret(
574 safe_address(
575 chunks + 1,
576 endlen,
577 srclen
579 ReserveString
581 char *dest = ret.mutableData();
583 const char *p; char *q;
584 const char *pMax = src + srclen - chunklen + 1;
585 for (p = src, q = dest; p < pMax; ) {
586 memcpy(q, p, chunklen);
587 q += chunklen;
588 memcpy(q, end, endlen);
589 q += endlen;
590 p += chunklen;
593 if (restlen) {
594 memcpy(q, p, restlen);
595 q += restlen;
596 memcpy(q, end, endlen);
597 q += endlen;
600 ret.setSize(q - dest);
601 return ret;
604 ///////////////////////////////////////////////////////////////////////////////
606 #define PHP_TAG_BUF_SIZE 1023
609 * Check if tag is in a set of tags
611 * states:
613 * 0 start tag
614 * 1 first non-whitespace char seen
616 static int string_tag_find(const char *tag, int len, const char *set) {
617 char c, *n;
618 const char *t;
619 int state=0, done=0;
620 char *norm;
622 if (len <= 0) {
623 return 0;
626 norm = (char *)req::malloc_noptrs(len+1);
627 SCOPE_EXIT { req::free(norm); };
629 n = norm;
630 t = tag;
631 c = tolower(*t);
633 normalize the tag removing leading and trailing whitespace
634 and turn any <a whatever...> into just <a> and any </tag>
635 into <tag>
637 while (!done) {
638 switch (c) {
639 case '<':
640 *(n++) = c;
641 break;
642 case '>':
643 done =1;
644 break;
645 default:
646 if (!isspace((int)c)) {
647 if (state == 0) {
648 state=1;
650 if (c != '/') {
651 *(n++) = c;
653 } else {
654 if (state == 1)
655 done=1;
657 break;
659 c = tolower(*(++t));
661 *(n++) = '>';
662 *n = '\0';
663 if (strstr(set, norm)) {
664 done=1;
665 } else {
666 done=0;
668 return done;
672 * A simple little state-machine to strip out html and php tags
674 * State 0 is the output state, State 1 means we are inside a
675 * normal html tag and state 2 means we are inside a php tag.
677 * The state variable is passed in to allow a function like fgetss
678 * to maintain state across calls to the function.
680 * lc holds the last significant character read and br is a bracket
681 * counter.
683 * When an allow string is passed in we keep track of the string
684 * in state 1 and when the tag is closed check it against the
685 * allow string to see if we should allow it.
687 * swm: Added the ability to strip <?xml tags without assuming they are
688 * PHP code.
// Strip HTML/XML/PHP tags from s[0..len) and return the remaining text.
//
// The result starts as a copy of the input and is rewritten in place through
// the cursor rp; its final size is rp - rbuf. When allow_len is non-zero, a
// lower-cased copy of `allow` is kept and the text of each HTML tag is
// collected in tbuf; a finished tag is copied back to the output only if
// string_tag_find() finds it in that copy.
//
// Values taken by `state` in the code below:
//   0  plain text (bytes are copied to the output)
//   1  inside an HTML/XML tag
//   2  inside a "<?" (PHP) block
//   3  after "<!"
//   4  inside a "<!--" comment
// Other locals:
//   depth  counts '<' seen while already in state 1
//   in_q   quote character currently open inside a tag, 0 if none
//   br     bracket counter for state 2
//   lc     last significant character read
// If allow_tag_spaces is false, a '<' followed by whitespace is treated as
// an ordinary character rather than as the start of a tag.
690 String string_strip_tags(const char *s, const int len,
691 const char *allow, const int allow_len,
692 bool allow_tag_spaces) {
693 const char *abuf, *p;
694 char *rbuf, *tbuf, *tp, *rp, c, lc;
696 int br, i=0, depth=0, in_q = 0;
697 int state = 0, pos;
699 assertx(s);
700 assertx(allow);
702 String retString(s, len, CopyString);
703 rbuf = retString.mutableData();
704 String allowString;
706 c = *s;
707 lc = '\0';
708 p = s;
709 rp = rbuf;
710 br = 0;
// Prepare the lower-cased allow list and the tag collection buffer; both
// stay null when no tags are allowed.
711 if (allow_len) {
712 assertx(allow);
714 allowString = String(allow_len, ReserveString);
715 char *atmp = allowString.mutableData();
716 for (const char *tmp = allow; *tmp; tmp++, atmp++) {
717 *atmp = tolower((int)*(const unsigned char *)tmp);
719 allowString.setSize(allow_len);
720 abuf = allowString.data();
722 tbuf = (char *)req::malloc_noptrs(PHP_TAG_BUF_SIZE+1);
723 tp = tbuf;
724 } else {
725 abuf = nullptr;
726 tbuf = tp = nullptr;
// Called before each write to tbuf: once PHP_TAG_BUF_SIZE bytes are in use,
// reallocate the buffer larger and re-point tp into the new allocation.
729 auto move = [&pos, &tbuf, &tp]() {
730 if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
731 pos = tp - tbuf;
732 tbuf = (char*)req::realloc_noptrs(tbuf,
733 (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
734 tp = tbuf + pos;
// Main loop: one input byte per iteration; c is the current byte and p
// points at it.
738 while (i < len) {
739 switch (c) {
740 case '\0':
741 break;
742 case '<':
743 if (isspace(*(p + 1)) && !allow_tag_spaces) {
744 goto reg_char;
746 if (state == 0) {
747 lc = '<';
748 state = 1;
749 if (allow_len) {
750 move();
751 *(tp++) = '<';
753 } else if (state == 1) {
754 depth++;
756 break;
758 case '(':
759 if (state == 2) {
760 if (lc != '"' && lc != '\'') {
761 lc = '(';
762 br++;
764 } else if (allow_len && state == 1) {
765 move();
766 *(tp++) = c;
767 } else if (state == 0) {
768 *(rp++) = c;
770 break;
772 case ')':
773 if (state == 2) {
774 if (lc != '"' && lc != '\'') {
775 lc = ')';
776 br--;
778 } else if (allow_len && state == 1) {
779 move();
780 *(tp++) = c;
781 } else if (state == 0) {
782 *(rp++) = c;
784 break;
// '>' first unwinds nested '<' (depth) and is ignored while a quote is open;
// otherwise what it closes depends on the current state.
786 case '>':
787 if (depth) {
788 depth--;
789 break;
792 if (in_q) {
793 break;
796 switch (state) {
797 case 1: /* HTML/XML */
798 lc = '>';
799 in_q = state = 0;
800 if (allow_len) {
801 move();
802 *(tp++) = '>';
803 *tp='\0';
804 if (string_tag_find(tbuf, tp-tbuf, abuf)) {
805 memcpy(rp, tbuf, tp-tbuf);
806 rp += tp-tbuf;
808 tp = tbuf;
810 break;
812 case 2: /* PHP */
813 if (!br && lc != '\"' && *(p-1) == '?') {
814 in_q = state = 0;
815 tp = tbuf;
817 break;
819 case 3:
820 in_q = state = 0;
821 tp = tbuf;
822 break;
824 case 4: /* JavaScript/CSS/etc... */
825 if (p >= s + 2 && *(p-1) == '-' && *(p-2) == '-') {
826 in_q = state = 0;
827 tp = tbuf;
829 break;
831 default:
832 *(rp++) = c;
833 break;
835 break;
837 case '"':
838 case '\'':
839 if (state == 4) {
840 /* Inside <!-- comment --> */
841 break;
842 } else if (state == 2 && *(p-1) != '\\') {
843 if (lc == c) {
844 lc = '\0';
845 } else if (lc != '\\') {
846 lc = c;
848 } else if (state == 0) {
849 *(rp++) = c;
850 } else if (allow_len && state == 1) {
851 move();
852 *(tp++) = c;
// Toggle in_q for an unescaped quote inside a tag: open it when no quote is
// open, close it when this byte matches the open quote.
854 if (state && p != s && *(p-1) != '\\' && (!in_q || *p == in_q)) {
855 if (in_q) {
856 in_q = 0;
857 } else {
858 in_q = *p;
861 break;
863 case '!':
864 /* JavaScript & Other HTML scripting languages */
865 if (state == 1 && *(p-1) == '<') {
866 state = 3;
867 lc = c;
868 } else {
869 if (state == 0) {
870 *(rp++) = c;
871 } else if (allow_len && state == 1) {
872 move();
873 *(tp++) = c;
876 break;
878 case '-':
879 if (state == 3 && p >= s + 2 && *(p-1) == '-' && *(p-2) == '!') {
880 state = 4;
881 } else {
882 goto reg_char;
884 break;
// '?' directly after '<' enters state 2; otherwise control falls through
// the 'E'/'e' and 'l' cases to the regular-character handling.
886 case '?':
888 if (state == 1 && *(p-1) == '<') {
889 br=0;
890 state=2;
891 break;
894 case 'E':
895 case 'e':
896 /* !DOCTYPE exception */
897 if (state==3 && p > s+6
898 && tolower(*(p-1)) == 'p'
899 && tolower(*(p-2)) == 'y'
900 && tolower(*(p-3)) == 't'
901 && tolower(*(p-4)) == 'c'
902 && tolower(*(p-5)) == 'o'
903 && tolower(*(p-6)) == 'd') {
904 state = 1;
905 break;
907 /* fall-through */
909 case 'l':
911 /* swm: If we encounter '<?xml' then we shouldn't be in
912 * state == 2 (PHP). Switch back to HTML.
915 if (state == 2 && p > s+2 && *(p-1) == 'm' && *(p-2) == 'x') {
916 state = 1;
917 break;
920 /* fall-through */
// Regular character: copied to the output in state 0, collected into the
// tag buffer in state 1 when an allow list exists, dropped otherwise.
921 default:
922 reg_char:
923 if (state == 0) {
924 *(rp++) = c;
925 } else if (allow_len && state == 1) {
926 move();
927 *(tp++) = c;
929 break;
931 c = *(++p);
932 i++;
934 if (rp < rbuf + len) {
935 *rp = '\0';
937 if (allow_len) {
938 req::free(tbuf);
941 retString.setSize(rp - rbuf);
942 return retString;
945 ///////////////////////////////////////////////////////////////////////////////
/*
 * Value of a single hexadecimal digit character ('0'-'9', 'A'-'F',
 * 'a'-'f'), or -1 when c is not a hex digit.
 */
static char string_hex2int(int c) {
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'A' && c <= 'F') {
    return 10 + (c - 'A');
  }
  if (c >= 'a' && c <= 'f') {
    return 10 + (c - 'a');
  }
  return -1;
}
960 String string_quoted_printable_encode(const char *input, int len) {
961 size_t length = len;
962 const unsigned char *str = (unsigned char*)input;
964 unsigned long lp = 0;
965 unsigned char c;
966 char *d, *buffer;
967 char *hex = "0123456789ABCDEF";
969 String ret(
970 safe_address(
972 length + ((safe_address(3, length, 0)/(PHP_QPRINT_MAXL-9)) + 1),
974 ReserveString
976 d = buffer = ret.mutableData();
978 while (length--) {
979 if (((c = *str++) == '\015') && (*str == '\012') && length > 0) {
980 *d++ = '\015';
981 *d++ = *str++;
982 length--;
983 lp = 0;
984 } else {
985 if (iscntrl (c) || (c == 0x7f) || (c & 0x80) ||
986 (c == '=') || ((c == ' ') && (*str == '\015'))) {
987 if ((((lp+= 3) > PHP_QPRINT_MAXL) && (c <= 0x7f))
988 || ((c > 0x7f) && (c <= 0xdf) && ((lp + 3) > PHP_QPRINT_MAXL))
989 || ((c > 0xdf) && (c <= 0xef) && ((lp + 6) > PHP_QPRINT_MAXL))
990 || ((c > 0xef) && (c <= 0xf4) && ((lp + 9) > PHP_QPRINT_MAXL))) {
991 *d++ = '=';
992 *d++ = '\015';
993 *d++ = '\012';
994 lp = 3;
996 *d++ = '=';
997 *d++ = hex[c >> 4];
998 *d++ = hex[c & 0xf];
999 } else {
1000 if ((++lp) > PHP_QPRINT_MAXL) {
1001 *d++ = '=';
1002 *d++ = '\015';
1003 *d++ = '\012';
1004 lp = 1;
1006 *d++ = c;
1010 len = d - buffer;
1012 ret.setSize(len);
1013 return ret;
1016 String string_quoted_printable_decode(const char *input, int len, bool is_q) {
1017 assertx(input);
1018 if (len == 0) {
1019 return String();
1022 int i = 0, j = 0, k;
1023 const char *str_in = input;
1024 String ret(len, ReserveString);
1025 char *str_out = ret.mutableData();
1026 while (i < len && str_in[i]) {
1027 switch (str_in[i]) {
1028 case '=':
1029 if (i + 2 < len && str_in[i + 1] && str_in[i + 2] &&
1030 isxdigit((int) str_in[i + 1]) && isxdigit((int) str_in[i + 2]))
1032 str_out[j++] = (string_hex2int((int) str_in[i + 1]) << 4)
1033 + string_hex2int((int) str_in[i + 2]);
1034 i += 3;
1035 } else /* check for soft line break according to RFC 2045*/ {
1036 k = 1;
1037 while (str_in[i + k] &&
1038 ((str_in[i + k] == 32) || (str_in[i + k] == 9))) {
1039 /* Possibly, skip spaces/tabs at the end of line */
1040 k++;
1042 if (!str_in[i + k]) {
1043 /* End of line reached */
1044 i += k;
1046 else if ((str_in[i + k] == 13) && (str_in[i + k + 1] == 10)) {
1047 /* CRLF */
1048 i += k + 2;
1050 else if ((str_in[i + k] == 13) || (str_in[i + k] == 10)) {
1051 /* CR or LF */
1052 i += k + 1;
1054 else {
1055 str_out[j++] = str_in[i++];
1058 break;
1059 case '_':
1060 if (is_q) {
1061 str_out[j++] = ' ';
1062 i++;
1063 } else {
1064 str_out[j++] = str_in[i++];
1066 break;
1067 default:
1068 str_out[j++] = str_in[i++];
1071 ret.setSize(j);
1072 return ret;
1075 Variant string_base_to_numeric(const char *s, int len, int base) {
1076 int64_t num = 0;
1077 double fnum = 0;
1078 int mode = 0;
1079 int64_t cutoff;
1080 int cutlim;
1082 assertx(string_validate_base(base));
1084 cutoff = LONG_MAX / base;
1085 cutlim = LONG_MAX % base;
1087 for (int i = len; i > 0; i--) {
1088 char c = *s++;
1090 /* might not work for EBCDIC */
1091 if (c >= '0' && c <= '9')
1092 c -= '0';
1093 else if (c >= 'A' && c <= 'Z')
1094 c -= 'A' - 10;
1095 else if (c >= 'a' && c <= 'z')
1096 c -= 'a' - 10;
1097 else
1098 continue;
1100 if (c >= base)
1101 continue;
1103 switch (mode) {
1104 case 0: /* Integer */
1105 if (num < cutoff || (num == cutoff && c <= cutlim)) {
1106 num = num * base + c;
1107 break;
1108 } else {
1109 fnum = num;
1110 mode = 1;
1112 /* fall-through */
1113 case 1: /* Float */
1114 fnum = fnum * base + c;
1118 if (mode == 1) {
1119 return fnum;
1121 return num;
1124 String string_long_to_base(unsigned long value, int base) {
1125 static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1126 char buf[(sizeof(unsigned long) << 3) + 1];
1127 char *ptr, *end;
1129 assertx(string_validate_base(base));
1131 end = ptr = buf + sizeof(buf) - 1;
1133 do {
1134 *--ptr = digits[value % base];
1135 value /= base;
1136 } while (ptr > buf && value);
1138 return String(ptr, end - ptr, CopyString);
1141 String string_numeric_to_base(const Variant& value, int base) {
1142 static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1144 assertx(string_validate_base(base));
1145 if ((!value.isInteger() && !value.isDouble())) {
1146 return empty_string();
1149 if (value.isDouble()) {
1150 double fvalue = floor(value.toDouble()); /* floor it just in case */
1151 char *ptr, *end;
1152 char buf[(sizeof(double) << 3) + 1];
1154 /* Don't try to convert +/- infinity */
1155 if (fvalue == HUGE_VAL || fvalue == -HUGE_VAL) {
1156 raise_warning("Number too large");
1157 return empty_string();
1160 end = ptr = buf + sizeof(buf) - 1;
1162 do {
1163 *--ptr = digits[(int) fmod(fvalue, base)];
1164 fvalue /= base;
1165 } while (ptr > buf && fabs(fvalue) >= 1);
1167 return String(ptr, end - ptr, CopyString);
1170 return string_long_to_base(value.toInt64(), base);
1173 ///////////////////////////////////////////////////////////////////////////////
1174 // uuencode
/*
 * uuencode helpers.
 *
 * PHP_UU_ENC(c)     6-bit value -> printable character; 0 maps to '`'.
 * PHP_UU_ENC_C2(c)  second output character of the 3-byte group at pointer c
 *                   (low 2 bits of byte 0, high 4 bits of byte 1).
 * PHP_UU_ENC_C3(c)  third output character of the 3-byte group at pointer c
 *                   (low 4 bits of byte 1, high 2 bits of byte 2).
 * PHP_UU_DEC(c)     printable character -> 6-bit value.
 *
 * Every use of a macro argument is parenthesized so that any pointer
 * expression can be passed. PHP_UU_ENC evaluates its argument more than
 * once, so arguments must be free of side effects.
 */
#define PHP_UU_ENC(c) \
  ((c) ? ((c) & 077) + ' ' : '`')
#define PHP_UU_ENC_C2(c) \
  PHP_UU_ENC(((*(c) << 4) & 060) | ((*((c) + 1) >> 4) & 017))
#define PHP_UU_ENC_C3(c) \
  PHP_UU_ENC(((*((c) + 1) << 2) & 074) | ((*((c) + 2) >> 6) & 03))
#define PHP_UU_DEC(c) \
  (((c) - ' ') & 077)
1185 String string_uuencode(const char *src, int src_len) {
1186 assertx(src);
1187 assertx(src_len);
1189 int len = 45;
1190 char *p;
1191 const char *s, *e, *ee;
1192 char *dest;
1194 /* encoded length is ~ 38% greater than the original */
1195 String ret((int)ceil(src_len * 1.38) + 45, ReserveString);
1196 p = dest = ret.mutableData();
1197 s = src;
1198 e = src + src_len;
1200 while ((s + 3) < e) {
1201 ee = s + len;
1202 if (ee > e) {
1203 ee = e;
1204 len = ee - s;
1205 if (len % 3) {
1206 ee = s + (int) (floor(len / 3) * 3);
1209 *p++ = PHP_UU_ENC(len);
1211 while (s < ee) {
1212 *p++ = PHP_UU_ENC(*s >> 2);
1213 *p++ = PHP_UU_ENC_C2(s);
1214 *p++ = PHP_UU_ENC_C3(s);
1215 *p++ = PHP_UU_ENC(*(s + 2) & 077);
1217 s += 3;
1220 if (len == 45) {
1221 *p++ = '\n';
1225 if (s < e) {
1226 if (len == 45) {
1227 *p++ = PHP_UU_ENC(e - s);
1228 len = 0;
1231 *p++ = PHP_UU_ENC(*s >> 2);
1232 *p++ = PHP_UU_ENC_C2(s);
1233 *p++ = ((e - s) > 1) ? PHP_UU_ENC_C3(s) : PHP_UU_ENC('\0');
1234 *p++ = ((e - s) > 2) ? PHP_UU_ENC(*(s + 2) & 077) : PHP_UU_ENC('\0');
1237 if (len < 45) {
1238 *p++ = '\n';
1241 *p++ = PHP_UU_ENC('\0');
1242 *p++ = '\n';
1243 *p = '\0';
1245 ret.setSize(p - dest);
1246 return ret;
1249 String string_uudecode(const char *src, int src_len) {
1250 int total_len = 0;
1251 int len;
1252 const char *s, *e, *ee;
1253 char *p, *dest;
1255 String ret(ceil(src_len * 0.75), ReserveString);
1256 p = dest = ret.mutableData();
1257 s = src;
1258 e = src + src_len;
1260 while (s < e) {
1261 if ((len = PHP_UU_DEC(*s++)) <= 0) {
1262 break;
1264 /* sanity check */
1265 if (len > src_len) {
1266 goto err;
1269 total_len += len;
1271 ee = s + (len == 45 ? 60 : (int) floor(len * 1.33));
1272 /* sanity check */
1273 if (ee > e) {
1274 goto err;
1277 while (s < ee) {
1278 if (s + 4 > e) goto err;
1280 *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
1281 *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
1282 *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
1283 s += 4;
1286 if (len < 45) {
1287 break;
1290 /* skip \n */
1291 s++;
1294 if ((len = total_len > (p - dest))) {
1295 *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
1296 if (len > 1) {
1297 *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
1298 if (len > 2) {
1299 *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
1304 ret.setSize(total_len);
1305 return ret;
1307 err:
1308 return String();
1311 ///////////////////////////////////////////////////////////////////////////////
1312 // base64
1314 namespace {
// Forward alphabet: entry v (0..63) is the base64 character for the 6-bit
// value v. A trailing '\0' terminates the array.
1316 const char base64_table[] = {
1317 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
1318 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
1319 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
1320 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
1321 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', '\0'
// Character used to pad the final group of encoded output.
1324 const char base64_pad = '=';
// Reverse lookup, indexed by input byte. Entries 0..63 are decoded 6-bit
// values; negative entries mark bytes outside the alphabet:
//   -1  tab, LF, CR and space (indices 9, 10, 13, 32)
//   -2  every other byte
// php_base64_decode skips -1 entries; a -2 entry is skipped in non-strict
// mode and makes it return -1 in strict mode.
1326 const short base64_reverse_table[256] = {
1327 -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
1328 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1329 -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
1330 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
1331 -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1332 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
1333 -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
1334 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
1335 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1336 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1337 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1338 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1339 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1340 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1341 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1342 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
1345 folly::Optional<int> maxEncodedSize(int length) {
1346 if ((length + 2) < 0 || ((length + 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) {
1347 return folly::none;
1349 return ((length + 2) / 3) * 4;
1352 // outstr must be at least maxEncodedSize(length) bytes
1353 size_t php_base64_encode(const unsigned char *str, int length,
1354 unsigned char* outstr) {
1355 const unsigned char *current = str;
1356 unsigned char *p = outstr;
1358 while (length > 2) { /* keep going until we have less than 24 bits */
1359 *p++ = base64_table[current[0] >> 2];
1360 *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
1361 *p++ = base64_table[((current[1] & 0x0f) << 2) + (current[2] >> 6)];
1362 *p++ = base64_table[current[2] & 0x3f];
1364 current += 3;
1365 length -= 3; /* we just handle 3 octets of data */
1368 /* now deal with the tail end of things */
1369 if (length != 0) {
1370 *p++ = base64_table[current[0] >> 2];
1371 if (length > 1) {
1372 *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
1373 *p++ = base64_table[(current[1] & 0x0f) << 2];
1374 *p++ = base64_pad;
1375 } else {
1376 *p++ = base64_table[(current[0] & 0x03) << 4];
1377 *p++ = base64_pad;
1378 *p++ = base64_pad;
1381 return p - outstr;
// Decodes base64 text from `str` into `outstr`.
//
// Returns the number of completed output bytes, or -1 on invalid input.
// Scanning stops after `length` bytes or at the first NUL byte, whichever
// comes first. Bytes that base64_reverse_table maps to -1 are always skipped;
// bytes mapped to -2 are skipped when `strict` is false and cause a -1 return
// when it is true.
//
1384 // outstr must be at least length bytes
1385 ssize_t php_base64_decode(const char *str, int length, bool strict,
1386 unsigned char* outstr) {
1387 const unsigned char *current = (unsigned char*)str;
// i counts the alphabet characters consumed so far, j is the index of the
// output byte currently being assembled.
1388 int ch, i = 0, j = 0, k;
1389 /* this sucks for threaded environments */
1391 unsigned char* result = outstr;
1393 /* run through the whole string, converting as we go */
// NOTE: *current is fetched before `length` is tested, so the byte right
// after the last counted one is read as well.
1394 while ((ch = *current++) != '\0' && length-- > 0) {
1395 if (ch == base64_pad) {
// A '=' not followed by another '=' is rejected when it appears after a
// single symbol of a group, or (strict mode) when more input follows; in the
// latter case trailing whitespace up to the NUL is tolerated.
1396 if (*current != '=' && ((i % 4) == 1 || (strict && length > 0))) {
1397 if ((i % 4) != 1) {
1398 while (isspace(*(++current))) {
1399 continue;
1401 if (*current == '\0') {
1402 continue;
1405 return -1;
1407 continue;
1410 ch = base64_reverse_table[ch];
1411 if ((!strict && ch < 0) || ch == -1) {
1412 /* a space or some other separator character, we simply skip over */
1413 continue;
1414 } else if (ch == -2) {
1415 return -1;
// Merge the 6-bit value into the output; each symbol after the first of a
// group completes one output byte.
1418 switch(i % 4) {
1419 case 0:
1420 result[j] = ch << 2;
1421 break;
1422 case 1:
1423 result[j++] |= ch >> 4;
1424 result[j] = (ch & 0x0f) << 4;
1425 break;
1426 case 2:
1427 result[j++] |= ch >>2;
1428 result[j] = (ch & 0x03) << 6;
1429 break;
1430 case 3:
1431 result[j++] |= ch;
1432 break;
1434 i++;
1437 k = j;
1438 /* mop things up if we ended on a boundary */
// At this point ch holds the last byte fetched by the loop condition (the
// one that ended the loop), not a decoded 6-bit value.
1439 if (ch == base64_pad) {
1440 switch(i % 4) {
1441 case 1:
1442 return -1;
1443 case 2:
1444 k++;
// case 2 falls through to case 3
1445 case 3:
1446 result[k] = 0;
1449 return j;
1454 String string_base64_encode(const char* input, int len) {
1455 if (auto const wantedSize = maxEncodedSize(len)) {
1456 String ret(*wantedSize, ReserveString);
1457 auto actualSize = php_base64_encode((unsigned char*)input, len,
1458 (unsigned char*)ret.mutableData());
1459 ret.setSize(actualSize);
1460 return ret;
1462 return String();
1465 String string_base64_decode(const char* input, int len, bool strict) {
1466 String ret(len, ReserveString);
1467 auto actualSize = php_base64_decode(input, len, strict,
1468 (unsigned char*)ret.mutableData());
1469 if (actualSize < 0) return String();
1471 ret.setSize(actualSize);
1472 return ret;
1475 std::string base64_encode(const char* input, int len) {
1476 if (auto const wantedSize = maxEncodedSize(len)) {
1477 std::string ret;
1478 ret.resize(*wantedSize);
1479 auto actualSize = php_base64_encode((unsigned char*)input, len,
1480 (unsigned char*)ret.data());
1481 ret.resize(actualSize);
1482 return ret;
1484 return std::string();
1487 std::string base64_decode(const char* input, int len, bool strict) {
1488 if (!len) return std::string();
1489 std::string ret;
1490 ret.resize(len);
1491 auto actualSize = php_base64_decode(input, len, strict,
1492 (unsigned char*)ret.data());
1493 if (!actualSize) return std::string();
1495 ret.resize(actualSize);
1496 return ret;
1499 ///////////////////////////////////////////////////////////////////////////////
1501 String string_escape_shell_arg(const char *str) {
1502 int x, y, l;
1503 char *cmd;
1505 y = 0;
1506 l = strlen(str);
1508 String ret(safe_address(l, 4, 3), ReserveString); /* worst case */
1509 cmd = ret.mutableData();
1511 #ifdef _MSC_VER
1512 cmd[y++] = '"';
1513 #else
1514 cmd[y++] = '\'';
1515 #endif
1517 for (x = 0; x < l; x++) {
1518 switch (str[x]) {
1519 #ifdef _MSC_VER
1520 case '"':
1521 case '%':
1522 case '!':
1523 cmd[y++] = ' ';
1524 break;
1525 #else
1526 case '\'':
1527 cmd[y++] = '\'';
1528 cmd[y++] = '\\';
1529 cmd[y++] = '\'';
1530 #endif
1531 /* fall-through */
1532 default:
1533 cmd[y++] = str[x];
1536 #ifdef _MSC_VER
1537 if (y > 0 && '\\' == cmd[y - 1]) {
1538 int k = 0, n = y - 1;
1539 for (; n >= 0 && '\\' == cmd[n]; n--, k++);
1540 if (k % 2) {
1541 cmd[y++] = '\\';
1545 cmd[y++] = '"';
1546 #else
1547 cmd[y++] = '\'';
1548 #endif
1549 ret.setSize(y);
1550 return ret;
1553 String string_escape_shell_cmd(const char *str) {
1554 register int x, y, l;
1555 char *cmd;
1556 char *p = nullptr;
1558 l = strlen(str);
1559 String ret(safe_address(l, 2, 1), ReserveString);
1560 cmd = ret.mutableData();
1562 for (x = 0, y = 0; x < l; x++) {
1563 switch (str[x]) {
1564 #ifndef _MSC_VER
1565 case '"':
1566 case '\'':
1567 if (!p && (p = (char *)memchr(str + x + 1, str[x], l - x - 1))) {
1568 /* noop */
1569 } else if (p && *p == str[x]) {
1570 p = nullptr;
1571 } else {
1572 cmd[y++] = '\\';
1574 cmd[y++] = str[x];
1575 break;
1576 #else
1577 /* % is Windows specific for environmental variables, ^%PATH% will
1578 output PATH while ^%PATH^% will not. escapeshellcmd->val will
1579 escape all % and !.
1581 case '%':
1582 case '!':
1583 case '"':
1584 case '\'':
1585 #endif
1586 case '#': /* This is character-set independent */
1587 case '&':
1588 case ';':
1589 case '`':
1590 case '|':
1591 case '*':
1592 case '?':
1593 case '~':
1594 case '<':
1595 case '>':
1596 case '^':
1597 case '(':
1598 case ')':
1599 case '[':
1600 case ']':
1601 case '{':
1602 case '}':
1603 case '$':
1604 case '\\':
1605 case '\x0A': /* excluding these two */
1606 case '\xFF':
1607 #ifdef _MSC_VER
1608 cmd[y++] = '^';
1609 #else
1610 cmd[y++] = '\\';
1611 #endif
1612 /* fall-through */
1613 default:
1614 cmd[y++] = str[x];
1617 ret.setSize(y);
1618 return ret;
1621 ///////////////////////////////////////////////////////////////////////////////
/*
 * Finds the longest run of bytes common to txt1[0..len1) and txt2[0..len2).
 *
 * *max receives the run length (0 when nothing matches). *pos1 and *pos2
 * receive the start offsets of the first longest run found and are left
 * untouched when *max is 0.
 */
static void string_similar_str(const char *txt1, int len1,
                               const char *txt2, int len2,
                               int *pos1, int *pos2, int *max) {
  *max = 0;
  for (int i = 0; i < len1; i++) {
    for (int j = 0; j < len2; j++) {
      /* length of the match that starts at txt1[i] / txt2[j] */
      int run = 0;
      while (i + run < len1 && j + run < len2 &&
             txt1[i + run] == txt2[j + run]) {
        run++;
      }
      if (run > *max) {
        *max = run;
        *pos1 = i;
        *pos2 = j;
      }
    }
  }
}
1644 static int string_similar_char(const char *txt1, int len1,
1645 const char *txt2, int len2) {
1646 int sum;
1647 int pos1 = 0, pos2 = 0, max;
1649 string_similar_str(txt1, len1, txt2, len2, &pos1, &pos2, &max);
1650 if ((sum = max)) {
1651 if (pos1 && pos2) {
1652 sum += string_similar_char(txt1, pos1, txt2, pos2);
1654 if ((pos1 + max < len1) && (pos2 + max < len2)) {
1655 sum += string_similar_char(txt1 + pos1 + max, len1 - pos1 - max,
1656 txt2 + pos2 + max, len2 - pos2 - max);
1660 return sum;
1663 int string_similar_text(const char *t1, int len1,
1664 const char *t2, int len2, float *percent) {
1665 if (len1 == 0 && len2 == 0) {
1666 if (percent) *percent = 0.0;
1667 return 0;
1670 int sim = string_similar_char(t1, len1, t2, len2);
1671 if (percent) *percent = sim * 200.0 / (len1 + len2);
1672 return sim;
1675 ///////////////////////////////////////////////////////////////////////////////
1677 #define LEVENSHTEIN_MAX_LENTH 255
1679 // reference implementation, only optimized for memory usage, not speed
1680 int string_levenshtein(const char *s1, int l1, const char *s2, int l2,
1681 int cost_ins, int cost_rep, int cost_del ) {
1682 int *p1, *p2, *tmp;
1683 int i1, i2, c0, c1, c2;
1685 if (l1==0) return l2*cost_ins;
1686 if (l2==0) return l1*cost_del;
1688 if ((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) {
1689 raise_warning("levenshtein(): Argument string(s) too long");
1690 return -1;
1693 p1 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
1694 SCOPE_EXIT { req::free(p1); };
1695 p2 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
1696 SCOPE_EXIT { req::free(p2); };
1698 for(i2=0;i2<=l2;i2++) {
1699 p1[i2] = i2*cost_ins;
1702 for(i1=0;i1<l1;i1++) {
1703 p2[0]=p1[0]+cost_del;
1704 for(i2=0;i2<l2;i2++) {
1705 c0=p1[i2]+((s1[i1]==s2[i2])?0:cost_rep);
1706 c1=p1[i2+1]+cost_del; if (c1<c0) c0=c1;
1707 c2=p2[i2]+cost_ins; if (c2<c0) c0=c2;
1708 p2[i2+1]=c0;
1710 tmp=p1; p1=p2; p2=tmp;
1713 c0=p1[l2];
1714 return c0;
1717 ///////////////////////////////////////////////////////////////////////////////
1719 String string_money_format(const char *format, double value) {
1720 bool check = false;
1721 const char *p = format;
1722 while ((p = strchr(p, '%'))) {
1723 if (*(p + 1) == '%') {
1724 p += 2;
1725 } else if (!check) {
1726 check = true;
1727 p++;
1728 } else {
1729 throw_invalid_argument
1730 ("format: Only a single %%i or %%n token can be used");
1731 return String();
1735 int format_len = strlen(format);
1736 int str_len = safe_address(format_len, 1, 1024);
1737 String ret(str_len, ReserveString);
1738 char *str = ret.mutableData();
1739 if ((str_len = strfmon(str, str_len, format, value)) < 0) {
1740 return String();
1742 ret.setSize(str_len);
1743 return ret;
1746 ///////////////////////////////////////////////////////////////////////////////
// Formats `d` with `dec` digits after the decimal point (a negative `dec` is
// treated as 0). `thousand_sep` is inserted between groups of three integral
// digits and `dec_point` in front of the fraction; either may be empty.
// The value is rounded with php_math_round() and printed with snprintf()
// first; the result buffer is then filled from its last byte backwards.
1748 String string_number_format(double d, int dec,
1749 const String& dec_point,
1750 const String& thousand_sep) {
1751 char *tmpbuf = nullptr, *resbuf;
1752 char *s, *t; /* source, target */
1753 char *dp;
1754 int integral;
1755 int tmplen, reslen=0;
1756 int count=0;
1757 int is_negative=0;
// Work on the magnitude; the '-' is written back at the very end.
1759 if (d < 0) {
1760 is_negative = 1;
1761 d = -d;
1764 if (dec < 0) dec = 0;
1765 d = php_math_round(d, dec);
1767 // departure from PHP: we got rid of dependencies on spprintf() here.
1768 String tmpstr(63, ReserveString);
1769 tmpbuf = tmpstr.mutableData();
1770 tmplen = snprintf(tmpbuf, 64, "%.*F", dec, d);
// NOTE(review): output that does not start with a digit is returned as-is.
// The sign was already stripped above, so this path never carries a leading
// '-'; confirm that is intended.
1771 if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
1772 tmpstr.setSize(tmplen);
1773 return tmpstr;
1775 if (tmplen >= 64) {
1776 // Uncommon, asked for more than 64 chars worth of precision
1777 tmpstr = String(tmplen, ReserveString);
1778 tmpbuf = tmpstr.mutableData();
1779 tmplen = snprintf(tmpbuf, tmplen + 1, "%.*F", dec, d);
1780 if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
1781 tmpstr.setSize(tmplen);
1782 return tmpstr;
1786 /* find decimal point, if expected */
1787 if (dec) {
1788 dp = strpbrk(tmpbuf, ".,");
1789 } else {
1790 dp = nullptr;
1793 /* calculate the length of the return buffer */
1794 if (dp) {
1795 integral = dp - tmpbuf;
1796 } else {
1797 /* no decimal point was found */
1798 integral = tmplen;
1801 /* allow for thousand separators */
1802 if (!thousand_sep.empty()) {
1803 if (integral + thousand_sep.size() * ((integral-1) / 3) < integral) {
1804 /* overflow */
1805 raise_error("String overflow");
1808 integral += ((integral-1) / 3) * thousand_sep.size();
1811 reslen = integral;
1813 if (dec) {
1814 reslen += dec;
1816 if (!dec_point.empty()) {
1817 if (reslen + dec_point.size() < dec_point.size()) {
1818 /* overflow */
1819 raise_error("String overflow");
1821 reslen += dec_point.size();
1825 /* add a byte for minus sign */
1826 if (is_negative) {
1827 reslen++;
1829 String resstr(reslen, ReserveString);
1830 resbuf = resstr.mutableData();
// s walks the snprintf output and t the result buffer, both starting at the
// last byte and moving towards the front.
1832 s = tmpbuf+tmplen-1;
1833 t = resbuf+reslen-1;
1835 /* copy the decimal places.
1836 * Take care, as the sprintf implementation may return less places than
1837 * we requested due to internal buffer limitations */
1838 if (dec) {
1839 int declen = dp ? s - dp : 0;
1840 int topad = dec > declen ? dec - declen : 0;
1842 /* pad with '0's */
1843 while (topad--) {
1844 *t-- = '0';
1847 if (dp) {
1848 s -= declen + 1; /* +1 to skip the point */
1849 t -= declen;
1851 /* now copy the chars after the point */
1852 memcpy(t + 1, dp + 1, declen);
1855 /* add decimal point */
1856 if (!dec_point.empty()) {
1857 memcpy(t + (1 - dec_point.size()), dec_point.data(), dec_point.size());
1858 t -= dec_point.size();
1862 /* copy the numbers before the decimal point, adding thousand
1863 * separator every three digits */
1864 while(s >= tmpbuf) {
1865 *t-- = *s--;
// NOTE(review): this tests `thousand_sep` through String's boolean conversion,
// while the length computation above used !thousand_sep.empty(); confirm the
// two conditions agree for every input.
1866 if (thousand_sep && (++count%3)==0 && s>=tmpbuf) {
1867 memcpy(t + (1 - thousand_sep.size()),
1868 thousand_sep.data(),
1869 thousand_sep.size());
1870 t -= thousand_sep.size();
1874 /* and a minus sign, if needed */
1875 if (is_negative) {
1876 *t-- = '-';
1879 resstr.setSize(reslen);
1880 return resstr;
1883 ///////////////////////////////////////////////////////////////////////////////
1884 // soundex
1886 /* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */
1887 String string_soundex(const String& str) {
1888 assertx(!str.empty());
1889 int _small, code, last;
1890 String retString(4, ReserveString);
1891 char* soundex = retString.mutableData();
1893 static char soundex_table[26] = {
1894 0, /* A */
1895 '1', /* B */
1896 '2', /* C */
1897 '3', /* D */
1898 0, /* E */
1899 '1', /* F */
1900 '2', /* G */
1901 0, /* H */
1902 0, /* I */
1903 '2', /* J */
1904 '2', /* K */
1905 '4', /* L */
1906 '5', /* M */
1907 '5', /* N */
1908 0, /* O */
1909 '1', /* P */
1910 '2', /* Q */
1911 '6', /* R */
1912 '2', /* S */
1913 '3', /* T */
1914 0, /* U */
1915 '1', /* V */
1916 0, /* W */
1917 '2', /* X */
1918 0, /* Y */
1919 '2' /* Z */
1922 /* build soundex string */
1923 last = -1;
1924 auto p = str.slice().data();
1925 for (_small = 0; *p && _small < 4; p++) {
1926 /* convert chars to upper case and strip non-letter chars */
1927 /* BUG: should also map here accented letters used in non */
1928 /* English words or names (also found in English text!): */
1929 /* esstsett, thorn, n-tilde, c-cedilla, s-caron, ... */
1930 code = toupper((int)(unsigned char)(*p));
1931 if (code >= 'A' && code <= 'Z') {
1932 if (_small == 0) {
1933 /* remember first valid char */
1934 soundex[_small++] = code;
1935 last = soundex_table[code - 'A'];
1936 } else {
1937 /* ignore sequences of consonants with same soundex */
1938 /* code in trail, and vowels unless they separate */
1939 /* consonant letters */
1940 code = soundex_table[code - 'A'];
1941 if (code != last) {
1942 if (code != 0) {
1943 soundex[_small++] = code;
1945 last = code;
1950 /* pad with '0' and terminate with 0 ;-) */
1951 while (_small < 4) {
1952 soundex[_small++] = '0';
1954 retString.setSize(4);
1955 return retString;
1958 ///////////////////////////////////////////////////////////////////////////////
1959 // metaphone
/*
 * This is now the original code by Michael G Schwern:
 * I've changed it just slightly (use emalloc,
 * get rid of includes etc.)
 *      - thies - 13.09.1999
 */
/*----------------------------- */
/* this used to be "metaphone.h" */
/*----------------------------- */

/* Special encodings: the single characters that stand for 'sh' and 'th' */
#define  SH   'X'
#define  TH   '0'

/*----------------------------- */
/* end of "metaphone.h"         */
/*----------------------------- */

/*----------------------------- */
/* this used to be "metachar.h" */
/*----------------------------- */

/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book... */
/* One bit-flag byte per letter 'A'..'Z'; the bits are tested by the macros
 * below (1, 2, 4, 8, 16). */
char _codes[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};

/* Flag byte for character c, or 0 when c is not a letter A-Z / a-z.
 * c is converted to unsigned char before it reaches <ctype.h>, and the
 * upper-cased value is range-checked so that _codes is never indexed outside
 * its 26 entries. c is evaluated more than once: no side effects please. */
#define ENCODE(c)                                                  \
  ((toupper((unsigned char)(c)) >= 'A' &&                          \
    toupper((unsigned char)(c)) <= 'Z')                            \
     ? _codes[toupper((unsigned char)(c)) - 'A']                   \
     : 0)

#define isvowel(c)  (ENCODE(c) & 1)   /* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)   /* FJLMNR */

/* These form dipthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)   /* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)   /* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)  /* BDH */

/*----------------------------- */
/* end of "metachar.h"          */
/*----------------------------- */
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* All of these expect `word` (the input bytes) and `w_idx` (the current
 * index into it) to be in scope at the point of use. */

/* Look at the next letter in the word */
#define Next_Letter ((char)toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter ((char)toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would be before the start. */
#define Look_Back_Letter(n) \
  (w_idx >= (n) ? (char)toupper(word[w_idx - (n)]) : '\0')
/* Previous letter, or '\0' at the start of the word. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) \
                                               : '\0')
/* Look N letters down, stopping at the end of the string. */
#define Look_Ahead_Letter(n) ((char)toupper(Lookahead(word+w_idx, (n))))
/* Allows us to safely look ahead an arbitrary # of letters:
 * returns the byte `how_far` positions after word[0], or '\0' when the
 * string ends before that. A non-positive `how_far` returns word[0]. */
static char Lookahead(unsigned char *word, int how_far) {
  const unsigned char *cursor = word;
  int steps_left = how_far;

  /* Edge forward, but never past the terminating NUL. */
  while (*cursor != '\0' && steps_left > 0) {
    cursor++;
    steps_left--;
  }

  return (char)*cursor;
}
/* phonize one letter: append it to `buffer`, which must be in scope at the
 * point of use. Written as do { } while (0) so that `Phonize(c);` is a
 * single statement wherever it appears, including un-braced if/else arms. */
#define Phonize(c)  do { buffer.append(c); } while (0)
/* How long is the phoned word? */
#define Phone_Len   (buffer.size())

/* Note if a letter is a 'break' in the word. The argument is converted to
 * unsigned char first, as <ctype.h> requires for bytes >= 0x80. */
#define Isbreak(c)  (!isalpha((unsigned char)(c)))
// Computes the metaphone key of `input`.
//
//   word_len      only used to size the output buffer when max_phonemes is 0;
//                 scanning itself stops at the first NUL byte of `input`.
//   max_phonemes  < 0 returns String(); 0 means no limit; otherwise the main
//                 loop stops once the output has reached that length.
//   traditional   when zero, the extra rules are enabled: CH -> K before R or
//                 after S, and SCHW -> 'sh'.
//
// The first letter is handled by its own switch, the rest by the main loop.
// All letter access goes through the Curr_Letter / Next_Letter / ... macros,
// which read the locals `word` and `w_idx`; Phonize() appends to `buffer`.
2050 String string_metaphone(const char *input, int word_len, long max_phonemes,
2051 int traditional) {
2052 unsigned char *word = (unsigned char *)input;
2054 int w_idx = 0; /* point in the phonization we're at. */
2055 int max_buffer_len = 0; /* maximum length of the destination buffer */
2057 /*-- Parameter checks --*/
2058 /* Negative phoneme length is meaningless */
2060 if (max_phonemes < 0)
2061 return String();
2063 /* Empty/null string is meaningless */
2064 /* Overly paranoid */
2065 /* always_assert(word != NULL && word[0] != '\0'); */
2067 if (word == nullptr)
2068 return String();
2070 /*-- Allocate memory for our phoned_phrase --*/
2071 if (max_phonemes == 0) { /* Assume largest possible */
2072 max_buffer_len = word_len;
2073 } else {
2074 max_buffer_len = max_phonemes;
2076 StringBuffer buffer(max_buffer_len);
2078 /*-- The first phoneme has to be processed specially. --*/
2079 /* Find our first letter */
// NOTE(review): Curr_Letter is cast to char, so bytes >= 0x80 may reach
// isalpha() as negative values; confirm this is acceptable on all targets.
2080 for (; !isalpha(Curr_Letter); w_idx++) {
2081 /* On the off chance we were given nothing but crap... */
2082 if (Curr_Letter == '\0') {
2083 return buffer.detach(); /* For testing */
2087 switch (Curr_Letter) {
2088 /* AE becomes E */
2089 case 'A':
2090 if (Next_Letter == 'E') {
2091 Phonize('E');
2092 w_idx += 2;
2094 /* Remember, preserve vowels at the beginning */
2095 else {
2096 Phonize('A');
2097 w_idx++;
2099 break;
2100 /* [GKP]N becomes N */
2101 case 'G':
2102 case 'K':
2103 case 'P':
2104 if (Next_Letter == 'N') {
2105 Phonize('N');
2106 w_idx += 2;
2108 break;
2109 /* WH becomes H,
2110 WR becomes R
2111 W if followed by a vowel */
2112 case 'W':
2113 if (Next_Letter == 'H' ||
2114 Next_Letter == 'R') {
2115 Phonize(Next_Letter);
2116 w_idx += 2;
2117 } else if (isvowel(Next_Letter)) {
2118 Phonize('W');
2119 w_idx += 2;
2121 /* else ignore */
2122 break;
2123 /* X becomes S */
2124 case 'X':
2125 Phonize('S');
2126 w_idx++;
2127 break;
2128 /* Vowels are kept */
2129 /* We did A already
2130 case 'A':
2131 case 'a':
2133 case 'E':
2134 case 'I':
2135 case 'O':
2136 case 'U':
2137 Phonize(Curr_Letter);
2138 w_idx++;
2139 break;
2140 default:
2141 /* do nothing */
2142 break;
2145 /* On to the metaphoning */
2146 for (; Curr_Letter != '\0' &&
2147 (max_phonemes == 0 || Phone_Len < max_phonemes);
2148 w_idx++) {
2149 /* How many letters to skip because an eariler encoding handled
2150 * multiple letters */
2151 unsigned short int skip_letter = 0;
2154 /* THOUGHT: It would be nice if, rather than having things like...
2155 * well, SCI. For SCI you encode the S, then have to remember
2156 * to skip the C. So the phonome SCI invades both S and C. It would
2157 * be better, IMHO, to skip the C from the S part of the encoding.
2158 * Hell, I'm trying it.
2161 /* Ignore non-alphas */
2162 if (!isalpha(Curr_Letter))
2163 continue;
2165 /* Drop duplicates, except CC */
2166 if (Curr_Letter == Prev_Letter &&
2167 Curr_Letter != 'C')
2168 continue;
2170 switch (Curr_Letter) {
2171 /* B -> B unless in MB */
2172 case 'B':
2173 if (Prev_Letter != 'M')
2174 Phonize('B');
2175 break;
2176 /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
2177 * (SCHW is handled in S)
2178 * S if -CI-, -CE- or -CY-
2179 * dropped if -SCI-, SCE-, -SCY- (handed in S)
2180 * else K
2182 case 'C':
2183 if (MAKESOFT(Next_Letter)) { /* C[IEY] */
2184 if (After_Next_Letter == 'A' &&
2185 Next_Letter == 'I') { /* CIA */
2186 Phonize(SH);
2188 /* SC[IEY] */
2189 else if (Prev_Letter == 'S') {
2190 /* Dropped */
2191 } else {
2192 Phonize('S');
2194 } else if (Next_Letter == 'H') {
2195 if ((!traditional) && (After_Next_Letter == 'R' ||
2196 Prev_Letter == 'S')) { /* Christ, School */
2197 Phonize('K');
2198 } else {
2199 Phonize(SH);
2201 skip_letter++;
2202 } else {
2203 Phonize('K');
2205 break;
2206 /* J if in -DGE-, -DGI- or -DGY-
2207 * else T
2209 case 'D':
2210 if (Next_Letter == 'G' && MAKESOFT(After_Next_Letter)) {
2211 Phonize('J');
2212 skip_letter++;
2213 } else
2214 Phonize('T');
2215 break;
2216 /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
2217 * else dropped if -GNED, -GN,
2218 * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
2219 * else J if in -GE-, -GI, -GY and not GG
2220 * else K
2222 case 'G':
2223 if (Next_Letter == 'H') {
2224 if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) {
2225 Phonize('F');
2226 skip_letter++;
2227 } else {
2228 /* silent */
2230 } else if (Next_Letter == 'N') {
2231 if (Isbreak(After_Next_Letter) ||
2232 (After_Next_Letter == 'E' && Look_Ahead_Letter(3) == 'D')) {
2233 /* dropped */
2234 } else
2235 Phonize('K');
2236 } else if (MAKESOFT(Next_Letter) && Prev_Letter != 'G') {
2237 Phonize('J');
2238 } else {
2239 Phonize('K');
2241 break;
2242 /* H if before a vowel and not after C,G,P,S,T */
2243 case 'H':
2244 if (isvowel(Next_Letter) && !AFFECTH(Prev_Letter))
2245 Phonize('H');
2246 break;
2247 /* dropped if after C
2248 * else K
2250 case 'K':
2251 if (Prev_Letter != 'C')
2252 Phonize('K');
2253 break;
2254 /* F if before H
2255 * else P
2257 case 'P':
2258 if (Next_Letter == 'H') {
2259 Phonize('F');
2260 } else {
2261 Phonize('P');
2263 break;
2264 /* K
2266 case 'Q':
2267 Phonize('K');
2268 break;
2269 /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
2270 * else S
2272 case 'S':
2273 if (Next_Letter == 'I' &&
2274 (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
2275 Phonize(SH);
2276 } else if (Next_Letter == 'H') {
2277 Phonize(SH);
2278 skip_letter++;
2279 } else if ((!traditional) &&
2280 (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' &&
2281 Look_Ahead_Letter(3) == 'W')) {
2282 Phonize(SH);
2283 skip_letter += 2;
2284 } else {
2285 Phonize('S');
2287 break;
2288 /* 'sh' in -TIA- or -TIO-
2289 * else 'th' before H
2290 * else T
2292 case 'T':
2293 if (Next_Letter == 'I' &&
2294 (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
2295 Phonize(SH);
2296 } else if (Next_Letter == 'H') {
2297 Phonize(TH);
2298 skip_letter++;
2299 } else {
2300 Phonize('T');
2302 break;
2303 /* F */
2304 case 'V':
2305 Phonize('F');
2306 break;
2307 /* W before a vowel, else dropped */
2308 case 'W':
2309 if (isvowel(Next_Letter))
2310 Phonize('W');
2311 break;
2312 /* KS */
2313 case 'X':
2314 Phonize('K');
2315 Phonize('S');
2316 break;
2317 /* Y if followed by a vowel */
2318 case 'Y':
2319 if (isvowel(Next_Letter))
2320 Phonize('Y');
2321 break;
2322 /* S */
2323 case 'Z':
2324 Phonize('S');
2325 break;
2326 /* No transformation */
2327 case 'F':
2328 case 'J':
2329 case 'L':
2330 case 'M':
2331 case 'N':
2332 case 'R':
2333 Phonize(Curr_Letter);
2334 break;
2335 default:
2336 /* nothing */
2337 break;
2338 } /* END SWITCH */
2340 w_idx += skip_letter;
2341 } /* END FOR */
2343 return buffer.detach();
2346 ///////////////////////////////////////////////////////////////////////////////
2347 // Cyrillic
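/*
 * These are the code tables for the different Cyrillic charsets (relative to
 * koi8-r). Only byte values 128-255 are actually remapped; byte values 0-127
 * map to themselves in every table.
 * The first 256 entries are for conversion from koi8-r to the corresponding
 * charset, the second 256 entries are for the reverse conversion, from the
 * charset to koi8-r.
 * NOTE(review): the table contents suggest the two halves are the other way
 * round (e.g. entry 0xC0 in the first half of _cyr_win1251 is 225, the koi8-r
 * code of the letter that windows-1251 stores at 0xC0); confirm against the
 * conversion routine.
 *
 * Here we have the following tables:
 * _cyr_win1251   - for windows-1251 charset
 * _cyr_iso88595  - for iso8859-5 charset
 * _cyr_cp866     - for x-cp866 charset
 * _cyr_mac       - for x-mac-cyrillic charset
 */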
// One charset table: 512 bytes, i.e. two 256-entry byte-to-byte lookup
// halves stored back to back, one half per conversion direction.
2361 typedef unsigned char _cyr_charset_table[512];
// Conversion table between koi8-r and windows-1251. In both 256-entry halves
// the entries for bytes 0-127 map every byte to itself.
2363 static const _cyr_charset_table _cyr_win1251 = {
2364 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2365 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2366 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2367 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2368 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2369 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2370 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2371 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2372 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2373 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2374 154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
2375 46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
2376 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2377 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2378 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2379 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2380 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2381 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2382 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2383 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2384 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2385 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2386 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2387 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2388 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2389 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2390 32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
2391 32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
2392 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2393 239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2394 222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
2395 207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
// Conversion table between koi8-r and x-cp866. In both 256-entry halves the
// entries for bytes 0-127 map every byte to itself.
2398 static const _cyr_charset_table _cyr_cp866 = {
2399 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2400 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2401 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2402 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2403 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2404 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2405 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2406 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2407 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2408 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2409 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2410 35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
2411 43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
2412 45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
2413 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2414 179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
2415 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2416 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2417 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2418 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2419 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2420 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2421 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2422 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2423 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2424 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2425 205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
2426 199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
2427 238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
2428 175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
2429 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2430 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
// Conversion table between koi8-r and iso8859-5. In both 256-entry halves the
// entries for bytes 0-127 map every byte to itself.
2433 static const _cyr_charset_table _cyr_iso88595 = {
2434 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2435 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2436 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2437 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2438 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2439 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2440 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2441 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2442 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2443 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2444 32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2445 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2446 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2447 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2448 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2449 32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2450 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2451 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2452 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2453 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2454 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2455 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2456 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2457 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2458 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2459 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2460 32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
2461 32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
2462 238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
2463 223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
2464 206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
2465 191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
// Conversion table between koi8-r and x-mac-cyrillic. In both 256-entry
// halves the entries for bytes 0-127 map every byte to itself.
2468 static const _cyr_charset_table _cyr_mac = {
2469 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2470 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2471 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2472 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2473 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2474 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2475 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2476 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2477 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2478 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2479 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2480 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2481 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2482 144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
2483 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2484 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
2485 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2486 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2487 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2488 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2489 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2490 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2491 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2492 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2493 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2494 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2495 160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
2496 176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
2497 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2498 239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2499 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2500 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2504 * This function performs the actual conversion of the string between
2505 * charsets. The input is not modified; the result is a new string.
2506 * Parameters:
2507 * input - string to be converted
2508 * from,to - one-symbol label of source and destination charset
2509 * The following symbols are used as labels:
2510 * k - koi8-r
2511 * w - windows-1251
2512 * i - iso8859-5
2513 * a - x-cp866
2514 * d - x-cp866
2515 * m - x-mac-cyrillic
2517 String string_convert_cyrillic_string(const String& input, char from, char to) {
2518 const unsigned char *from_table, *to_table;
2519 unsigned char tmp;
2520 auto uinput = (unsigned char*)input.slice().data();
2521 String retString(input.size(), ReserveString);
2522 unsigned char *str = (unsigned char *)retString.mutableData();
2524 from_table = nullptr;
2525 to_table = nullptr;
2527 switch (toupper((int)(unsigned char)from)) {
2528 case 'W': from_table = _cyr_win1251; break;
2529 case 'A':
2530 case 'D': from_table = _cyr_cp866; break;
2531 case 'I': from_table = _cyr_iso88595; break;
2532 case 'M': from_table = _cyr_mac; break;
2533 case 'K':
2534 break;
2535 default:
2536 throw_invalid_argument("Unknown source charset: %c", from);
2537 break;
2540 switch (toupper((int)(unsigned char)to)) {
2541 case 'W': to_table = _cyr_win1251; break;
2542 case 'A':
2543 case 'D': to_table = _cyr_cp866; break;
2544 case 'I': to_table = _cyr_iso88595; break;
2545 case 'M': to_table = _cyr_mac; break;
2546 case 'K':
2547 break;
2548 default:
2549 throw_invalid_argument("Unknown destination charset: %c", to);
2550 break;
2553 for (int i = 0; i < input.size(); i++) {
2554 tmp = from_table == nullptr ? uinput[i] : from_table[uinput[i]];
2555 str[i] = to_table == nullptr ? tmp : to_table[tmp + 256];
2557 retString.setSize(input.size());
2558 return retString;
2561 ///////////////////////////////////////////////////////////////////////////////
2562 // Hebrew
/* Kinds of run the Hebrew converter alternates between. */
#define HEB_BLOCK_TYPE_ENG 1
#define HEB_BLOCK_TYPE_HEB 2

/*
 * Byte classifiers used by the Hebrew converter. Each one narrows its
 * argument to unsigned char and evaluates to 1 or 0:
 *   isheb      - byte value is in the range 224..250
 *   _isblank   - byte is a space or a tab
 *   _isnewline - byte is '\n' or '\r'
 * The argument is wrapped in parentheses before the cast so that a compound
 * expression is narrowed as a whole rather than only its first operand.
 * The argument is evaluated more than once; do not pass expressions with
 * side effects.
 */
#define isheb(c) \
  (((((unsigned char) (c)) >= 224) && (((unsigned char) (c)) <= 250)) ? 1 : 0)
#define _isblank(c) \
  (((((unsigned char) (c)) == ' ' || ((unsigned char) (c)) == '\t')) ? 1 : 0)
#define _isnewline(c) \
  (((((unsigned char) (c)) == '\n' || ((unsigned char) (c)) == '\r')) ? 1 : 0)
2575 * Converts Logical Hebrew text (Hebrew Windows style) to Visual text
2576 * Cheers/complaints/flames - Zeev Suraski <zeev@php.net>
2578 String
2579 string_convert_hebrew_string(const String& inStr, int /*max_chars_per_line*/,
2580 int convert_newlines) {
2581 assertx(!inStr.empty());
2582 auto str = inStr.data();
2583 auto str_len = inStr.size();
2584 const char *tmp;
2585 char *heb_str, *broken_str;
2586 char *target;
2587 int block_start, block_end, block_type, block_length, i;
2588 long max_chars=0;
2589 int begin, end, char_count, orig_begin;
2591 tmp = str;
2592 block_start=block_end=0;
2594 heb_str = (char *) req::malloc_noptrs(str_len + 1);
2595 SCOPE_EXIT { req::free(heb_str); };
2596 target = heb_str+str_len;
2597 *target = 0;
2598 target--;
2600 block_length=0;
2602 if (isheb(*tmp)) {
2603 block_type = HEB_BLOCK_TYPE_HEB;
2604 } else {
2605 block_type = HEB_BLOCK_TYPE_ENG;
2608 do {
2609 if (block_type == HEB_BLOCK_TYPE_HEB) {
2610 while ((isheb((int)*(tmp+1)) ||
2611 _isblank((int)*(tmp+1)) ||
2612 ispunct((int)*(tmp+1)) ||
2613 (int)*(tmp+1)=='\n' ) && block_end<str_len-1) {
2614 tmp++;
2615 block_end++;
2616 block_length++;
2618 for (i = block_start; i<= block_end; i++) {
2619 *target = str[i];
2620 switch (*target) {
2621 case '(': *target = ')'; break;
2622 case ')': *target = '('; break;
2623 case '[': *target = ']'; break;
2624 case ']': *target = '['; break;
2625 case '{': *target = '}'; break;
2626 case '}': *target = '{'; break;
2627 case '<': *target = '>'; break;
2628 case '>': *target = '<'; break;
2629 case '\\': *target = '/'; break;
2630 case '/': *target = '\\'; break;
2631 default:
2632 break;
2634 target--;
2636 block_type = HEB_BLOCK_TYPE_ENG;
2637 } else {
2638 while (!isheb(*(tmp+1)) &&
2639 (int)*(tmp+1)!='\n' && block_end < str_len-1) {
2640 tmp++;
2641 block_end++;
2642 block_length++;
2644 while ((_isblank((int)*tmp) ||
2645 ispunct((int)*tmp)) && *tmp!='/' &&
2646 *tmp!='-' && block_end > block_start) {
2647 tmp--;
2648 block_end--;
2650 for (i = block_end; i >= block_start; i--) {
2651 *target = str[i];
2652 target--;
2654 block_type = HEB_BLOCK_TYPE_HEB;
2656 block_start=block_end+1;
2657 } while (block_end < str_len-1);
2659 String brokenStr(str_len, ReserveString);
2660 broken_str = brokenStr.mutableData();
2661 begin=end=str_len-1;
2662 target = broken_str;
2664 while (1) {
2665 char_count=0;
2666 while ((!max_chars || char_count < max_chars) && begin > 0) {
2667 char_count++;
2668 begin--;
2669 if (begin <= 0 || _isnewline(heb_str[begin])) {
2670 while (begin > 0 && _isnewline(heb_str[begin-1])) {
2671 begin--;
2672 char_count++;
2674 break;
2677 if (char_count == max_chars) { /* try to avoid breaking words */
2678 int new_char_count=char_count, new_begin=begin;
2680 while (new_char_count > 0) {
2681 if (_isblank(heb_str[new_begin]) || _isnewline(heb_str[new_begin])) {
2682 break;
2684 new_begin++;
2685 new_char_count--;
2687 if (new_char_count > 0) {
2688 char_count=new_char_count;
2689 begin=new_begin;
2692 orig_begin=begin;
2694 if (_isblank(heb_str[begin])) {
2695 heb_str[begin]='\n';
2697 while (begin <= end && _isnewline(heb_str[begin])) {
2698 /* skip leading newlines */
2699 begin++;
2701 for (i = begin; i <= end; i++) { /* copy content */
2702 *target = heb_str[i];
2703 target++;
2705 for (i = orig_begin; i <= end && _isnewline(heb_str[i]); i++) {
2706 *target = heb_str[i];
2707 target++;
2709 begin=orig_begin;
2711 if (begin <= 0) {
2712 *target = 0;
2713 break;
2715 begin--;
2716 end=begin;
2719 if (convert_newlines) {
2720 int count;
2721 auto ret = string_replace(broken_str, str_len, "\n", strlen("\n"),
2722 "<br />\n", strlen("<br />\n"), count, true);
2723 if (!ret.isNull()) {
2724 return ret;
2727 brokenStr.setSize(str_len);
2728 return brokenStr;
2731 ///////////////////////////////////////////////////////////////////////////////