Rename smart:: namespace to req::
[hiphop-php.git] / hphp / runtime / base / zend-string.cpp
blobb96aa0a268834b1f493662c33765406eb8ff72df
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #include "hphp/runtime/base/zend-string.h"
19 #include "hphp/runtime/base/zend-printf.h"
20 #include "hphp/runtime/base/zend-math.h"
22 #include "hphp/util/lock.h"
23 #include "hphp/util/overflow.h"
24 #include <math.h>
25 #include <monetary.h>
27 #include "hphp/util/bstring.h"
28 #include "hphp/runtime/base/exceptions.h"
29 #include "hphp/runtime/base/string-buffer.h"
30 #include "hphp/runtime/base/runtime-error.h"
31 #include "hphp/runtime/base/type-conversions.h"
32 #include "hphp/runtime/base/string-util.h"
33 #include "hphp/runtime/base/builtin-functions.h"
35 #ifdef __APPLE__
36 #ifndef isnan
37 #define isnan(x) \
38 ( sizeof (x) == sizeof(float ) ? __inline_isnanf((float)(x)) \
39 : sizeof (x) == sizeof(double) ? __inline_isnand((double)(x)) \
40 : __inline_isnan ((long double)(x)))
41 #endif
43 #ifndef isinf
44 #define isinf(x) \
45 ( sizeof (x) == sizeof(float ) ? __inline_isinff((float)(x)) \
46 : sizeof (x) == sizeof(double) ? __inline_isinfd((double)(x)) \
47 : __inline_isinf ((long double)(x)))
48 #endif
49 #endif
52 #define PHP_QPRINT_MAXL 75
54 namespace HPHP {
55 ///////////////////////////////////////////////////////////////////////////////
56 // helpers
/**
 * Normalize a substr()-style (offset, length) pair against a string of
 * `len` bytes.
 *
 * A negative `f` counts back from the end of the string; a negative `l`
 * means "stop that many bytes before the end". On success both values are
 * rewritten in place so that `f` is a non-negative start offset and
 * `f + l` does not exceed `len`.
 *
 * @return false for the out-of-range combinations rejected below,
 *         true otherwise.
 */
bool string_substr_check(int len, int &f, int &l) {
  // A negative length may not reach back further than the whole string;
  // a positive one is clamped to the string size.
  if (l < 0) {
    if (-l > len) {
      return false;
    }
  } else if (l > len) {
    l = len;
  }

  // An offset past the end is an error; one too far before the start is
  // pinned to the first byte.
  if (f > len) {
    return false;
  }
  if (f < 0 && -f > len) {
    f = 0;
  }

  // A negative length that would end before the start offset is rejected.
  if (l < 0 && (l + len - f) < 0) {
    return false;
  }

  // Translate an end-relative offset into an absolute one.
  if (f < 0) {
    f += len;
    if (f < 0) {
      f = 0;
    }
  }
  if (f >= len) {
    return false;
  }

  // Translate an end-relative length into an absolute one.
  if (l < 0) {
    l += len - f;
    if (l < 0) {
      l = 0;
    }
  }

  // Finally make sure the window does not run off the end.
  if ((unsigned int)f + (unsigned int)l > (unsigned int)len) {
    l = len - f;
  }
  return true;
}
100 void string_charmask(const char *sinput, int len, char *mask) {
101 const unsigned char *input = (unsigned char *)sinput;
102 const unsigned char *end;
103 unsigned char c;
105 memset(mask, 0, 256);
106 for (end = input+len; input < end; input++) {
107 c=*input;
108 if ((input+3 < end) && input[1] == '.' && input[2] == '.'
109 && input[3] >= c) {
110 memset(mask+c, 1, input[3] - c + 1);
111 input+=3;
112 } else if ((input+1 < end) && input[0] == '.' && input[1] == '.') {
113 /* Error, try to be as helpful as possible:
114 (a range ending/starting with '.' won't be captured here) */
115 if (end-len >= input) { /* there was no 'left' char */
116 throw_invalid_argument
117 ("charlist: Invalid '..'-range, missing left of '..'");
118 continue;
120 if (input+2 >= end) { /* there is no 'right' char */
121 throw_invalid_argument
122 ("charlist: Invalid '..'-range, missing right of '..'");
123 continue;
125 if (input[-1] > input[2]) { /* wrong order */
126 throw_invalid_argument
127 ("charlist: '..'-range needs to be incrementing");
128 continue;
130 /* FIXME: better error (a..b..c is the only left possibility?) */
131 throw_invalid_argument("charlist: Invalid '..'-range");
132 continue;
133 } else {
134 mask[c]=1;
/**
 * strlcpy()-style bounded copy.
 *
 * Copies at most `siz - 1` bytes of the NUL-terminated string `src` into
 * `dst` and always NUL-terminates `dst` when `siz` is positive. A zero or
 * negative `siz` means "no room": nothing is written to `dst`.
 *
 * @return the length of `src` (not counting its NUL), so a result
 *         >= siz tells the caller the copy was truncated.
 */
int string_copy(char *dst, const char *src, int siz) {
  char *d = dst;
  const char *s = src;
  // A negative size would otherwise wrap to a huge size_t and remove the
  // bound on the copy entirely.
  size_t n = siz > 0 ? (size_t)siz : 0;

  /* Copy as many bytes as will fit */
  if (n != 0 && --n != 0) {
    do {
      if ((*d++ = *s++) == 0) {
        break;
      }
    } while (--n != 0);
  }

  /* Not enough room in dst, add NUL and traverse rest of src */
  if (n == 0) {
    if (siz > 0) {
      *d = '\0'; /* NUL-terminate dst */
    }
    while (*s++) {
    }
  }

  return (int)(s - src - 1); /* count does not include NUL */
}
163 ///////////////////////////////////////////////////////////////////////////////
164 // comparisons
/**
 * Compare the first `len` bytes of two buffers.
 *
 * Bytes are compared as unsigned values (like memcmp/strncmp), so the
 * ordering of bytes >= 0x80 does not depend on whether plain `char` is
 * signed on the target platform. Embedded NUL bytes do not stop the scan.
 *
 * @return 1 if s1 sorts after s2, -1 if before, 0 if the ranges are equal.
 */
int string_ncmp(const char *s1, const char *s2, int len) {
  for (int i = 0; i < len; i++) {
    const unsigned char c1 = (unsigned char)s1[i];
    const unsigned char c2 = (unsigned char)s2[i];
    if (c1 > c2) return 1;
    if (c1 < c2) return -1;
  }
  return 0;
}
/**
 * Compare two right-aligned runs of digits starting at *a and *b.
 *
 * The longer run wins. For runs of equal length the first differing digit
 * decides, but that is only known once both runs have been scanned to the
 * end, so the provisional verdict is carried in `bias`.
 *
 * Both cursors are advanced in step and are left where the scan stopped.
 *
 * @return -1, 0 or +1.
 */
static int compare_right(char const **a, char const *aend,
                         char const **b, char const *bend) {
  int bias = 0;

  for (;;) {
    const bool a_digit = *a != aend && isdigit((int)(unsigned char)**a);
    const bool b_digit = *b != bend && isdigit((int)(unsigned char)**b);

    if (!a_digit || !b_digit) {
      if (a_digit == b_digit) {
        // Both runs ended together: the remembered difference decides.
        return bias;
      }
      // Exactly one run continues: the longer one is greater.
      return a_digit ? +1 : -1;
    }

    // Remember only the first difference.
    if (bias == 0 && **a != **b) {
      bias = (**a < **b) ? -1 : +1;
    }

    (*a)++;
    (*b)++;
  }
}
/**
 * Compare two left-aligned runs of digits starting at *a and *b: the first
 * position at which the digits differ decides, and a run that ends first
 * is the smaller one.
 *
 * Both cursors are advanced in step and are left where the scan stopped.
 *
 * @return -1, 0 or +1.
 */
static int compare_left(char const **a, char const *aend,
                        char const **b, char const *bend) {
  for (;;) {
    const bool a_digit = *a != aend && isdigit((int)(unsigned char)**a);
    const bool b_digit = *b != bend && isdigit((int)(unsigned char)**b);

    if (!a_digit || !b_digit) {
      if (a_digit == b_digit) {
        // Both runs ended without a difference.
        return 0;
      }
      return a_digit ? +1 : -1;
    }

    if (**a != **b) {
      return (**a < **b) ? -1 : +1;
    }

    (*a)++;
    (*b)++;
  }
}
225 int string_natural_cmp(char const *a, size_t a_len,
226 char const *b, size_t b_len, int fold_case) {
227 char ca, cb;
228 char const *ap, *bp;
229 char const *aend = a + a_len, *bend = b + b_len;
230 int fractional, result;
232 if (a_len == 0 || b_len == 0)
233 return a_len - b_len;
235 ap = a;
236 bp = b;
237 while (1) {
238 ca = *ap; cb = *bp;
240 /* skip over leading spaces or zeros */
241 while (isspace((int)(unsigned char)ca))
242 ca = *++ap;
244 while (isspace((int)(unsigned char)cb))
245 cb = *++bp;
247 /* process run of digits */
248 if (isdigit((int)(unsigned char)ca) && isdigit((int)(unsigned char)cb)) {
249 fractional = (ca == '0' || cb == '0');
251 if (fractional)
252 result = compare_left(&ap, aend, &bp, bend);
253 else
254 result = compare_right(&ap, aend, &bp, bend);
256 if (result != 0)
257 return result;
258 else if (ap == aend && bp == bend)
259 /* End of the strings. Let caller sort them out. */
260 return 0;
261 else {
262 /* Keep on comparing from the current point. */
263 ca = *ap; cb = *bp;
267 if (fold_case) {
268 ca = toupper((int)(unsigned char)ca);
269 cb = toupper((int)(unsigned char)cb);
272 if (ca < cb)
273 return -1;
274 else if (ca > cb)
275 return +1;
277 ++ap; ++bp;
278 if (ap >= aend && bp >= bend)
279 /* The strings compare the same. Perhaps the caller
280 will want to call strcmp to break the tie. */
281 return 0;
282 else if (ap >= aend)
283 return -1;
284 else if (bp >= bend)
285 return 1;
289 ///////////////////////////////////////////////////////////////////////////////
291 void string_to_case(String& s, int (*tocase)(int)) {
292 assert(!s.isNull());
293 assert(tocase);
294 auto data = s.mutableData();
295 auto len = s.size();
296 for (int i = 0; i < len; i++) {
297 data[i] = tocase(data[i]);
301 ///////////////////////////////////////////////////////////////////////////////
303 #define STR_PAD_LEFT 0
304 #define STR_PAD_RIGHT 1
305 #define STR_PAD_BOTH 2
307 String string_pad(const char *input, int len, int pad_length,
308 const char *pad_string, int pad_str_len,
309 int pad_type) {
310 assert(input);
311 int num_pad_chars = pad_length - len;
313 /* If resulting string turns out to be shorter than input string,
314 we simply copy the input and return. */
315 if (pad_length < 0 || num_pad_chars < 0) {
316 return String(input, len, CopyString);
319 /* Setup the padding string values if specified. */
320 if (pad_str_len == 0) {
321 throw_invalid_argument("pad_string: (empty)");
322 return String();
325 String ret(pad_length, ReserveString);
326 char *result = ret.mutableData();
328 /* We need to figure out the left/right padding lengths. */
329 int left_pad, right_pad;
330 switch (pad_type) {
331 case STR_PAD_RIGHT:
332 left_pad = 0;
333 right_pad = num_pad_chars;
334 break;
335 case STR_PAD_LEFT:
336 left_pad = num_pad_chars;
337 right_pad = 0;
338 break;
339 case STR_PAD_BOTH:
340 left_pad = num_pad_chars / 2;
341 right_pad = num_pad_chars - left_pad;
342 break;
343 default:
344 throw_invalid_argument("pad_type: %d", pad_type);
345 return String();
348 /* First we pad on the left. */
349 int result_len = 0;
350 for (int i = 0; i < left_pad; i++) {
351 result[result_len++] = pad_string[i % pad_str_len];
354 /* Then we copy the input string. */
355 memcpy(result + result_len, input, len);
356 result_len += len;
358 /* Finally, we pad on the right. */
359 for (int i = 0; i < right_pad; i++) {
360 result[result_len++] = pad_string[i % pad_str_len];
362 ret.setSize(result_len);
363 return ret;
366 ///////////////////////////////////////////////////////////////////////////////
368 int string_find(const char *input, int len, char ch, int pos,
369 bool case_sensitive) {
370 assert(input);
371 if (pos < 0 || pos > len) {
372 return -1;
374 const void *ptr;
375 if (case_sensitive) {
376 ptr = memchr(input + pos, ch, len - pos);
377 } else {
378 ptr = bstrcasechr(input + pos, ch, len - pos);
380 if (ptr != nullptr) {
381 return (int)((const char *)ptr - input);
383 return -1;
386 int string_rfind(const char *input, int len, char ch, int pos,
387 bool case_sensitive) {
388 assert(input);
389 if (pos < -len || pos > len) {
390 return -1;
392 const void *ptr;
393 if (case_sensitive) {
394 if (pos >= 0) {
395 ptr = memrchr(input + pos, ch, len - pos);
396 } else {
397 ptr = memrchr(input, ch, len + pos + 1);
399 } else {
400 if (pos >= 0) {
401 ptr = bstrrcasechr(input + pos, ch, len - pos);
402 } else {
403 ptr = bstrrcasechr(input, ch, len + pos + 1);
406 if (ptr != nullptr) {
407 return (int)((const char *)ptr - input);
409 return -1;
412 int string_find(const char *input, int len, const char *s, int s_len,
413 int pos, bool case_sensitive) {
414 assert(input);
415 assert(s);
416 if (!s_len || pos < 0 || pos > len) {
417 return -1;
419 void *ptr;
420 if (case_sensitive) {
421 ptr = (void*)string_memnstr(input + pos, s, s_len, input + len);
422 } else {
423 ptr = bstrcasestr(input + pos, len - pos, s, s_len);
425 if (ptr != nullptr) {
426 return (int)((const char *)ptr - input);
428 return -1;
431 int string_rfind(const char *input, int len, const char *s, int s_len,
432 int pos, bool case_sensitive) {
433 assert(input);
434 assert(s);
435 if (!s_len || pos < -len || pos > len) {
436 return -1;
438 void *ptr;
439 if (case_sensitive) {
440 if (pos >= 0) {
441 ptr = bstrrstr(input + pos, len - pos, s, s_len);
442 } else {
443 ptr = bstrrstr(input, len + pos + s_len, s, s_len);
445 } else {
446 if (pos >= 0) {
447 ptr = bstrrcasestr(input + pos, len - pos, s, s_len);
448 } else {
449 ptr = bstrrcasestr(input, len + pos + s_len, s, s_len);
452 if (ptr != nullptr) {
453 return (int)((const char *)ptr - input);
455 return -1;
/**
 * Find the first occurrence of needle[0, needle_len) inside the byte range
 * [haystack, end).
 *
 * Candidates are located with memchr() on the needle's first byte; the
 * needle's last byte is checked before the full memcmp() as a cheap
 * filter.
 *
 * @return pointer to the first match, or nullptr when there is none, when
 *         the needle is empty, or when it is longer than the haystack.
 */
const char *string_memnstr(const char *haystack, const char *needle,
                           int needle_len, const char *end) {
  // An empty needle would read needle[-1]; an over-long one would move
  // the search limit in front of the buffer.
  if (needle_len <= 0 || end - haystack < needle_len) {
    return nullptr;
  }

  const char last_byte = needle[needle_len - 1];
  // Last position at which a complete match can still start.
  const char *const last_start = end - needle_len;

  for (const char *p = haystack; p <= last_start; p++) {
    p = (const char *)memchr(p, *needle, (last_start - p + 1));
    if (p == nullptr) {
      return nullptr;
    }
    if (last_byte == p[needle_len - 1] &&
        !memcmp(needle, p, needle_len - 1)) {
      return p;
    }
  }
  return nullptr;
}
478 String string_replace(const char *s, int len, int start, int length,
479 const char *replacement, int len_repl) {
480 assert(s);
481 assert(replacement);
482 assert(len >= 0);
484 // if "start" position is negative, count start position from the end
485 // of the string
486 if (start < 0) {
487 start = len + start;
488 if (start < 0) {
489 start = 0;
492 if (start > len) {
493 start = len;
495 // if "length" position is negative, set it to the length
496 // needed to stop that many chars from the end of the string
497 if (length < 0) {
498 length = (len - start) + length;
499 if (length < 0) {
500 length = 0;
503 // check if length is too large
504 if (length > len) {
505 length = len;
507 // check if the length is too large adjusting for non-zero start
508 // Write this way instead of start + length > len to avoid overflow
509 if (length > len - start) {
510 length = len - start;
513 String retString(len + len_repl - length, ReserveString);
514 char *ret = retString.mutableData();
516 int ret_len = 0;
517 if (start) {
518 memcpy(ret, s, start);
519 ret_len += start;
521 if (len_repl) {
522 memcpy(ret + ret_len, replacement, len_repl);
523 ret_len += len_repl;
525 len -= (start + length);
526 if (len) {
527 memcpy(ret + ret_len, s + start + length, len);
528 ret_len += len;
530 retString.setSize(ret_len);
531 return retString;
534 String string_replace(const char *input, int len,
535 const char *search, int len_search,
536 const char *replacement, int len_replace,
537 int &count, bool case_sensitive) {
538 assert(input);
539 assert(search && len_search);
540 assert(len >= 0);
541 assert(len_search >= 0);
542 assert(len_replace >= 0);
544 if (len == 0) {
545 return String();
548 req::vector<int> founds;
549 founds.reserve(16);
550 if (len_search == 1) {
551 for (int pos = string_find(input, len, *search, 0, case_sensitive);
552 pos >= 0;
553 pos = string_find(input, len, *search, pos + len_search,
554 case_sensitive)) {
555 founds.push_back(pos);
557 } else {
558 for (int pos = string_find(input, len, search, len_search, 0,
559 case_sensitive);
560 pos >= 0;
561 pos = string_find(input, len, search, len_search,
562 pos + len_search, case_sensitive)) {
563 founds.push_back(pos);
567 count = founds.size();
568 if (count == 0) {
569 return String(); // not found
572 int reserve;
574 // Make sure the new size of the string wouldn't overflow int32_t. Don't
575 // bother if the replacement wouldn't make the string longer.
576 if (len_replace > len_search) {
577 auto raise = [&] { raise_error("String too large"); };
578 if (mul_overflow(len_replace - len_search, count)) {
579 raise();
581 int diff = (len_replace - len_search) * count;
582 if (add_overflow(len, diff)) {
583 raise();
585 reserve = len + diff;
586 } else {
587 reserve = len + (len_replace - len_search) * count;
590 String retString(reserve, ReserveString);
591 char *ret = retString.mutableData();
592 char *p = ret;
593 int pos = 0; // last position in input that hasn't been copied over yet
594 int n;
595 for (unsigned int i = 0; i < founds.size(); i++) {
596 n = founds[i];
597 if (n > pos) {
598 n -= pos;
599 memcpy(p, input, n);
600 p += n;
601 input += n;
602 pos += n;
604 if (len_replace) {
605 memcpy(p, replacement, len_replace);
606 p += len_replace;
608 input += len_search;
609 pos += len_search;
611 n = len;
612 if (n > pos) {
613 n -= pos;
614 memcpy(p, input, n);
615 p += n;
617 retString.setSize(p - ret);
618 return retString;
621 ///////////////////////////////////////////////////////////////////////////////
623 String string_chunk_split(const char *src, int srclen, const char *end,
624 int endlen, int chunklen) {
625 int chunks = srclen / chunklen; // complete chunks!
626 int restlen = srclen - chunks * chunklen; /* srclen % chunklen */
628 String ret(
629 safe_address(
630 chunks + 1,
631 endlen,
632 srclen
634 ReserveString
636 char *dest = ret.mutableData();
638 const char *p; char *q;
639 const char *pMax = src + srclen - chunklen + 1;
640 for (p = src, q = dest; p < pMax; ) {
641 memcpy(q, p, chunklen);
642 q += chunklen;
643 memcpy(q, end, endlen);
644 q += endlen;
645 p += chunklen;
648 if (restlen) {
649 memcpy(q, p, restlen);
650 q += restlen;
651 memcpy(q, end, endlen);
652 q += endlen;
655 ret.setSize(q - dest);
656 return ret;
659 ///////////////////////////////////////////////////////////////////////////////
661 #define PHP_TAG_BUF_SIZE 1023
664 * Check if tag is in a set of tags
666 * states:
668 * 0 start tag
669 * 1 first non-whitespace char seen
671 static int string_tag_find(const char *tag, int len, const char *set) {
672 char c, *n;
673 const char *t;
674 int state=0, done=0;
675 char *norm;
677 if (len <= 0) {
678 return 0;
681 norm = (char *)req::malloc(len+1);
683 n = norm;
684 t = tag;
685 c = tolower(*t);
687 normalize the tag removing leading and trailing whitespace
688 and turn any <a whatever...> into just <a> and any </tag>
689 into <tag>
691 while (!done) {
692 switch (c) {
693 case '<':
694 *(n++) = c;
695 break;
696 case '>':
697 done =1;
698 break;
699 default:
700 if (!isspace((int)c)) {
701 if (state == 0) {
702 state=1;
704 if (c != '/') {
705 *(n++) = c;
707 } else {
708 if (state == 1)
709 done=1;
711 break;
713 c = tolower(*(++t));
715 *(n++) = '>';
716 *n = '\0';
717 if (strstr(set, norm)) {
718 done=1;
719 } else {
720 done=0;
722 req::free(norm);
723 return done;
727 * A simple little state-machine to strip out html and php tags
729 * State 0 is the output state, State 1 means we are inside a
730 * normal html tag and state 2 means we are inside a php tag.
732 * The state variable is passed in to allow a function like fgetss
733 * to maintain state across calls to the function.
735 * lc holds the last significant character read and br is a bracket
736 * counter.
738 * When an allow string is passed in we keep track of the string
739 * in state 1 and when the tag is closed check it against the
740 * allow string to see if we should allow it.
742 * swm: Added ability to strip <?xml tags without assuming it PHP
743 * code.
745 String string_strip_tags(const char *s, const int len,
746 const char *allow, const int allow_len,
747 bool allow_tag_spaces) {
748 const char *abuf, *p;
749 char *rbuf, *tbuf, *tp, *rp, c, lc;
751 int br, i=0, depth=0, in_q = 0;
752 int state = 0, pos;
754 assert(s);
755 assert(allow);
757 String retString(s, len, CopyString);
758 rbuf = retString.mutableData();
759 String allowString;
761 c = *s;
762 lc = '\0';
763 p = s;
764 rp = rbuf;
765 br = 0;
766 if (allow_len) {
767 assert(allow);
769 allowString = String(allow_len, ReserveString);
770 char *atmp = allowString.mutableData();
771 for (const char *tmp = allow; *tmp; tmp++, atmp++) {
772 *atmp = tolower((int)*(const unsigned char *)tmp);
774 allowString.setSize(allow_len);
775 abuf = allowString.data();
777 tbuf = (char *)req::malloc(PHP_TAG_BUF_SIZE+1);
778 tp = tbuf;
779 } else {
780 abuf = nullptr;
781 tbuf = tp = nullptr;
784 auto move = [&pos, &tbuf, &tp]() {
785 if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
786 pos = tp - tbuf;
787 tbuf = (char*)req::realloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
788 tp = tbuf + pos;
792 while (i < len) {
793 switch (c) {
794 case '\0':
795 break;
796 case '<':
797 if (isspace(*(p + 1)) && !allow_tag_spaces) {
798 goto reg_char;
800 if (state == 0) {
801 lc = '<';
802 state = 1;
803 if (allow_len) {
804 move();
805 *(tp++) = '<';
807 } else if (state == 1) {
808 depth++;
810 break;
812 case '(':
813 if (state == 2) {
814 if (lc != '"' && lc != '\'') {
815 lc = '(';
816 br++;
818 } else if (allow_len && state == 1) {
819 move();
820 *(tp++) = c;
821 } else if (state == 0) {
822 *(rp++) = c;
824 break;
826 case ')':
827 if (state == 2) {
828 if (lc != '"' && lc != '\'') {
829 lc = ')';
830 br--;
832 } else if (allow_len && state == 1) {
833 move();
834 *(tp++) = c;
835 } else if (state == 0) {
836 *(rp++) = c;
838 break;
840 case '>':
841 if (depth) {
842 depth--;
843 break;
846 if (in_q) {
847 break;
850 switch (state) {
851 case 1: /* HTML/XML */
852 lc = '>';
853 in_q = state = 0;
854 if (allow_len) {
855 move();
856 *(tp++) = '>';
857 *tp='\0';
858 if (string_tag_find(tbuf, tp-tbuf, abuf)) {
859 memcpy(rp, tbuf, tp-tbuf);
860 rp += tp-tbuf;
862 tp = tbuf;
864 break;
866 case 2: /* PHP */
867 if (!br && lc != '\"' && *(p-1) == '?') {
868 in_q = state = 0;
869 tp = tbuf;
871 break;
873 case 3:
874 in_q = state = 0;
875 tp = tbuf;
876 break;
878 case 4: /* JavaScript/CSS/etc... */
879 if (p >= s + 2 && *(p-1) == '-' && *(p-2) == '-') {
880 in_q = state = 0;
881 tp = tbuf;
883 break;
885 default:
886 *(rp++) = c;
887 break;
889 break;
891 case '"':
892 case '\'':
893 if (state == 4) {
894 /* Inside <!-- comment --> */
895 break;
896 } else if (state == 2 && *(p-1) != '\\') {
897 if (lc == c) {
898 lc = '\0';
899 } else if (lc != '\\') {
900 lc = c;
902 } else if (state == 0) {
903 *(rp++) = c;
904 } else if (allow_len && state == 1) {
905 move();
906 *(tp++) = c;
908 if (state && p != s && *(p-1) != '\\' && (!in_q || *p == in_q)) {
909 if (in_q) {
910 in_q = 0;
911 } else {
912 in_q = *p;
915 break;
917 case '!':
918 /* JavaScript & Other HTML scripting languages */
919 if (state == 1 && *(p-1) == '<') {
920 state = 3;
921 lc = c;
922 } else {
923 if (state == 0) {
924 *(rp++) = c;
925 } else if (allow_len && state == 1) {
926 move();
927 *(tp++) = c;
930 break;
932 case '-':
933 if (state == 3 && p >= s + 2 && *(p-1) == '-' && *(p-2) == '!') {
934 state = 4;
935 } else {
936 goto reg_char;
938 break;
940 case '?':
942 if (state == 1 && *(p-1) == '<') {
943 br=0;
944 state=2;
945 break;
948 case 'E':
949 case 'e':
950 /* !DOCTYPE exception */
951 if (state==3 && p > s+6
952 && tolower(*(p-1)) == 'p'
953 && tolower(*(p-2)) == 'y'
954 && tolower(*(p-3)) == 't'
955 && tolower(*(p-4)) == 'c'
956 && tolower(*(p-5)) == 'o'
957 && tolower(*(p-6)) == 'd') {
958 state = 1;
959 break;
961 /* fall-through */
963 case 'l':
965 /* swm: If we encounter '<?xml' then we shouldn't be in
966 * state == 2 (PHP). Switch back to HTML.
969 if (state == 2 && p > s+2 && *(p-1) == 'm' && *(p-2) == 'x') {
970 state = 1;
971 break;
974 /* fall-through */
975 default:
976 reg_char:
977 if (state == 0) {
978 *(rp++) = c;
979 } else if (allow_len && state == 1) {
980 move();
981 *(tp++) = c;
983 break;
985 c = *(++p);
986 i++;
988 if (rp < rbuf + len) {
989 *rp = '\0';
991 if (allow_len) {
992 req::free(tbuf);
995 retString.setSize(rp - rbuf);
996 return retString;
999 ///////////////////////////////////////////////////////////////////////////////
1001 String string_addslashes(const char *str, int length) {
1002 assert(str);
1003 if (length == 0) {
1004 return String();
1007 String retString((length << 1) + 1, ReserveString);
1008 char *new_str = retString.mutableData();
1009 const char *source = str;
1010 const char *end = source + length;
1011 char *target = new_str;
1013 while (source < end) {
1014 switch (*source) {
1015 case '\0':
1016 *target++ = '\\';
1017 *target++ = '0';
1018 break;
1019 case '\'':
1020 case '\"':
1021 case '\\':
1022 *target++ = '\\';
1023 /* break is missing *intentionally* */
1024 default:
1025 *target++ = *source;
1026 break;
1029 source++;
1032 retString.setSize(target - new_str);
1033 return retString;
1036 ///////////////////////////////////////////////////////////////////////////////
/**
 * Convert one hexadecimal digit character to its value.
 *
 * @return 0..15 for '0'-'9', 'A'-'F' and 'a'-'f'; -1 for anything else.
 */
static char string_hex2int(int c) {
  int value = -1;
  if (isdigit(c)) {
    value = c - '0';
  } else if ('A' <= c && c <= 'F') {
    value = 10 + (c - 'A');
  } else if ('a' <= c && c <= 'f') {
    value = 10 + (c - 'a');
  }
  return value;
}
1051 String string_quoted_printable_encode(const char *input, int len) {
1052 size_t length = len;
1053 const unsigned char *str = (unsigned char*)input;
1055 unsigned long lp = 0;
1056 unsigned char c;
1057 char *d, *buffer;
1058 char *hex = "0123456789ABCDEF";
1060 String ret(
1061 safe_address(
1063 length + ((safe_address(3, length, 0)/(PHP_QPRINT_MAXL-9)) + 1),
1065 ReserveString
1067 d = buffer = ret.mutableData();
1069 while (length--) {
1070 if (((c = *str++) == '\015') && (*str == '\012') && length > 0) {
1071 *d++ = '\015';
1072 *d++ = *str++;
1073 length--;
1074 lp = 0;
1075 } else {
1076 if (iscntrl (c) || (c == 0x7f) || (c & 0x80) ||
1077 (c == '=') || ((c == ' ') && (*str == '\015'))) {
1078 if ((((lp+= 3) > PHP_QPRINT_MAXL) && (c <= 0x7f))
1079 || ((c > 0x7f) && (c <= 0xdf) && ((lp + 3) > PHP_QPRINT_MAXL))
1080 || ((c > 0xdf) && (c <= 0xef) && ((lp + 6) > PHP_QPRINT_MAXL))
1081 || ((c > 0xef) && (c <= 0xf4) && ((lp + 9) > PHP_QPRINT_MAXL))) {
1082 *d++ = '=';
1083 *d++ = '\015';
1084 *d++ = '\012';
1085 lp = 3;
1087 *d++ = '=';
1088 *d++ = hex[c >> 4];
1089 *d++ = hex[c & 0xf];
1090 } else {
1091 if ((++lp) > PHP_QPRINT_MAXL) {
1092 *d++ = '=';
1093 *d++ = '\015';
1094 *d++ = '\012';
1095 lp = 1;
1097 *d++ = c;
1101 len = d - buffer;
1103 ret.setSize(len);
1104 return ret;
1107 String string_quoted_printable_decode(const char *input, int len, bool is_q) {
1108 assert(input);
1109 if (len == 0) {
1110 return String();
1113 int i = 0, j = 0, k;
1114 const char *str_in = input;
1115 String ret(len, ReserveString);
1116 char *str_out = ret.mutableData();
1117 while (i < len && str_in[i]) {
1118 switch (str_in[i]) {
1119 case '=':
1120 if (i + 2 < len && str_in[i + 1] && str_in[i + 2] &&
1121 isxdigit((int) str_in[i + 1]) && isxdigit((int) str_in[i + 2]))
1123 str_out[j++] = (string_hex2int((int) str_in[i + 1]) << 4)
1124 + string_hex2int((int) str_in[i + 2]);
1125 i += 3;
1126 } else /* check for soft line break according to RFC 2045*/ {
1127 k = 1;
1128 while (str_in[i + k] &&
1129 ((str_in[i + k] == 32) || (str_in[i + k] == 9))) {
1130 /* Possibly, skip spaces/tabs at the end of line */
1131 k++;
1133 if (!str_in[i + k]) {
1134 /* End of line reached */
1135 i += k;
1137 else if ((str_in[i + k] == 13) && (str_in[i + k + 1] == 10)) {
1138 /* CRLF */
1139 i += k + 2;
1141 else if ((str_in[i + k] == 13) || (str_in[i + k] == 10)) {
1142 /* CR or LF */
1143 i += k + 1;
1145 else {
1146 str_out[j++] = str_in[i++];
1149 break;
1150 case '_':
1151 if (is_q) {
1152 str_out[j++] = ' ';
1153 i++;
1154 } else {
1155 str_out[j++] = str_in[i++];
1157 break;
1158 default:
1159 str_out[j++] = str_in[i++];
1162 ret.setSize(j);
1163 return ret;
1166 Variant string_base_to_numeric(const char *s, int len, int base) {
1167 int64_t num = 0;
1168 double fnum = 0;
1169 int mode = 0;
1170 int64_t cutoff;
1171 int cutlim;
1173 assert(string_validate_base(base));
1175 cutoff = LONG_MAX / base;
1176 cutlim = LONG_MAX % base;
1178 for (int i = len; i > 0; i--) {
1179 char c = *s++;
1181 /* might not work for EBCDIC */
1182 if (c >= '0' && c <= '9')
1183 c -= '0';
1184 else if (c >= 'A' && c <= 'Z')
1185 c -= 'A' - 10;
1186 else if (c >= 'a' && c <= 'z')
1187 c -= 'a' - 10;
1188 else
1189 continue;
1191 if (c >= base)
1192 continue;
1194 switch (mode) {
1195 case 0: /* Integer */
1196 if (num < cutoff || (num == cutoff && c <= cutlim)) {
1197 num = num * base + c;
1198 break;
1199 } else {
1200 fnum = num;
1201 mode = 1;
1203 /* fall-through */
1204 case 1: /* Float */
1205 fnum = fnum * base + c;
1209 if (mode == 1) {
1210 return fnum;
1212 return num;
1215 String string_long_to_base(unsigned long value, int base) {
1216 static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1217 char buf[(sizeof(unsigned long) << 3) + 1];
1218 char *ptr, *end;
1220 assert(string_validate_base(base));
1222 end = ptr = buf + sizeof(buf) - 1;
1224 do {
1225 *--ptr = digits[value % base];
1226 value /= base;
1227 } while (ptr > buf && value);
1229 return String(ptr, end - ptr, CopyString);
1232 String string_numeric_to_base(const Variant& value, int base) {
1233 static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1235 assert(string_validate_base(base));
1236 if ((!value.isInteger() && !value.isDouble())) {
1237 return empty_string();
1240 if (value.isDouble()) {
1241 double fvalue = floor(value.toDouble()); /* floor it just in case */
1242 char *ptr, *end;
1243 char buf[(sizeof(double) << 3) + 1];
1245 /* Don't try to convert +/- infinity */
1246 if (fvalue == HUGE_VAL || fvalue == -HUGE_VAL) {
1247 raise_warning("Number too large");
1248 return empty_string();
1251 end = ptr = buf + sizeof(buf) - 1;
1253 do {
1254 *--ptr = digits[(int) fmod(fvalue, base)];
1255 fvalue /= base;
1256 } while (ptr > buf && fabs(fvalue) >= 1);
1258 return String(ptr, end - ptr, CopyString);
1261 return string_long_to_base(value.toInt64(), base);
1264 ///////////////////////////////////////////////////////////////////////////////
1265 // uuencode
/* Map a 6-bit value to its uuencode character; 0 is written as '`'. */
#define PHP_UU_ENC(c) \
  ((c) ? ((c) & 077) + ' ' : '`')
/* Second output character of a group: low 2 bits of byte 0 and high 4
   bits of byte 1. `c` points at the first byte of the group. */
#define PHP_UU_ENC_C2(c) \
  PHP_UU_ENC(((*(c) << 4) & 060) | ((*((c) + 1) >> 4) & 017))
/* Third output character of a group: low 4 bits of byte 1 and high 2
   bits of byte 2. The argument is fully parenthesized so that any pointer
   expression may be passed. */
#define PHP_UU_ENC_C3(c) \
  PHP_UU_ENC(((*((c) + 1) << 2) & 074) | ((*((c) + 2) >> 6) & 03))
/* Map a uuencode character back to its 6-bit value. */
#define PHP_UU_DEC(c) \
  (((c) - ' ') & 077)
1276 String string_uuencode(const char *src, int src_len) {
1277 assert(src);
1278 assert(src_len);
1280 int len = 45;
1281 char *p;
1282 const char *s, *e, *ee;
1283 char *dest;
1285 /* encoded length is ~ 38% greater than the original */
1286 String ret((int)ceil(src_len * 1.38) + 45, ReserveString);
1287 p = dest = ret.mutableData();
1288 s = src;
1289 e = src + src_len;
1291 while ((s + 3) < e) {
1292 ee = s + len;
1293 if (ee > e) {
1294 ee = e;
1295 len = ee - s;
1296 if (len % 3) {
1297 ee = s + (int) (floor(len / 3) * 3);
1300 *p++ = PHP_UU_ENC(len);
1302 while (s < ee) {
1303 *p++ = PHP_UU_ENC(*s >> 2);
1304 *p++ = PHP_UU_ENC_C2(s);
1305 *p++ = PHP_UU_ENC_C3(s);
1306 *p++ = PHP_UU_ENC(*(s + 2) & 077);
1308 s += 3;
1311 if (len == 45) {
1312 *p++ = '\n';
1316 if (s < e) {
1317 if (len == 45) {
1318 *p++ = PHP_UU_ENC(e - s);
1319 len = 0;
1322 *p++ = PHP_UU_ENC(*s >> 2);
1323 *p++ = PHP_UU_ENC_C2(s);
1324 *p++ = ((e - s) > 1) ? PHP_UU_ENC_C3(s) : PHP_UU_ENC('\0');
1325 *p++ = ((e - s) > 2) ? PHP_UU_ENC(*(s + 2) & 077) : PHP_UU_ENC('\0');
1328 if (len < 45) {
1329 *p++ = '\n';
1332 *p++ = PHP_UU_ENC('\0');
1333 *p++ = '\n';
1334 *p = '\0';
1336 ret.setSize(p - dest);
1337 return ret;
1340 String string_uudecode(const char *src, int src_len) {
1341 int total_len = 0;
1342 int len;
1343 const char *s, *e, *ee;
1344 char *p, *dest;
1346 String ret(ceil(src_len * 0.75), ReserveString);
1347 p = dest = ret.mutableData();
1348 s = src;
1349 e = src + src_len;
1351 while (s < e) {
1352 if ((len = PHP_UU_DEC(*s++)) <= 0) {
1353 break;
1355 /* sanity check */
1356 if (len > src_len) {
1357 goto err;
1360 total_len += len;
1362 ee = s + (len == 45 ? 60 : (int) floor(len * 1.33));
1363 /* sanity check */
1364 if (ee > e) {
1365 goto err;
1368 while (s < ee) {
1369 if (s + 4 > e) goto err;
1371 *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
1372 *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
1373 *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
1374 s += 4;
1377 if (len < 45) {
1378 break;
1381 /* skip \n */
1382 s++;
1385 if ((len = total_len > (p - dest))) {
1386 *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
1387 if (len > 1) {
1388 *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
1389 if (len > 2) {
1390 *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
1395 ret.setSize(total_len);
1396 return ret;
1398 err:
1399 return String();
1402 ///////////////////////////////////////////////////////////////////////////////
1403 // base64
/* The 64 symbols of the base64 alphabet, indexed by 6-bit value.  The string
 * literal's terminating NUL provides the trailing '\0' element, so the array
 * has 65 entries. */
static const char base64_table[] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  "abcdefghijklmnopqrstuvwxyz"
  "0123456789+/";

/* Character appended to fill the last output quartet. */
static const char base64_pad = '=';
/* Maps an input byte to its 6-bit base64 value.
 *   -1  tab, LF, CR and space
 *   -2  every other byte outside the alphabet ('=' included) */
static const short base64_reverse_table[256] = {
  /* 0x00 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
  /* 0x10 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0x20 */ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
  /* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
  /* 0x40 */ -2,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
  /* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
  /* 0x60 */ -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  /* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
  /* 0x80 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0x90 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xA0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xB0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xC0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xD0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xE0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xF0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
};
1434 static String php_base64_encode(const unsigned char *str, int length) {
1435 const unsigned char *current = str;
1436 unsigned char *p;
1437 unsigned char *result;
1439 if ((length + 2) < 0 || ((length + 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) {
1440 return String();
1443 String ret(((length + 2) / 3) * 4, ReserveString);
1444 p = result = (unsigned char *)ret.mutableData();
1446 while (length > 2) { /* keep going until we have less than 24 bits */
1447 *p++ = base64_table[current[0] >> 2];
1448 *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
1449 *p++ = base64_table[((current[1] & 0x0f) << 2) + (current[2] >> 6)];
1450 *p++ = base64_table[current[2] & 0x3f];
1452 current += 3;
1453 length -= 3; /* we just handle 3 octets of data */
1456 /* now deal with the tail end of things */
1457 if (length != 0) {
1458 *p++ = base64_table[current[0] >> 2];
1459 if (length > 1) {
1460 *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
1461 *p++ = base64_table[(current[1] & 0x0f) << 2];
1462 *p++ = base64_pad;
1463 } else {
1464 *p++ = base64_table[(current[0] & 0x03) << 4];
1465 *p++ = base64_pad;
1466 *p++ = base64_pad;
1469 ret.setSize(p - result);
1470 return ret;
1473 static String php_base64_decode(const char *str, int length, bool strict) {
1474 const unsigned char *current = (unsigned char*)str;
1475 int ch, i = 0, j = 0, k;
1476 /* this sucks for threaded environments */
1478 String retString(length, ReserveString);
1479 unsigned char* result = (unsigned char*)retString.mutableData();
1481 /* run through the whole string, converting as we go */
1482 while ((ch = *current++) != '\0' && length-- > 0) {
1483 if (ch == base64_pad) {
1484 if (*current != '=' && ((i % 4) == 1 || (strict && length > 0))) {
1485 if ((i % 4) != 1) {
1486 while (isspace(*(++current))) {
1487 continue;
1489 if (*current == '\0') {
1490 continue;
1493 return String();
1495 continue;
1498 ch = base64_reverse_table[ch];
1499 if ((!strict && ch < 0) || ch == -1) {
1500 /* a space or some other separator character, we simply skip over */
1501 continue;
1502 } else if (ch == -2) {
1503 return String();
1506 switch(i % 4) {
1507 case 0:
1508 result[j] = ch << 2;
1509 break;
1510 case 1:
1511 result[j++] |= ch >> 4;
1512 result[j] = (ch & 0x0f) << 4;
1513 break;
1514 case 2:
1515 result[j++] |= ch >>2;
1516 result[j] = (ch & 0x03) << 6;
1517 break;
1518 case 3:
1519 result[j++] |= ch;
1520 break;
1522 i++;
1525 k = j;
1526 /* mop things up if we ended on a boundary */
1527 if (ch == base64_pad) {
1528 switch(i % 4) {
1529 case 1:
1530 return String();
1531 case 2:
1532 k++;
1533 case 3:
1534 result[k] = 0;
1537 retString.setSize(j);
1538 return retString;
1541 String string_base64_encode(const char *input, int len) {
1542 return php_base64_encode((unsigned char *)input, len);
1545 String string_base64_decode(const char *input, int len, bool strict) {
1546 return php_base64_decode(input, len, strict);
1549 ///////////////////////////////////////////////////////////////////////////////
1551 String string_escape_shell_arg(const char *str) {
1552 int x, y, l;
1553 char *cmd;
1555 y = 0;
1556 l = strlen(str);
1558 String ret(safe_address(l, 4, 3), ReserveString); /* worst case */
1559 cmd = ret.mutableData();
1561 cmd[y++] = '\'';
1563 for (x = 0; x < l; x++) {
1564 switch (str[x]) {
1565 case '\'':
1566 cmd[y++] = '\'';
1567 cmd[y++] = '\\';
1568 cmd[y++] = '\'';
1569 /* fall-through */
1570 default:
1571 cmd[y++] = str[x];
1574 cmd[y++] = '\'';
1575 ret.setSize(y);
1576 return ret;
1579 String string_escape_shell_cmd(const char *str) {
1580 register int x, y, l;
1581 char *cmd;
1582 char *p = nullptr;
1584 l = strlen(str);
1585 String ret(safe_address(l, 2, 1), ReserveString);
1586 cmd = ret.mutableData();
1588 for (x = 0, y = 0; x < l; x++) {
1589 switch (str[x]) {
1590 case '"':
1591 case '\'':
1592 if (!p && (p = (char *)memchr(str + x + 1, str[x], l - x - 1))) {
1593 /* noop */
1594 } else if (p && *p == str[x]) {
1595 p = nullptr;
1596 } else {
1597 cmd[y++] = '\\';
1599 cmd[y++] = str[x];
1600 break;
1601 case '#': /* This is character-set independent */
1602 case '&':
1603 case ';':
1604 case '`':
1605 case '|':
1606 case '*':
1607 case '?':
1608 case '~':
1609 case '<':
1610 case '>':
1611 case '^':
1612 case '(':
1613 case ')':
1614 case '[':
1615 case ']':
1616 case '{':
1617 case '}':
1618 case '$':
1619 case '\\':
1620 case '\x0A': /* excluding these two */
1621 case '\xFF':
1622 cmd[y++] = '\\';
1623 /* fall-through */
1624 default:
1625 cmd[y++] = str[x];
1628 ret.setSize(y);
1629 return ret;
1632 ///////////////////////////////////////////////////////////////////////////////
/**
 * Finds the longest run of bytes common to both buffers.
 *
 * @param txt1,len1  first buffer and its length
 * @param txt2,len2  second buffer and its length
 * @param pos1,pos2  receive the start offsets of the run in txt1 / txt2;
 *                   left untouched when no common byte exists
 * @param max        receives the length of the run (0 if there is none)
 *
 * When several runs share the maximum length, the first one met while
 * scanning txt1 and then txt2 from the left is reported.
 */
static void string_similar_str(const char *txt1, int len1,
                               const char *txt2, int len2,
                               int *pos1, int *pos2, int *max) {
  *max = 0;
  for (int i = 0; i < len1; i++) {
    for (int j = 0; j < len2; j++) {
      /* length of the common run starting at txt1[i] / txt2[j] */
      int run = 0;
      while (i + run < len1 && j + run < len2 &&
             txt1[i + run] == txt2[j + run]) {
        run++;
      }
      if (run > *max) {
        *max = run;
        *pos1 = i;
        *pos2 = j;
      }
    }
  }
}
1655 static int string_similar_char(const char *txt1, int len1,
1656 const char *txt2, int len2) {
1657 int sum;
1658 int pos1 = 0, pos2 = 0, max;
1660 string_similar_str(txt1, len1, txt2, len2, &pos1, &pos2, &max);
1661 if ((sum = max)) {
1662 if (pos1 && pos2) {
1663 sum += string_similar_char(txt1, pos1, txt2, pos2);
1665 if ((pos1 + max < len1) && (pos2 + max < len2)) {
1666 sum += string_similar_char(txt1 + pos1 + max, len1 - pos1 - max,
1667 txt2 + pos2 + max, len2 - pos2 - max);
1671 return sum;
1674 int string_similar_text(const char *t1, int len1,
1675 const char *t2, int len2, float *percent) {
1676 if (len1 == 0 && len2 == 0) {
1677 if (percent) *percent = 0.0;
1678 return 0;
1681 int sim = string_similar_char(t1, len1, t2, len2);
1682 if (percent) *percent = sim * 200.0 / (len1 + len2);
1683 return sim;
1686 ///////////////////////////////////////////////////////////////////////////////
1688 #define LEVENSHTEIN_MAX_LENTH 255
1690 // reference implementation, only optimized for memory usage, not speed
1691 int string_levenshtein(const char *s1, int l1, const char *s2, int l2,
1692 int cost_ins, int cost_rep, int cost_del ) {
1693 int *p1, *p2, *tmp;
1694 int i1, i2, c0, c1, c2;
1696 if (l1==0) return l2*cost_ins;
1697 if (l2==0) return l1*cost_del;
1699 if ((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) {
1700 raise_warning("levenshtein(): Argument string(s) too long");
1701 return -1;
1704 p1 = (int*)req::malloc((l2+1) * sizeof(int));
1705 p2 = (int*)req::malloc((l2+1) * sizeof(int));
1707 for(i2=0;i2<=l2;i2++) {
1708 p1[i2] = i2*cost_ins;
1711 for(i1=0;i1<l1;i1++) {
1712 p2[0]=p1[0]+cost_del;
1713 for(i2=0;i2<l2;i2++) {
1714 c0=p1[i2]+((s1[i1]==s2[i2])?0:cost_rep);
1715 c1=p1[i2+1]+cost_del; if (c1<c0) c0=c1;
1716 c2=p2[i2]+cost_ins; if (c2<c0) c0=c2;
1717 p2[i2+1]=c0;
1719 tmp=p1; p1=p2; p2=tmp;
1722 c0=p1[l2];
1723 req::free(p1);
1724 req::free(p2);
1725 return c0;
1728 ///////////////////////////////////////////////////////////////////////////////
1730 String string_money_format(const char *format, double value) {
1731 bool check = false;
1732 const char *p = format;
1733 while ((p = strchr(p, '%'))) {
1734 if (*(p + 1) == '%') {
1735 p += 2;
1736 } else if (!check) {
1737 check = true;
1738 p++;
1739 } else {
1740 throw_invalid_argument
1741 ("format: Only a single %%i or %%n token can be used");
1742 return String();
1746 int format_len = strlen(format);
1747 int str_len = safe_address(format_len, 1, 1024);
1748 String ret(str_len, ReserveString);
1749 char *str = ret.mutableData();
1750 if ((str_len = strfmon(str, str_len, format, value)) < 0) {
1751 return String();
1753 ret.setSize(str_len);
1754 return ret;
1757 ///////////////////////////////////////////////////////////////////////////////
1759 String string_number_format(double d, int dec,
1760 const String& dec_point,
1761 const String& thousand_sep) {
1762 char *tmpbuf = nullptr, *resbuf;
1763 char *s, *t; /* source, target */
1764 char *dp;
1765 int integral;
1766 int tmplen, reslen=0;
1767 int count=0;
1768 int is_negative=0;
1770 if (d < 0) {
1771 is_negative = 1;
1772 d = -d;
1775 if (dec < 0) dec = 0;
1776 d = php_math_round(d, dec);
1778 // departure from PHP: we got rid of dependencies on spprintf() here.
1779 String tmpstr(63, ReserveString);
1780 tmpbuf = tmpstr.mutableData();
1781 tmplen = snprintf(tmpbuf, 64, "%.*F", dec, d);
1782 if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
1783 tmpstr.setSize(tmplen);
1784 return tmpstr;
1786 if (tmplen >= 64) {
1787 // Uncommon, asked for more than 64 chars worth of precision
1788 tmpstr = String(tmplen, ReserveString);
1789 tmpbuf = tmpstr.mutableData();
1790 tmplen = snprintf(tmpbuf, tmplen + 1, "%.*F", dec, d);
1791 if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
1792 tmpstr.setSize(tmplen);
1793 return tmpstr;
1797 /* find decimal point, if expected */
1798 if (dec) {
1799 dp = strpbrk(tmpbuf, ".,");
1800 } else {
1801 dp = nullptr;
1804 /* calculate the length of the return buffer */
1805 if (dp) {
1806 integral = dp - tmpbuf;
1807 } else {
1808 /* no decimal point was found */
1809 integral = tmplen;
1812 /* allow for thousand separators */
1813 if (!thousand_sep.empty()) {
1814 integral += ((integral-1) / 3) * thousand_sep.size();
1817 reslen = integral;
1819 if (dec) {
1820 reslen += dec;
1822 if (!dec_point.empty()) {
1823 reslen += dec_point.size();
1827 /* add a byte for minus sign */
1828 if (is_negative) {
1829 reslen++;
1831 String resstr(reslen, ReserveString);
1832 resbuf = resstr.mutableData();
1834 s = tmpbuf+tmplen-1;
1835 t = resbuf+reslen-1;
1837 /* copy the decimal places.
1838 * Take care, as the sprintf implementation may return less places than
1839 * we requested due to internal buffer limitations */
1840 if (dec) {
1841 int declen = dp ? s - dp : 0;
1842 int topad = dec > declen ? dec - declen : 0;
1844 /* pad with '0's */
1845 while (topad--) {
1846 *t-- = '0';
1849 if (dp) {
1850 s -= declen + 1; /* +1 to skip the point */
1851 t -= declen;
1853 /* now copy the chars after the point */
1854 memcpy(t + 1, dp + 1, declen);
1857 /* add decimal point */
1858 if (!dec_point.empty()) {
1859 memcpy(t + (1 - dec_point.size()), dec_point.data(), dec_point.size());
1860 t -= dec_point.size();
1864 /* copy the numbers before the decimal point, adding thousand
1865 * separator every three digits */
1866 while(s >= tmpbuf) {
1867 *t-- = *s--;
1868 if (thousand_sep && (++count%3)==0 && s>=tmpbuf) {
1869 memcpy(t + (1 - thousand_sep.size()),
1870 thousand_sep.data(),
1871 thousand_sep.size());
1872 t -= thousand_sep.size();
1876 /* and a minus sign, if needed */
1877 if (is_negative) {
1878 *t-- = '-';
1881 resstr.setSize(reslen);
1882 return resstr;
1885 ///////////////////////////////////////////////////////////////////////////////
1886 // soundex
1888 /* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */
1889 String string_soundex(const String& str) {
1890 assert(!str.empty());
1891 int _small, code, last;
1892 String retString(4, ReserveString);
1893 char* soundex = retString.mutableData();
1895 static char soundex_table[26] = {
1896 0, /* A */
1897 '1', /* B */
1898 '2', /* C */
1899 '3', /* D */
1900 0, /* E */
1901 '1', /* F */
1902 '2', /* G */
1903 0, /* H */
1904 0, /* I */
1905 '2', /* J */
1906 '2', /* K */
1907 '4', /* L */
1908 '5', /* M */
1909 '5', /* N */
1910 0, /* O */
1911 '1', /* P */
1912 '2', /* Q */
1913 '6', /* R */
1914 '2', /* S */
1915 '3', /* T */
1916 0, /* U */
1917 '1', /* V */
1918 0, /* W */
1919 '2', /* X */
1920 0, /* Y */
1921 '2' /* Z */
1924 /* build soundex string */
1925 last = -1;
1926 const char *p = str.slice().ptr;
1927 for (_small = 0; *p && _small < 4; p++) {
1928 /* convert chars to upper case and strip non-letter chars */
1929 /* BUG: should also map here accented letters used in non */
1930 /* English words or names (also found in English text!): */
1931 /* esstsett, thorn, n-tilde, c-cedilla, s-caron, ... */
1932 code = toupper((int)(unsigned char)(*p));
1933 if (code >= 'A' && code <= 'Z') {
1934 if (_small == 0) {
1935 /* remember first valid char */
1936 soundex[_small++] = code;
1937 last = soundex_table[code - 'A'];
1938 } else {
1939 /* ignore sequences of consonants with same soundex */
1940 /* code in trail, and vowels unless they separate */
1941 /* consonant letters */
1942 code = soundex_table[code - 'A'];
1943 if (code != last) {
1944 if (code != 0) {
1945 soundex[_small++] = code;
1947 last = code;
1952 /* pad with '0' and terminate with 0 ;-) */
1953 while (_small < 4) {
1954 soundex[_small++] = '0';
1956 retString.setSize(4);
1957 return retString;
1960 ///////////////////////////////////////////////////////////////////////////////
1961 // metaphone
/*
 * This is the original code by Michael G Schwern, changed only slightly
 * (use emalloc, get rid of includes etc)
 * - thies - 13.09.1999
 */

/*----------------------------- */
/* this used to be "metaphone.h" */
/*----------------------------- */

/* Special encodings */
#define SH 'X'
#define TH '0'

/*----------------------------- */
/* end of "metaphone.h" */
/*----------------------------- */

/*----------------------------- */
/* this used to be "metachar.h" */
/*----------------------------- */

/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book... */
/* One bit-mask per letter A..Z:
 *   1 vowel, 2 passed through unchanged, 4 affects a following H,
 *   8 softens C and G, 16 keeps GH from becoming F */
char _codes[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};

/* Looks up the bit-mask of a character.  The value is narrowed to unsigned
 * char before it reaches toupper() -- the <ctype.h> functions are undefined
 * for negative arguments other than EOF -- and the table is only indexed for
 * A..Z, so locale-specific letters can never read outside _codes. */
static inline int metaphone_code(int c) {
  const int upper = toupper((unsigned char)c);
  return (upper >= 'A' && upper <= 'Z') ? _codes[upper - 'A'] : 0;
}

#define ENCODE(c) (metaphone_code((c)))

#define isvowel(c) (ENCODE(c) & 1) /* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2) /* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c) (ENCODE(c) & 4) /* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8) /* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c) (ENCODE(c) & 16) /* BDH */

/*----------------------------- */
/* end of "metachar.h" */
/*----------------------------- */

/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* The macros below expect `word` (unsigned char *) and `w_idx` (int) to be
 * in scope at the point of use. */

/* Look at the next letter in the word */
#define Next_Letter ((char)toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter ((char)toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would be before the start. */
#define Look_Back_Letter(n) \
  (w_idx >= (n) ? (char)toupper(word[w_idx-(n)]) : '\0')
/* Previous letter. I dunno, should this return null on failure? */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down. It makes sure you don't walk off the string. */
#define After_Next_Letter (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) \
                                               : '\0')
#define Look_Ahead_Letter(n) ((char)toupper(Lookahead(word+w_idx, n)))
/* Safely peeks how_far characters ahead: walks forward one byte at a time
 * and stops early at the terminating NUL, so the result is either the
 * character how_far positions on or '\0' when the string ends first.
 * (I probably could have just used strlen...) */
static char Lookahead(unsigned char *word, int how_far) {
  const unsigned char *cursor = word;

  /* Edge forward in the string... */
  for (int steps = 0; *cursor != '\0' && steps < how_far; steps++) {
    cursor++;
  }

  /* cursor is now how_far bytes on, or at the end of the string */
  return (char)*cursor;
}
/* phonize one letter: appends it to `buffer`, which must be in scope where
 * the macro is used.  Wrapped in do { } while (0) so that the expansion is a
 * single statement and stays correct in front of an `else`. */
#define Phonize(c) do { buffer.append(c); } while (0)
/* How long is the phoned word? */
#define Phone_Len (buffer.size())

/* Note if a letter is a 'break' in the word.  The argument is narrowed to
 * unsigned char because isalpha() is undefined for negative values. */
#define Isbreak(c) (!isalpha((unsigned char)(c)))
2052 String string_metaphone(const char *input, int word_len, long max_phonemes,
2053 int traditional) {
2054 unsigned char *word = (unsigned char *)input;
2056 int w_idx = 0; /* point in the phonization we're at. */
2057 int max_buffer_len = 0; /* maximum length of the destination buffer */
2059 /*-- Parameter checks --*/
2060 /* Negative phoneme length is meaningless */
2062 if (max_phonemes < 0)
2063 return String();
2065 /* Empty/null string is meaningless */
2066 /* Overly paranoid */
2067 /* always_assert(word != NULL && word[0] != '\0'); */
2069 if (word == nullptr)
2070 return String();
2072 /*-- Allocate memory for our phoned_phrase --*/
2073 if (max_phonemes == 0) { /* Assume largest possible */
2074 max_buffer_len = word_len;
2075 } else {
2076 max_buffer_len = max_phonemes;
2078 StringBuffer buffer(max_buffer_len);
2080 /*-- The first phoneme has to be processed specially. --*/
2081 /* Find our first letter */
2082 for (; !isalpha(Curr_Letter); w_idx++) {
2083 /* On the off chance we were given nothing but crap... */
2084 if (Curr_Letter == '\0') {
2085 return buffer.detach(); /* For testing */
2089 switch (Curr_Letter) {
2090 /* AE becomes E */
2091 case 'A':
2092 if (Next_Letter == 'E') {
2093 Phonize('E');
2094 w_idx += 2;
2096 /* Remember, preserve vowels at the beginning */
2097 else {
2098 Phonize('A');
2099 w_idx++;
2101 break;
2102 /* [GKP]N becomes N */
2103 case 'G':
2104 case 'K':
2105 case 'P':
2106 if (Next_Letter == 'N') {
2107 Phonize('N');
2108 w_idx += 2;
2110 break;
2111 /* WH becomes H,
2112 WR becomes R
2113 W if followed by a vowel */
2114 case 'W':
2115 if (Next_Letter == 'H' ||
2116 Next_Letter == 'R') {
2117 Phonize(Next_Letter);
2118 w_idx += 2;
2119 } else if (isvowel(Next_Letter)) {
2120 Phonize('W');
2121 w_idx += 2;
2123 /* else ignore */
2124 break;
2125 /* X becomes S */
2126 case 'X':
2127 Phonize('S');
2128 w_idx++;
2129 break;
2130 /* Vowels are kept */
2131 /* We did A already
2132 case 'A':
2133 case 'a':
2135 case 'E':
2136 case 'I':
2137 case 'O':
2138 case 'U':
2139 Phonize(Curr_Letter);
2140 w_idx++;
2141 break;
2142 default:
2143 /* do nothing */
2144 break;
2147 /* On to the metaphoning */
2148 for (; Curr_Letter != '\0' &&
2149 (max_phonemes == 0 || Phone_Len < max_phonemes);
2150 w_idx++) {
2151 /* How many letters to skip because an eariler encoding handled
2152 * multiple letters */
2153 unsigned short int skip_letter = 0;
2156 /* THOUGHT: It would be nice if, rather than having things like...
2157 * well, SCI. For SCI you encode the S, then have to remember
2158 * to skip the C. So the phonome SCI invades both S and C. It would
2159 * be better, IMHO, to skip the C from the S part of the encoding.
2160 * Hell, I'm trying it.
2163 /* Ignore non-alphas */
2164 if (!isalpha(Curr_Letter))
2165 continue;
2167 /* Drop duplicates, except CC */
2168 if (Curr_Letter == Prev_Letter &&
2169 Curr_Letter != 'C')
2170 continue;
2172 switch (Curr_Letter) {
2173 /* B -> B unless in MB */
2174 case 'B':
2175 if (Prev_Letter != 'M')
2176 Phonize('B');
2177 break;
2178 /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
2179 * (SCHW is handled in S)
2180 * S if -CI-, -CE- or -CY-
2181 * dropped if -SCI-, SCE-, -SCY- (handed in S)
2182 * else K
2184 case 'C':
2185 if (MAKESOFT(Next_Letter)) { /* C[IEY] */
2186 if (After_Next_Letter == 'A' &&
2187 Next_Letter == 'I') { /* CIA */
2188 Phonize(SH);
2190 /* SC[IEY] */
2191 else if (Prev_Letter == 'S') {
2192 /* Dropped */
2193 } else {
2194 Phonize('S');
2196 } else if (Next_Letter == 'H') {
2197 if ((!traditional) && (After_Next_Letter == 'R' ||
2198 Prev_Letter == 'S')) { /* Christ, School */
2199 Phonize('K');
2200 } else {
2201 Phonize(SH);
2203 skip_letter++;
2204 } else {
2205 Phonize('K');
2207 break;
2208 /* J if in -DGE-, -DGI- or -DGY-
2209 * else T
2211 case 'D':
2212 if (Next_Letter == 'G' && MAKESOFT(After_Next_Letter)) {
2213 Phonize('J');
2214 skip_letter++;
2215 } else
2216 Phonize('T');
2217 break;
2218 /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
2219 * else dropped if -GNED, -GN,
2220 * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
2221 * else J if in -GE-, -GI, -GY and not GG
2222 * else K
2224 case 'G':
2225 if (Next_Letter == 'H') {
2226 if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) {
2227 Phonize('F');
2228 skip_letter++;
2229 } else {
2230 /* silent */
2232 } else if (Next_Letter == 'N') {
2233 if (Isbreak(After_Next_Letter) ||
2234 (After_Next_Letter == 'E' && Look_Ahead_Letter(3) == 'D')) {
2235 /* dropped */
2236 } else
2237 Phonize('K');
2238 } else if (MAKESOFT(Next_Letter) && Prev_Letter != 'G') {
2239 Phonize('J');
2240 } else {
2241 Phonize('K');
2243 break;
2244 /* H if before a vowel and not after C,G,P,S,T */
2245 case 'H':
2246 if (isvowel(Next_Letter) && !AFFECTH(Prev_Letter))
2247 Phonize('H');
2248 break;
2249 /* dropped if after C
2250 * else K
2252 case 'K':
2253 if (Prev_Letter != 'C')
2254 Phonize('K');
2255 break;
2256 /* F if before H
2257 * else P
2259 case 'P':
2260 if (Next_Letter == 'H') {
2261 Phonize('F');
2262 } else {
2263 Phonize('P');
2265 break;
2266 /* K
2268 case 'Q':
2269 Phonize('K');
2270 break;
2271 /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
2272 * else S
2274 case 'S':
2275 if (Next_Letter == 'I' &&
2276 (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
2277 Phonize(SH);
2278 } else if (Next_Letter == 'H') {
2279 Phonize(SH);
2280 skip_letter++;
2281 } else if ((!traditional) &&
2282 (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' &&
2283 Look_Ahead_Letter(3) == 'W')) {
2284 Phonize(SH);
2285 skip_letter += 2;
2286 } else {
2287 Phonize('S');
2289 break;
2290 /* 'sh' in -TIA- or -TIO-
2291 * else 'th' before H
2292 * else T
2294 case 'T':
2295 if (Next_Letter == 'I' &&
2296 (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
2297 Phonize(SH);
2298 } else if (Next_Letter == 'H') {
2299 Phonize(TH);
2300 skip_letter++;
2301 } else {
2302 Phonize('T');
2304 break;
2305 /* F */
2306 case 'V':
2307 Phonize('F');
2308 break;
2309 /* W before a vowel, else dropped */
2310 case 'W':
2311 if (isvowel(Next_Letter))
2312 Phonize('W');
2313 break;
2314 /* KS */
2315 case 'X':
2316 Phonize('K');
2317 Phonize('S');
2318 break;
2319 /* Y if followed by a vowel */
2320 case 'Y':
2321 if (isvowel(Next_Letter))
2322 Phonize('Y');
2323 break;
2324 /* S */
2325 case 'Z':
2326 Phonize('S');
2327 break;
2328 /* No transformation */
2329 case 'F':
2330 case 'J':
2331 case 'L':
2332 case 'M':
2333 case 'N':
2334 case 'R':
2335 Phonize(Curr_Letter);
2336 break;
2337 default:
2338 /* nothing */
2339 break;
2340 } /* END SWITCH */
2342 w_idx += skip_letter;
2343 } /* END FOR */
2345 return buffer.detach();
2348 ///////////////////////////////////////////////////////////////////////////////
2349 // Cyrillic
/*
 * Code tables for the different Cyrillic charsets (relative to koi8-r).
 * Every table has two halves of 256 entries.  In each half the entries for
 * bytes 0-127 are the identity; only bytes 128-255 are actually remapped.
 * One half converts between koi8-r and the charset, the other half converts
 * back.
 * NOTE(review): the historical comment stated that the first half is
 * koi8-r -> charset and the second half charset -> koi8-r; the data looks
 * like the opposite -- confirm against the lookup code.
 *
 * Here we have the following tables:
 * _cyr_win1251   - for windows-1251 charset
 * _cyr_iso88595  - for iso8859-5 charset
 * _cyr_cp866     - for x-cp866 charset
 * _cyr_mac       - for x-mac-cyrillic charset
 */
typedef unsigned char _cyr_charset_table[512];

static const _cyr_charset_table _cyr_win1251 = {
  /* ---- first half, entries 0-255 ---- */
  /* bytes 0-127: identity */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  /* bytes 128-255 */
  46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
  46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
  154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
  46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
  225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
  242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
  193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
  210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
  /* ---- second half, entries 256-511 ---- */
  /* bytes 0-127: identity */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  /* bytes 128-255 */
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
  32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
  254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
  239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
  222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
  207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
};
2400 static const _cyr_charset_table _cyr_cp866 = {
2401 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2402 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2403 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2404 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2405 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2406 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2407 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2408 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2409 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2410 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2411 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2412 35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
2413 43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
2414 45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
2415 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2416 179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
2417 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2418 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2419 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2420 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2421 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2422 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2423 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2424 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2425 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2426 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2427 205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
2428 199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
2429 238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
2430 175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
2431 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2432 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2435 static const _cyr_charset_table _cyr_iso88595 = {
2436 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2437 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2438 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2439 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2440 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2441 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2442 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2443 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2444 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2445 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2446 32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2447 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2448 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2449 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2450 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2451 32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2452 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2453 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2454 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2455 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2456 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2457 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2458 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2459 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2460 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2461 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2462 32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
2463 32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
2464 238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
2465 223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
2466 206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
2467 191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
/**
 * Translation table selected by the 'M' (x-mac-cyrillic) charset label.
 * It consists of two 256-entry halves. string_convert_cyrillic_string()
 * indexes the first half with the raw input byte when 'M' is the source
 * charset, and indexes the second half (at offset 256) with the intermediate
 * byte when 'M' is the destination charset.
 */
2470 static const _cyr_charset_table _cyr_mac = {
/* entries 0..255: looked up as from_table[byte] */
2471 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2472 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2473 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2474 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2475 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2476 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2477 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2478 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2479 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2480 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2481 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2482 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2483 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2484 144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
2485 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2486 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
/* entries 256..511: looked up as to_table[byte + 256] */
2487 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2488 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2489 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2490 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2491 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2492 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2493 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2494 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2495 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2496 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2497 160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
2498 176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
2499 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2500 239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2501 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2502 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2506 * This is the function that performs real in-place conversion of the string
2507 * between charsets.
2508 * Parameters:
2509 * str - string to be converted
2510 * from,to - one-symbol label of source and destination charset
2511 * The following symbols are used as labels:
2512 * k - koi8-r
2513 * w - windows-1251
2514 * i - iso8859-5
2515 * a - x-cp866
2516 * d - x-cp866
2517 * m - x-mac-cyrillic
2519 String string_convert_cyrillic_string(const String& input, char from, char to) {
2520 const unsigned char *from_table, *to_table;
2521 unsigned char tmp;
2522 const unsigned char *uinput = (unsigned char *)input.slice().ptr;
2523 String retString(input.size(), ReserveString);
2524 unsigned char *str = (unsigned char *)retString.mutableData();
2526 from_table = nullptr;
2527 to_table = nullptr;
2529 switch (toupper((int)(unsigned char)from)) {
2530 case 'W': from_table = _cyr_win1251; break;
2531 case 'A':
2532 case 'D': from_table = _cyr_cp866; break;
2533 case 'I': from_table = _cyr_iso88595; break;
2534 case 'M': from_table = _cyr_mac; break;
2535 case 'K':
2536 break;
2537 default:
2538 throw_invalid_argument("Unknown source charset: %c", from);
2539 break;
2542 switch (toupper((int)(unsigned char)to)) {
2543 case 'W': to_table = _cyr_win1251; break;
2544 case 'A':
2545 case 'D': to_table = _cyr_cp866; break;
2546 case 'I': to_table = _cyr_iso88595; break;
2547 case 'M': to_table = _cyr_mac; break;
2548 case 'K':
2549 break;
2550 default:
2551 throw_invalid_argument("Unknown destination charset: %c", to);
2552 break;
2555 for (int i = 0; i < input.size(); i++) {
2556 tmp = from_table == nullptr ? uinput[i] : from_table[uinput[i]];
2557 str[i] = to_table == nullptr ? tmp : to_table[tmp + 256];
2559 retString.setSize(input.size());
2560 return retString;
2563 ///////////////////////////////////////////////////////////////////////////////
2564 // Hebrew
/* Kinds of text run distinguished by string_convert_hebrew_string(). */
#define HEB_BLOCK_TYPE_ENG 1
#define HEB_BLOCK_TYPE_HEB 2

/*
 * Byte classifiers used by the Hebrew conversion. Each one reduces its
 * argument to an unsigned char first, so plain (possibly signed) chars can be
 * passed directly, and yields 1 or 0.
 *
 * The argument is parenthesized so that compound expressions such as
 * isheb(base + offset) are converted as a whole rather than only their first
 * operand. It is still evaluated twice, so it must be free of side effects.
 */

/* 1 when the byte lies in the range 224..250, inclusive. */
#define isheb(c) \
  ((((unsigned char)(c)) >= 224 && ((unsigned char)(c)) <= 250) ? 1 : 0)

/* 1 for a space or a horizontal tab. */
#define _isblank(c) \
  ((((unsigned char)(c)) == ' ' || ((unsigned char)(c)) == '\t') ? 1 : 0)

/* 1 for a line feed or a carriage return. */
#define _isnewline(c) \
  ((((unsigned char)(c)) == '\n' || ((unsigned char)(c)) == '\r') ? 1 : 0)
2577 * Converts Logical Hebrew text (Hebrew Windows style) to Visual text
2578 * Cheers/complaints/flames - Zeev Suraski <zeev@php.net>
2580 String string_convert_hebrew_string(const String& inStr,
2581 int max_chars_per_line,
2582 int convert_newlines) {
2583 assert(!inStr.empty());
2584 auto str = inStr.data();
2585 auto str_len = inStr.size();
2586 const char *tmp;
2587 char *heb_str, *broken_str;
2588 char *target;
2589 int block_start, block_end, block_type, block_length, i;
2590 long max_chars=0;
2591 int begin, end, char_count, orig_begin;
2593 tmp = str;
2594 block_start=block_end=0;
2596 heb_str = (char *) req::malloc(str_len + 1);
2597 SCOPE_EXIT { req::free(heb_str); };
2598 target = heb_str+str_len;
2599 *target = 0;
2600 target--;
2602 block_length=0;
2604 if (isheb(*tmp)) {
2605 block_type = HEB_BLOCK_TYPE_HEB;
2606 } else {
2607 block_type = HEB_BLOCK_TYPE_ENG;
2610 do {
2611 if (block_type == HEB_BLOCK_TYPE_HEB) {
2612 while ((isheb((int)*(tmp+1)) ||
2613 _isblank((int)*(tmp+1)) ||
2614 ispunct((int)*(tmp+1)) ||
2615 (int)*(tmp+1)=='\n' ) && block_end<str_len-1) {
2616 tmp++;
2617 block_end++;
2618 block_length++;
2620 for (i = block_start; i<= block_end; i++) {
2621 *target = str[i];
2622 switch (*target) {
2623 case '(': *target = ')'; break;
2624 case ')': *target = '('; break;
2625 case '[': *target = ']'; break;
2626 case ']': *target = '['; break;
2627 case '{': *target = '}'; break;
2628 case '}': *target = '{'; break;
2629 case '<': *target = '>'; break;
2630 case '>': *target = '<'; break;
2631 case '\\': *target = '/'; break;
2632 case '/': *target = '\\'; break;
2633 default:
2634 break;
2636 target--;
2638 block_type = HEB_BLOCK_TYPE_ENG;
2639 } else {
2640 while (!isheb(*(tmp+1)) &&
2641 (int)*(tmp+1)!='\n' && block_end < str_len-1) {
2642 tmp++;
2643 block_end++;
2644 block_length++;
2646 while ((_isblank((int)*tmp) ||
2647 ispunct((int)*tmp)) && *tmp!='/' &&
2648 *tmp!='-' && block_end > block_start) {
2649 tmp--;
2650 block_end--;
2652 for (i = block_end; i >= block_start; i--) {
2653 *target = str[i];
2654 target--;
2656 block_type = HEB_BLOCK_TYPE_HEB;
2658 block_start=block_end+1;
2659 } while (block_end < str_len-1);
2661 String brokenStr(str_len, ReserveString);
2662 broken_str = brokenStr.mutableData();
2663 begin=end=str_len-1;
2664 target = broken_str;
2666 while (1) {
2667 char_count=0;
2668 while ((!max_chars || char_count < max_chars) && begin > 0) {
2669 char_count++;
2670 begin--;
2671 if (begin <= 0 || _isnewline(heb_str[begin])) {
2672 while (begin > 0 && _isnewline(heb_str[begin-1])) {
2673 begin--;
2674 char_count++;
2676 break;
2679 if (char_count == max_chars) { /* try to avoid breaking words */
2680 int new_char_count=char_count, new_begin=begin;
2682 while (new_char_count > 0) {
2683 if (_isblank(heb_str[new_begin]) || _isnewline(heb_str[new_begin])) {
2684 break;
2686 new_begin++;
2687 new_char_count--;
2689 if (new_char_count > 0) {
2690 char_count=new_char_count;
2691 begin=new_begin;
2694 orig_begin=begin;
2696 if (_isblank(heb_str[begin])) {
2697 heb_str[begin]='\n';
2699 while (begin <= end && _isnewline(heb_str[begin])) {
2700 /* skip leading newlines */
2701 begin++;
2703 for (i = begin; i <= end; i++) { /* copy content */
2704 *target = heb_str[i];
2705 target++;
2707 for (i = orig_begin; i <= end && _isnewline(heb_str[i]); i++) {
2708 *target = heb_str[i];
2709 target++;
2711 begin=orig_begin;
2713 if (begin <= 0) {
2714 *target = 0;
2715 break;
2717 begin--;
2718 end=begin;
2721 if (convert_newlines) {
2722 int count;
2723 auto ret = string_replace(broken_str, str_len, "\n", strlen("\n"),
2724 "<br />\n", strlen("<br />\n"), count, true);
2725 if (!ret.isNull()) {
2726 return ret;
2729 brokenStr.setSize(str_len);
2730 return brokenStr;
2733 #if defined(__APPLE__)
/**
 * Fallback memrchr() for builds where __APPLE__ is defined: finds the last
 * occurrence of a byte in a memory area.
 *
 * @param s  start of the area to search
 * @param c  byte to look for; only its value converted to unsigned char is
 *           used, so values 0x80..0xFF match regardless of char signedness
 * @param n  number of bytes in the area
 * @return   pointer to the last matching byte, or a null pointer when the
 *           byte does not occur (always the case for n == 0)
 */
void *memrchr(const void *s, int c, size_t n) {
  const unsigned char *bytes = (const unsigned char *)s;
  const unsigned char wanted = (unsigned char)c;

  // Count the index down instead of walking a pointer below `s`: forming a
  // pointer before the start of the buffer is undefined behavior.
  while (n > 0) {
    n--;
    if (bytes[n] == wanted) {
      return (void *)(bytes + n);
    }
  }
  return NULL;
}
2742 #endif
2744 ///////////////////////////////////////////////////////////////////////////////