2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #include "hphp/runtime/base/zend-string.h"
19 #include "hphp/runtime/base/zend-printf.h"
20 #include "hphp/runtime/base/zend-math.h"
22 #include "hphp/util/lock.h"
23 #include "hphp/util/overflow.h"
27 #include "hphp/util/bstring.h"
28 #include "hphp/runtime/base/exceptions.h"
29 #include "hphp/runtime/base/string-buffer.h"
30 #include "hphp/runtime/base/runtime-error.h"
31 #include "hphp/runtime/base/type-conversions.h"
32 #include "hphp/runtime/base/string-util.h"
33 #include "hphp/runtime/base/builtin-functions.h"
38 ( sizeof (x) == sizeof(float ) ? __inline_isnanf((float)(x)) \
39 : sizeof (x) == sizeof(double) ? __inline_isnand((double)(x)) \
40 : __inline_isnan ((long double)(x)))
45 ( sizeof (x) == sizeof(float ) ? __inline_isinff((float)(x)) \
46 : sizeof (x) == sizeof(double) ? __inline_isinfd((double)(x)) \
47 : __inline_isinf ((long double)(x)))
// Line-length limit used by string_quoted_printable_encode: its running
// counter `lp` is compared against this value, and (PHP_QPRINT_MAXL - 9)
// appears in the sizing of the output buffer.
52 #define PHP_QPRINT_MAXL 75
55 ///////////////////////////////////////////////////////////////////////////////
58 bool string_substr_check(int len
, int &f
, int &l
) {
59 if (l
< 0 && -l
> len
) {
67 } else if (f
< 0 && -f
> len
) {
71 if (l
< 0 && (l
+ len
- f
) < 0) {
75 // if "from" position is negative, count start position from the end
86 // if "length" position is negative, set it to the length
87 // needed to stop that many chars from the end of the string
94 if ((unsigned int)f
+ (unsigned int)l
> (unsigned int)len
) {
100 void string_charmask(const char *sinput
, int len
, char *mask
) {
101 const unsigned char *input
= (unsigned char *)sinput
;
102 const unsigned char *end
;
105 memset(mask
, 0, 256);
106 for (end
= input
+len
; input
< end
; input
++) {
108 if ((input
+3 < end
) && input
[1] == '.' && input
[2] == '.'
110 memset(mask
+c
, 1, input
[3] - c
+ 1);
112 } else if ((input
+1 < end
) && input
[0] == '.' && input
[1] == '.') {
113 /* Error, try to be as helpful as possible:
114 (a range ending/starting with '.' won't be captured here) */
115 if (end
-len
>= input
) { /* there was no 'left' char */
116 throw_invalid_argument
117 ("charlist: Invalid '..'-range, missing left of '..'");
120 if (input
+2 >= end
) { /* there is no 'right' char */
121 throw_invalid_argument
122 ("charlist: Invalid '..'-range, missing right of '..'");
125 if (input
[-1] > input
[2]) { /* wrong order */
126 throw_invalid_argument
127 ("charlist: '..'-range needs to be incrementing");
130 /* FIXME: better error (a..b..c is the only left possibility?) */
131 throw_invalid_argument("charlist: Invalid '..'-range");
139 int string_copy(char *dst
, const char *src
, int siz
) {
140 register char *d
= dst
;
141 register const char *s
= src
;
142 register size_t n
= siz
;
144 /* Copy as many bytes as will fit */
145 if (n
!= 0 && --n
!= 0) {
147 if ((*d
++ = *s
++) == 0)
152 /* Not enough room in dst, add NUL and traverse rest of src */
155 *d
= '\0'; /* NUL-terminate dst */
160 return(s
- src
- 1); /* count does not include NUL */
163 ///////////////////////////////////////////////////////////////////////////////
166 int string_ncmp(const char *s1
, const char *s2
, int len
) {
167 for (int i
= 0; i
< len
; i
++) {
170 if (c1
> c2
) return 1;
171 if (c1
< c2
) return -1;
176 static int compare_right(char const **a
, char const *aend
,
177 char const **b
, char const *bend
) {
180 /* The longest run of digits wins. That aside, the greatest
181 value wins, but we can't know that it will until we've scanned
182 both numbers to know that they have the same magnitude, so we
183 remember it in BIAS. */
184 for(;; (*a
)++, (*b
)++) {
185 if ((*a
== aend
|| !isdigit((int)(unsigned char)**a
)) &&
186 (*b
== bend
|| !isdigit((int)(unsigned char)**b
)))
188 else if (*a
== aend
|| !isdigit((int)(unsigned char)**a
))
190 else if (*b
== bend
|| !isdigit((int)(unsigned char)**b
))
192 else if (**a
< **b
) {
195 } else if (**a
> **b
) {
204 static int compare_left(char const **a
, char const *aend
,
205 char const **b
, char const *bend
) {
206 /* Compare two left-aligned numbers: the first to have a
207 different value wins. */
208 for(;; (*a
)++, (*b
)++) {
209 if ((*a
== aend
|| !isdigit((int)(unsigned char)**a
)) &&
210 (*b
== bend
|| !isdigit((int)(unsigned char)**b
)))
212 else if (*a
== aend
|| !isdigit((int)(unsigned char)**a
))
214 else if (*b
== bend
|| !isdigit((int)(unsigned char)**b
))
225 int string_natural_cmp(char const *a
, size_t a_len
,
226 char const *b
, size_t b_len
, int fold_case
) {
229 char const *aend
= a
+ a_len
, *bend
= b
+ b_len
;
230 int fractional
, result
;
232 if (a_len
== 0 || b_len
== 0)
233 return a_len
- b_len
;
240 /* skip over leading spaces or zeros */
241 while (isspace((int)(unsigned char)ca
))
244 while (isspace((int)(unsigned char)cb
))
247 /* process run of digits */
248 if (isdigit((int)(unsigned char)ca
) && isdigit((int)(unsigned char)cb
)) {
249 fractional
= (ca
== '0' || cb
== '0');
252 result
= compare_left(&ap
, aend
, &bp
, bend
);
254 result
= compare_right(&ap
, aend
, &bp
, bend
);
258 else if (ap
== aend
&& bp
== bend
)
259 /* End of the strings. Let caller sort them out. */
262 /* Keep on comparing from the current point. */
268 ca
= toupper((int)(unsigned char)ca
);
269 cb
= toupper((int)(unsigned char)cb
);
278 if (ap
>= aend
&& bp
>= bend
)
279 /* The strings compare the same. Perhaps the caller
280 will want to call strcmp to break the tie. */
289 ///////////////////////////////////////////////////////////////////////////////
291 void string_to_case(String
& s
, int (*tocase
)(int)) {
294 auto data
= s
.mutableData();
296 for (int i
= 0; i
< len
; i
++) {
297 data
[i
] = tocase(data
[i
]);
301 ///////////////////////////////////////////////////////////////////////////////
// Padding-side selectors. Presumably the values accepted as `pad_type` by
// string_pad (left only / right only / split across both sides) -- the
// switch that consumes them is not visible here; TODO confirm.
303 #define STR_PAD_LEFT 0
304 #define STR_PAD_RIGHT 1
305 #define STR_PAD_BOTH 2
307 String
string_pad(const char *input
, int len
, int pad_length
,
308 const char *pad_string
, int pad_str_len
,
311 int num_pad_chars
= pad_length
- len
;
313 /* If resulting string turns out to be shorter than input string,
314 we simply copy the input and return. */
315 if (pad_length
< 0 || num_pad_chars
< 0) {
316 return String(input
, len
, CopyString
);
319 /* Setup the padding string values if specified. */
320 if (pad_str_len
== 0) {
321 throw_invalid_argument("pad_string: (empty)");
325 String
ret(pad_length
, ReserveString
);
326 char *result
= ret
.mutableData();
328 /* We need to figure out the left/right padding lengths. */
329 int left_pad
, right_pad
;
333 right_pad
= num_pad_chars
;
336 left_pad
= num_pad_chars
;
340 left_pad
= num_pad_chars
/ 2;
341 right_pad
= num_pad_chars
- left_pad
;
344 throw_invalid_argument("pad_type: %d", pad_type
);
348 /* First we pad on the left. */
350 for (int i
= 0; i
< left_pad
; i
++) {
351 result
[result_len
++] = pad_string
[i
% pad_str_len
];
354 /* Then we copy the input string. */
355 memcpy(result
+ result_len
, input
, len
);
358 /* Finally, we pad on the right. */
359 for (int i
= 0; i
< right_pad
; i
++) {
360 result
[result_len
++] = pad_string
[i
% pad_str_len
];
362 ret
.setSize(result_len
);
366 ///////////////////////////////////////////////////////////////////////////////
368 int string_find(const char *input
, int len
, char ch
, int pos
,
369 bool case_sensitive
) {
371 if (pos
< 0 || pos
> len
) {
375 if (case_sensitive
) {
376 ptr
= memchr(input
+ pos
, ch
, len
- pos
);
378 ptr
= bstrcasechr(input
+ pos
, ch
, len
- pos
);
380 if (ptr
!= nullptr) {
381 return (int)((const char *)ptr
- input
);
386 int string_rfind(const char *input
, int len
, char ch
, int pos
,
387 bool case_sensitive
) {
389 if (pos
< -len
|| pos
> len
) {
393 if (case_sensitive
) {
395 ptr
= memrchr(input
+ pos
, ch
, len
- pos
);
397 ptr
= memrchr(input
, ch
, len
+ pos
+ 1);
401 ptr
= bstrrcasechr(input
+ pos
, ch
, len
- pos
);
403 ptr
= bstrrcasechr(input
, ch
, len
+ pos
+ 1);
406 if (ptr
!= nullptr) {
407 return (int)((const char *)ptr
- input
);
412 int string_find(const char *input
, int len
, const char *s
, int s_len
,
413 int pos
, bool case_sensitive
) {
416 if (!s_len
|| pos
< 0 || pos
> len
) {
420 if (case_sensitive
) {
421 ptr
= (void*)string_memnstr(input
+ pos
, s
, s_len
, input
+ len
);
423 ptr
= bstrcasestr(input
+ pos
, len
- pos
, s
, s_len
);
425 if (ptr
!= nullptr) {
426 return (int)((const char *)ptr
- input
);
431 int string_rfind(const char *input
, int len
, const char *s
, int s_len
,
432 int pos
, bool case_sensitive
) {
435 if (!s_len
|| pos
< -len
|| pos
> len
) {
439 if (case_sensitive
) {
441 ptr
= bstrrstr(input
+ pos
, len
- pos
, s
, s_len
);
443 ptr
= bstrrstr(input
, len
+ pos
+ s_len
, s
, s_len
);
447 ptr
= bstrrcasestr(input
+ pos
, len
- pos
, s
, s_len
);
449 ptr
= bstrrcasestr(input
, len
+ pos
+ s_len
, s
, s_len
);
452 if (ptr
!= nullptr) {
453 return (int)((const char *)ptr
- input
);
458 const char *string_memnstr(const char *haystack
, const char *needle
,
459 int needle_len
, const char *end
) {
460 const char *p
= haystack
;
461 char ne
= needle
[needle_len
-1];
465 if ((p
= (char *)memchr(p
, *needle
, (end
-p
+1))) && ne
== p
[needle_len
-1]) {
466 if (!memcmp(needle
, p
, needle_len
-1)) {
478 String
string_replace(const char *s
, int len
, int start
, int length
,
479 const char *replacement
, int len_repl
) {
484 // if "start" position is negative, count start position from the end
495 // if "length" position is negative, set it to the length
496 // needed to stop that many chars from the end of the string
498 length
= (len
- start
) + length
;
503 // check if length is too large
507 // check if the length is too large adjusting for non-zero start
508 // Write this way instead of start + length > len to avoid overflow
509 if (length
> len
- start
) {
510 length
= len
- start
;
513 String
retString(len
+ len_repl
- length
, ReserveString
);
514 char *ret
= retString
.mutableData();
518 memcpy(ret
, s
, start
);
522 memcpy(ret
+ ret_len
, replacement
, len_repl
);
525 len
-= (start
+ length
);
527 memcpy(ret
+ ret_len
, s
+ start
+ length
, len
);
530 retString
.setSize(ret_len
);
534 String
string_replace(const char *input
, int len
,
535 const char *search
, int len_search
,
536 const char *replacement
, int len_replace
,
537 int &count
, bool case_sensitive
) {
539 assert(search
&& len_search
);
541 assert(len_search
>= 0);
542 assert(len_replace
>= 0);
548 req::vector
<int> founds
;
550 if (len_search
== 1) {
551 for (int pos
= string_find(input
, len
, *search
, 0, case_sensitive
);
553 pos
= string_find(input
, len
, *search
, pos
+ len_search
,
555 founds
.push_back(pos
);
558 for (int pos
= string_find(input
, len
, search
, len_search
, 0,
561 pos
= string_find(input
, len
, search
, len_search
,
562 pos
+ len_search
, case_sensitive
)) {
563 founds
.push_back(pos
);
567 count
= founds
.size();
569 return String(); // not found
574 // Make sure the new size of the string wouldn't overflow int32_t. Don't
575 // bother if the replacement wouldn't make the string longer.
576 if (len_replace
> len_search
) {
577 auto raise
= [&] { raise_error("String too large"); };
578 if (mul_overflow(len_replace
- len_search
, count
)) {
581 int diff
= (len_replace
- len_search
) * count
;
582 if (add_overflow(len
, diff
)) {
585 reserve
= len
+ diff
;
587 reserve
= len
+ (len_replace
- len_search
) * count
;
590 String
retString(reserve
, ReserveString
);
591 char *ret
= retString
.mutableData();
593 int pos
= 0; // last position in input that hasn't been copied over yet
595 for (unsigned int i
= 0; i
< founds
.size(); i
++) {
605 memcpy(p
, replacement
, len_replace
);
617 retString
.setSize(p
- ret
);
621 ///////////////////////////////////////////////////////////////////////////////
623 String
string_chunk_split(const char *src
, int srclen
, const char *end
,
624 int endlen
, int chunklen
) {
625 int chunks
= srclen
/ chunklen
; // complete chunks!
626 int restlen
= srclen
- chunks
* chunklen
; /* srclen % chunklen */
636 char *dest
= ret
.mutableData();
638 const char *p
; char *q
;
639 const char *pMax
= src
+ srclen
- chunklen
+ 1;
640 for (p
= src
, q
= dest
; p
< pMax
; ) {
641 memcpy(q
, p
, chunklen
);
643 memcpy(q
, end
, endlen
);
649 memcpy(q
, p
, restlen
);
651 memcpy(q
, end
, endlen
);
655 ret
.setSize(q
- dest
);
659 ///////////////////////////////////////////////////////////////////////////////
// Size step for the tag buffer in string_strip_tags: the buffer is first
// allocated with PHP_TAG_BUF_SIZE + 1 bytes and, once (tp - tbuf) reaches
// PHP_TAG_BUF_SIZE, reallocated to (tp - tbuf) + PHP_TAG_BUF_SIZE + 1 bytes.
661 #define PHP_TAG_BUF_SIZE 1023
664 * Check if tag is in a set of tags
669 * 1 first non-whitespace char seen
671 static int string_tag_find(const char *tag
, int len
, const char *set
) {
681 norm
= (char *)req::malloc(len
+1);
687 normalize the tag removing leading and trailing whitespace
688 and turn any <a whatever...> into just <a> and any </tag>
700 if (!isspace((int)c
)) {
717 if (strstr(set
, norm
)) {
727 * A simple little state-machine to strip out html and php tags
729 * State 0 is the output state, State 1 means we are inside a
730 * normal html tag and state 2 means we are inside a php tag.
732 * The state variable is passed in to allow a function like fgetss
733 * to maintain state across calls to the function.
735 * lc holds the last significant character read and br is a bracket
738 * When an allow string is passed in we keep track of the string
739 * in state 1 and when the tag is closed check it against the
740 * allow string to see if we should allow it.
742 * swm: Added ability to strip <?xml tags without assuming it PHP
745 String
string_strip_tags(const char *s
, const int len
,
746 const char *allow
, const int allow_len
,
747 bool allow_tag_spaces
) {
748 const char *abuf
, *p
;
749 char *rbuf
, *tbuf
, *tp
, *rp
, c
, lc
;
751 int br
, i
=0, depth
=0, in_q
= 0;
757 String
retString(s
, len
, CopyString
);
758 rbuf
= retString
.mutableData();
769 allowString
= String(allow_len
, ReserveString
);
770 char *atmp
= allowString
.mutableData();
771 for (const char *tmp
= allow
; *tmp
; tmp
++, atmp
++) {
772 *atmp
= tolower((int)*(const unsigned char *)tmp
);
774 allowString
.setSize(allow_len
);
775 abuf
= allowString
.data();
777 tbuf
= (char *)req::malloc(PHP_TAG_BUF_SIZE
+1);
784 auto move
= [&pos
, &tbuf
, &tp
]() {
785 if (tp
- tbuf
>= PHP_TAG_BUF_SIZE
) {
787 tbuf
= (char*)req::realloc(tbuf
, (tp
- tbuf
) + PHP_TAG_BUF_SIZE
+ 1);
797 if (isspace(*(p
+ 1)) && !allow_tag_spaces
) {
807 } else if (state
== 1) {
814 if (lc
!= '"' && lc
!= '\'') {
818 } else if (allow_len
&& state
== 1) {
821 } else if (state
== 0) {
828 if (lc
!= '"' && lc
!= '\'') {
832 } else if (allow_len
&& state
== 1) {
835 } else if (state
== 0) {
851 case 1: /* HTML/XML */
858 if (string_tag_find(tbuf
, tp
-tbuf
, abuf
)) {
859 memcpy(rp
, tbuf
, tp
-tbuf
);
867 if (!br
&& lc
!= '\"' && *(p
-1) == '?') {
878 case 4: /* JavaScript/CSS/etc... */
879 if (p
>= s
+ 2 && *(p
-1) == '-' && *(p
-2) == '-') {
894 /* Inside <!-- comment --> */
896 } else if (state
== 2 && *(p
-1) != '\\') {
899 } else if (lc
!= '\\') {
902 } else if (state
== 0) {
904 } else if (allow_len
&& state
== 1) {
908 if (state
&& p
!= s
&& *(p
-1) != '\\' && (!in_q
|| *p
== in_q
)) {
918 /* JavaScript & Other HTML scripting languages */
919 if (state
== 1 && *(p
-1) == '<') {
925 } else if (allow_len
&& state
== 1) {
933 if (state
== 3 && p
>= s
+ 2 && *(p
-1) == '-' && *(p
-2) == '!') {
942 if (state
== 1 && *(p
-1) == '<') {
950 /* !DOCTYPE exception */
951 if (state
==3 && p
> s
+6
952 && tolower(*(p
-1)) == 'p'
953 && tolower(*(p
-2)) == 'y'
954 && tolower(*(p
-3)) == 't'
955 && tolower(*(p
-4)) == 'c'
956 && tolower(*(p
-5)) == 'o'
957 && tolower(*(p
-6)) == 'd') {
965 /* swm: If we encounter '<?xml' then we shouldn't be in
966 * state == 2 (PHP). Switch back to HTML.
969 if (state
== 2 && p
> s
+2 && *(p
-1) == 'm' && *(p
-2) == 'x') {
979 } else if (allow_len
&& state
== 1) {
988 if (rp
< rbuf
+ len
) {
995 retString
.setSize(rp
- rbuf
);
999 ///////////////////////////////////////////////////////////////////////////////
1001 String
string_addslashes(const char *str
, int length
) {
1007 String
retString((length
<< 1) + 1, ReserveString
);
1008 char *new_str
= retString
.mutableData();
1009 const char *source
= str
;
1010 const char *end
= source
+ length
;
1011 char *target
= new_str
;
1013 while (source
< end
) {
1023 /* break is missing *intentionally* */
1025 *target
++ = *source
;
1032 retString
.setSize(target
- new_str
);
1036 ///////////////////////////////////////////////////////////////////////////////
1038 static char string_hex2int(int c
) {
1042 if (c
>= 'A' && c
<= 'F') {
1043 return c
- 'A' + 10;
1045 if (c
>= 'a' && c
<= 'f') {
1046 return c
- 'a' + 10;
1051 String
string_quoted_printable_encode(const char *input
, int len
) {
1052 size_t length
= len
;
1053 const unsigned char *str
= (unsigned char*)input
;
1055 unsigned long lp
= 0;
1058 char *hex
= "0123456789ABCDEF";
1063 length
+ ((safe_address(3, length
, 0)/(PHP_QPRINT_MAXL
-9)) + 1),
1067 d
= buffer
= ret
.mutableData();
1070 if (((c
= *str
++) == '\015') && (*str
== '\012') && length
> 0) {
1076 if (iscntrl (c
) || (c
== 0x7f) || (c
& 0x80) ||
1077 (c
== '=') || ((c
== ' ') && (*str
== '\015'))) {
1078 if ((((lp
+= 3) > PHP_QPRINT_MAXL
) && (c
<= 0x7f))
1079 || ((c
> 0x7f) && (c
<= 0xdf) && ((lp
+ 3) > PHP_QPRINT_MAXL
))
1080 || ((c
> 0xdf) && (c
<= 0xef) && ((lp
+ 6) > PHP_QPRINT_MAXL
))
1081 || ((c
> 0xef) && (c
<= 0xf4) && ((lp
+ 9) > PHP_QPRINT_MAXL
))) {
1089 *d
++ = hex
[c
& 0xf];
1091 if ((++lp
) > PHP_QPRINT_MAXL
) {
1107 String
string_quoted_printable_decode(const char *input
, int len
, bool is_q
) {
1113 int i
= 0, j
= 0, k
;
1114 const char *str_in
= input
;
1115 String
ret(len
, ReserveString
);
1116 char *str_out
= ret
.mutableData();
1117 while (i
< len
&& str_in
[i
]) {
1118 switch (str_in
[i
]) {
1120 if (i
+ 2 < len
&& str_in
[i
+ 1] && str_in
[i
+ 2] &&
1121 isxdigit((int) str_in
[i
+ 1]) && isxdigit((int) str_in
[i
+ 2]))
1123 str_out
[j
++] = (string_hex2int((int) str_in
[i
+ 1]) << 4)
1124 + string_hex2int((int) str_in
[i
+ 2]);
1126 } else /* check for soft line break according to RFC 2045*/ {
1128 while (str_in
[i
+ k
] &&
1129 ((str_in
[i
+ k
] == 32) || (str_in
[i
+ k
] == 9))) {
1130 /* Possibly, skip spaces/tabs at the end of line */
1133 if (!str_in
[i
+ k
]) {
1134 /* End of line reached */
1137 else if ((str_in
[i
+ k
] == 13) && (str_in
[i
+ k
+ 1] == 10)) {
1141 else if ((str_in
[i
+ k
] == 13) || (str_in
[i
+ k
] == 10)) {
1146 str_out
[j
++] = str_in
[i
++];
1155 str_out
[j
++] = str_in
[i
++];
1159 str_out
[j
++] = str_in
[i
++];
1166 Variant
string_base_to_numeric(const char *s
, int len
, int base
) {
1173 assert(string_validate_base(base
));
1175 cutoff
= LONG_MAX
/ base
;
1176 cutlim
= LONG_MAX
% base
;
1178 for (int i
= len
; i
> 0; i
--) {
1181 /* might not work for EBCDIC */
1182 if (c
>= '0' && c
<= '9')
1184 else if (c
>= 'A' && c
<= 'Z')
1186 else if (c
>= 'a' && c
<= 'z')
1195 case 0: /* Integer */
1196 if (num
< cutoff
|| (num
== cutoff
&& c
<= cutlim
)) {
1197 num
= num
* base
+ c
;
1205 fnum
= fnum
* base
+ c
;
1215 String
string_long_to_base(unsigned long value
, int base
) {
1216 static char digits
[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1217 char buf
[(sizeof(unsigned long) << 3) + 1];
1220 assert(string_validate_base(base
));
1222 end
= ptr
= buf
+ sizeof(buf
) - 1;
1225 *--ptr
= digits
[value
% base
];
1227 } while (ptr
> buf
&& value
);
1229 return String(ptr
, end
- ptr
, CopyString
);
1232 String
string_numeric_to_base(const Variant
& value
, int base
) {
1233 static char digits
[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1235 assert(string_validate_base(base
));
1236 if ((!value
.isInteger() && !value
.isDouble())) {
1237 return empty_string();
1240 if (value
.isDouble()) {
1241 double fvalue
= floor(value
.toDouble()); /* floor it just in case */
1243 char buf
[(sizeof(double) << 3) + 1];
1245 /* Don't try to convert +/- infinity */
1246 if (fvalue
== HUGE_VAL
|| fvalue
== -HUGE_VAL
) {
1247 raise_warning("Number too large");
1248 return empty_string();
1251 end
= ptr
= buf
+ sizeof(buf
) - 1;
1254 *--ptr
= digits
[(int) fmod(fvalue
, base
)];
1256 } while (ptr
> buf
&& fabs(fvalue
) >= 1);
1258 return String(ptr
, end
- ptr
, CopyString
);
1261 return string_long_to_base(value
.toInt64(), base
);
1264 ///////////////////////////////////////////////////////////////////////////////
/*
 * uuencode helper macros.
 *
 * PHP_UU_ENC(c)    - map a value to its uuencoded character: zero becomes
 *                    '`', anything else is its low six bits plus ' '.
 * PHP_UU_ENC_C2(c) - second output character of a 3-byte group at pointer c:
 *                    low two bits of c[0] and high four bits of c[1].
 * PHP_UU_ENC_C3(c) - third output character of a 3-byte group at pointer c:
 *                    low four bits of c[1] and high two bits of c[2].
 *
 * Every use of the argument is parenthesized so that any pointer-valued
 * expression (not just a plain identifier) expands correctly.  The argument
 * is still evaluated more than once, so it must be free of side effects.
 */
#define PHP_UU_ENC(c) \
  ((c) ? ((c) & 077) + ' ' : '`')
#define PHP_UU_ENC_C2(c) \
  PHP_UU_ENC(((*(c) << 4) & 060) | ((*((c) + 1) >> 4) & 017))
#define PHP_UU_ENC_C3(c) \
  PHP_UU_ENC(((*((c) + 1) << 2) & 074) | ((*((c) + 2) >> 6) & 03))
1273 #define PHP_UU_DEC(c) \
1276 String
string_uuencode(const char *src
, int src_len
) {
1282 const char *s
, *e
, *ee
;
1285 /* encoded length is ~ 38% greater than the original */
1286 String
ret((int)ceil(src_len
* 1.38) + 45, ReserveString
);
1287 p
= dest
= ret
.mutableData();
1291 while ((s
+ 3) < e
) {
1297 ee
= s
+ (int) (floor(len
/ 3) * 3);
1300 *p
++ = PHP_UU_ENC(len
);
1303 *p
++ = PHP_UU_ENC(*s
>> 2);
1304 *p
++ = PHP_UU_ENC_C2(s
);
1305 *p
++ = PHP_UU_ENC_C3(s
);
1306 *p
++ = PHP_UU_ENC(*(s
+ 2) & 077);
1318 *p
++ = PHP_UU_ENC(e
- s
);
1322 *p
++ = PHP_UU_ENC(*s
>> 2);
1323 *p
++ = PHP_UU_ENC_C2(s
);
1324 *p
++ = ((e
- s
) > 1) ? PHP_UU_ENC_C3(s
) : PHP_UU_ENC('\0');
1325 *p
++ = ((e
- s
) > 2) ? PHP_UU_ENC(*(s
+ 2) & 077) : PHP_UU_ENC('\0');
1332 *p
++ = PHP_UU_ENC('\0');
1336 ret
.setSize(p
- dest
);
1340 String
string_uudecode(const char *src
, int src_len
) {
1343 const char *s
, *e
, *ee
;
1346 String
ret(ceil(src_len
* 0.75), ReserveString
);
1347 p
= dest
= ret
.mutableData();
1352 if ((len
= PHP_UU_DEC(*s
++)) <= 0) {
1356 if (len
> src_len
) {
1362 ee
= s
+ (len
== 45 ? 60 : (int) floor(len
* 1.33));
1369 if (s
+ 4 > e
) goto err
;
1371 *p
++ = PHP_UU_DEC(*s
) << 2 | PHP_UU_DEC(*(s
+ 1)) >> 4;
1372 *p
++ = PHP_UU_DEC(*(s
+ 1)) << 4 | PHP_UU_DEC(*(s
+ 2)) >> 2;
1373 *p
++ = PHP_UU_DEC(*(s
+ 2)) << 6 | PHP_UU_DEC(*(s
+ 3));
1385 if ((len
= total_len
> (p
- dest
))) {
1386 *p
++ = PHP_UU_DEC(*s
) << 2 | PHP_UU_DEC(*(s
+ 1)) >> 4;
1388 *p
++ = PHP_UU_DEC(*(s
+ 1)) << 4 | PHP_UU_DEC(*(s
+ 2)) >> 2;
1390 *p
++ = PHP_UU_DEC(*(s
+ 2)) << 6 | PHP_UU_DEC(*(s
+ 3));
1395 ret
.setSize(total_len
);
1402 ///////////////////////////////////////////////////////////////////////////////
1405 static const char base64_table
[] = {
1406 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
1407 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
1408 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
1409 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
1410 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', '\0'
// Base64 padding character; php_base64_decode compares each input byte
// against it to detect the padded tail of the input.
1413 static const char base64_pad
= '=';
1415 static const short base64_reverse_table
[256] = {
1416 -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
1417 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1418 -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
1419 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
1420 -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1421 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
1422 -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
1423 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
1424 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1425 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1426 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1427 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1428 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1429 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1430 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1431 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
1434 static String
php_base64_encode(const unsigned char *str
, int length
) {
1435 const unsigned char *current
= str
;
1437 unsigned char *result
;
1439 if ((length
+ 2) < 0 || ((length
+ 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) {
1443 String
ret(((length
+ 2) / 3) * 4, ReserveString
);
1444 p
= result
= (unsigned char *)ret
.mutableData();
1446 while (length
> 2) { /* keep going until we have less than 24 bits */
1447 *p
++ = base64_table
[current
[0] >> 2];
1448 *p
++ = base64_table
[((current
[0] & 0x03) << 4) + (current
[1] >> 4)];
1449 *p
++ = base64_table
[((current
[1] & 0x0f) << 2) + (current
[2] >> 6)];
1450 *p
++ = base64_table
[current
[2] & 0x3f];
1453 length
-= 3; /* we just handle 3 octets of data */
1456 /* now deal with the tail end of things */
1458 *p
++ = base64_table
[current
[0] >> 2];
1460 *p
++ = base64_table
[((current
[0] & 0x03) << 4) + (current
[1] >> 4)];
1461 *p
++ = base64_table
[(current
[1] & 0x0f) << 2];
1464 *p
++ = base64_table
[(current
[0] & 0x03) << 4];
1469 ret
.setSize(p
- result
);
1473 static String
php_base64_decode(const char *str
, int length
, bool strict
) {
1474 const unsigned char *current
= (unsigned char*)str
;
1475 int ch
, i
= 0, j
= 0, k
;
1476 /* this sucks for threaded environments */
1478 String
retString(length
, ReserveString
);
1479 unsigned char* result
= (unsigned char*)retString
.mutableData();
1481 /* run through the whole string, converting as we go */
1482 while ((ch
= *current
++) != '\0' && length
-- > 0) {
1483 if (ch
== base64_pad
) {
1484 if (*current
!= '=' && ((i
% 4) == 1 || (strict
&& length
> 0))) {
1486 while (isspace(*(++current
))) {
1489 if (*current
== '\0') {
1498 ch
= base64_reverse_table
[ch
];
1499 if ((!strict
&& ch
< 0) || ch
== -1) {
1500 /* a space or some other separator character, we simply skip over */
1502 } else if (ch
== -2) {
1508 result
[j
] = ch
<< 2;
1511 result
[j
++] |= ch
>> 4;
1512 result
[j
] = (ch
& 0x0f) << 4;
1515 result
[j
++] |= ch
>>2;
1516 result
[j
] = (ch
& 0x03) << 6;
1526 /* mop things up if we ended on a boundary */
1527 if (ch
== base64_pad
) {
1537 retString
.setSize(j
);
1541 String
string_base64_encode(const char *input
, int len
) {
1542 return php_base64_encode((unsigned char *)input
, len
);
1545 String
string_base64_decode(const char *input
, int len
, bool strict
) {
1546 return php_base64_decode(input
, len
, strict
);
1549 ///////////////////////////////////////////////////////////////////////////////
1551 String
string_escape_shell_arg(const char *str
) {
1558 String
ret(safe_address(l
, 4, 3), ReserveString
); /* worst case */
1559 cmd
= ret
.mutableData();
1563 for (x
= 0; x
< l
; x
++) {
1579 String
string_escape_shell_cmd(const char *str
) {
1580 register int x
, y
, l
;
1585 String
ret(safe_address(l
, 2, 1), ReserveString
);
1586 cmd
= ret
.mutableData();
1588 for (x
= 0, y
= 0; x
< l
; x
++) {
1592 if (!p
&& (p
= (char *)memchr(str
+ x
+ 1, str
[x
], l
- x
- 1))) {
1594 } else if (p
&& *p
== str
[x
]) {
1601 case '#': /* This is character-set independent */
1620 case '\x0A': /* excluding these two */
1632 ///////////////////////////////////////////////////////////////////////////////
1634 static void string_similar_str(const char *txt1
, int len1
,
1635 const char *txt2
, int len2
,
1636 int *pos1
, int *pos2
, int *max
) {
1638 const char *end1
= txt1
+ len1
;
1639 const char *end2
= txt2
+ len2
;
1643 for (p
= txt1
; p
< end1
; p
++) {
1644 for (q
= txt2
; q
< end2
; q
++) {
1645 for (l
= 0; (p
+ l
< end1
) && (q
+ l
< end2
) && (p
[l
] == q
[l
]); l
++);
1655 static int string_similar_char(const char *txt1
, int len1
,
1656 const char *txt2
, int len2
) {
1658 int pos1
= 0, pos2
= 0, max
;
1660 string_similar_str(txt1
, len1
, txt2
, len2
, &pos1
, &pos2
, &max
);
1663 sum
+= string_similar_char(txt1
, pos1
, txt2
, pos2
);
1665 if ((pos1
+ max
< len1
) && (pos2
+ max
< len2
)) {
1666 sum
+= string_similar_char(txt1
+ pos1
+ max
, len1
- pos1
- max
,
1667 txt2
+ pos2
+ max
, len2
- pos2
- max
);
1674 int string_similar_text(const char *t1
, int len1
,
1675 const char *t2
, int len2
, float *percent
) {
1676 if (len1
== 0 && len2
== 0) {
1677 if (percent
) *percent
= 0.0;
1681 int sim
= string_similar_char(t1
, len1
, t2
, len2
);
1682 if (percent
) *percent
= sim
* 200.0 / (len1
+ len2
);
1686 ///////////////////////////////////////////////////////////////////////////////
// Upper bound on either input length for string_levenshtein: when l1 or l2
// exceeds it, the function raises "levenshtein(): Argument string(s) too long".
// NOTE(review): "LENTH" is a misspelling of "LENGTH"; a rename would have to
// update every use of the macro at the same time.
1688 #define LEVENSHTEIN_MAX_LENTH 255
1690 // reference implementation, only optimized for memory usage, not speed
1691 int string_levenshtein(const char *s1
, int l1
, const char *s2
, int l2
,
1692 int cost_ins
, int cost_rep
, int cost_del
) {
1694 int i1
, i2
, c0
, c1
, c2
;
1696 if (l1
==0) return l2
*cost_ins
;
1697 if (l2
==0) return l1
*cost_del
;
1699 if ((l1
>LEVENSHTEIN_MAX_LENTH
)||(l2
>LEVENSHTEIN_MAX_LENTH
)) {
1700 raise_warning("levenshtein(): Argument string(s) too long");
1704 p1
= (int*)req::malloc((l2
+1) * sizeof(int));
1705 p2
= (int*)req::malloc((l2
+1) * sizeof(int));
1707 for(i2
=0;i2
<=l2
;i2
++) {
1708 p1
[i2
] = i2
*cost_ins
;
1711 for(i1
=0;i1
<l1
;i1
++) {
1712 p2
[0]=p1
[0]+cost_del
;
1713 for(i2
=0;i2
<l2
;i2
++) {
1714 c0
=p1
[i2
]+((s1
[i1
]==s2
[i2
])?0:cost_rep
);
1715 c1
=p1
[i2
+1]+cost_del
; if (c1
<c0
) c0
=c1
;
1716 c2
=p2
[i2
]+cost_ins
; if (c2
<c0
) c0
=c2
;
1719 tmp
=p1
; p1
=p2
; p2
=tmp
;
1728 ///////////////////////////////////////////////////////////////////////////////
1730 String
string_money_format(const char *format
, double value
) {
1732 const char *p
= format
;
1733 while ((p
= strchr(p
, '%'))) {
1734 if (*(p
+ 1) == '%') {
1736 } else if (!check
) {
1740 throw_invalid_argument
1741 ("format: Only a single %%i or %%n token can be used");
1746 int format_len
= strlen(format
);
1747 int str_len
= safe_address(format_len
, 1, 1024);
1748 String
ret(str_len
, ReserveString
);
1749 char *str
= ret
.mutableData();
1750 if ((str_len
= strfmon(str
, str_len
, format
, value
)) < 0) {
1753 ret
.setSize(str_len
);
1757 ///////////////////////////////////////////////////////////////////////////////
1759 String
string_number_format(double d
, int dec
,
1760 const String
& dec_point
,
1761 const String
& thousand_sep
) {
1762 char *tmpbuf
= nullptr, *resbuf
;
1763 char *s
, *t
; /* source, target */
1766 int tmplen
, reslen
=0;
1775 if (dec
< 0) dec
= 0;
1776 d
= php_math_round(d
, dec
);
1778 // departure from PHP: we got rid of dependencies on spprintf() here.
1779 String
tmpstr(63, ReserveString
);
1780 tmpbuf
= tmpstr
.mutableData();
1781 tmplen
= snprintf(tmpbuf
, 64, "%.*F", dec
, d
);
1782 if (tmpbuf
== nullptr || !isdigit((int)tmpbuf
[0])) {
1783 tmpstr
.setSize(tmplen
);
1787 // Uncommon, asked for more than 64 chars worth of precision
1788 tmpstr
= String(tmplen
, ReserveString
);
1789 tmpbuf
= tmpstr
.mutableData();
1790 tmplen
= snprintf(tmpbuf
, tmplen
+ 1, "%.*F", dec
, d
);
1791 if (tmpbuf
== nullptr || !isdigit((int)tmpbuf
[0])) {
1792 tmpstr
.setSize(tmplen
);
1797 /* find decimal point, if expected */
1799 dp
= strpbrk(tmpbuf
, ".,");
1804 /* calculate the length of the return buffer */
1806 integral
= dp
- tmpbuf
;
1808 /* no decimal point was found */
1812 /* allow for thousand separators */
1813 if (!thousand_sep
.empty()) {
1814 integral
+= ((integral
-1) / 3) * thousand_sep
.size();
1822 if (!dec_point
.empty()) {
1823 reslen
+= dec_point
.size();
1827 /* add a byte for minus sign */
1831 String
resstr(reslen
, ReserveString
);
1832 resbuf
= resstr
.mutableData();
1834 s
= tmpbuf
+tmplen
-1;
1835 t
= resbuf
+reslen
-1;
1837 /* copy the decimal places.
1838 * Take care, as the sprintf implementation may return less places than
1839 * we requested due to internal buffer limitations */
1841 int declen
= dp
? s
- dp
: 0;
1842 int topad
= dec
> declen
? dec
- declen
: 0;
1850 s
-= declen
+ 1; /* +1 to skip the point */
1853 /* now copy the chars after the point */
1854 memcpy(t
+ 1, dp
+ 1, declen
);
1857 /* add decimal point */
1858 if (!dec_point
.empty()) {
1859 memcpy(t
+ (1 - dec_point
.size()), dec_point
.data(), dec_point
.size());
1860 t
-= dec_point
.size();
1864 /* copy the numbers before the decimal point, adding thousand
1865 * separator every three digits */
1866 while(s
>= tmpbuf
) {
1868 if (thousand_sep
&& (++count
%3)==0 && s
>=tmpbuf
) {
1869 memcpy(t
+ (1 - thousand_sep
.size()),
1870 thousand_sep
.data(),
1871 thousand_sep
.size());
1872 t
-= thousand_sep
.size();
1876 /* and a minus sign, if needed */
1881 resstr
.setSize(reslen
);
1885 ///////////////////////////////////////////////////////////////////////////////
1888 /* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */
1889 String
string_soundex(const String
& str
) {
1890 assert(!str
.empty());
1891 int _small
, code
, last
;
1892 String
retString(4, ReserveString
);
1893 char* soundex
= retString
.mutableData();
1895 static char soundex_table
[26] = {
1924 /* build soundex string */
1926 const char *p
= str
.slice().ptr
;
1927 for (_small
= 0; *p
&& _small
< 4; p
++) {
1928 /* convert chars to upper case and strip non-letter chars */
1929 /* BUG: should also map here accented letters used in non */
1930 /* English words or names (also found in English text!): */
1931 /* eszett, thorn, n-tilde, c-cedilla, s-caron, ... */
1932 code
= toupper((int)(unsigned char)(*p
));
1933 if (code
>= 'A' && code
<= 'Z') {
1935 /* remember first valid char */
1936 soundex
[_small
++] = code
;
1937 last
= soundex_table
[code
- 'A'];
1939 /* ignore sequences of consonants with same soundex */
1940 /* code in trail, and vowels unless they separate */
1941 /* consonant letters */
1942 code
= soundex_table
[code
- 'A'];
1945 soundex
[_small
++] = code
;
1952 /* pad with '0' and terminate with 0 ;-) */
1953 while (_small
< 4) {
1954 soundex
[_small
++] = '0';
1956 retString
.setSize(4);
1960 ///////////////////////////////////////////////////////////////////////////////
1964 * this is now the original code by Michael G Schwern:
1965 * I've changed it just slightly (use emalloc,
1966 * get rid of includes etc)
1967 * - thies - 13.09.1999
1970 /*----------------------------- */
1971 /* this used to be "metaphone.h" */
1972 /*----------------------------- */
1974 /* Special encodings */
1978 /*----------------------------- */
1979 /* end of "metaphone.h" */
1980 /*----------------------------- */
1982 /*----------------------------- */
1983 /* this used to be "metachar.h" */
1984 /*----------------------------- */
1986 /* Metachar.h ... little bits about characters for metaphone */
1987 /*-- Character encoding array & accessing macros --*/
1988 /* Stolen directly out of the book... */
1989 char _codes
[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};
1991 #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
1993 #define isvowel(c) (ENCODE(c) & 1) /* AEIOU */
1995 /* These letters are passed through unchanged */
1996 #define NOCHANGE(c) (ENCODE(c) & 2) /* FJMNR */
1998 /* These form diphthongs when preceding H */
1999 #define AFFECTH(c) (ENCODE(c) & 4) /* CGPST */
2001 /* These make C and G soft */
2002 #define MAKESOFT(c) (ENCODE(c) & 8) /* EIY */
2004 /* These prevent GH from becoming F */
2005 #define NOGHTOF(c) (ENCODE(c) & 16) /* BDH */
2007 /*----------------------------- */
2008 /* end of "metachar.h" */
2009 /*----------------------------- */
2011 /* I suppose I could have been using a character pointer instead of
2012 * accessing the array directly... */
2014 /* Look at the next letter in the word */
2015 #define Next_Letter ((char)toupper(word[w_idx+1]))
2016 /* Look at the current letter in the word */
2017 #define Curr_Letter ((char)toupper(word[w_idx]))
2018 /* Go N letters back. */
2019 #define Look_Back_Letter(n) (w_idx >= n ? (char)toupper(word[w_idx-n]) : '\0')
2020 /* Previous letter. I dunno, should this return null on failure? */
2021 #define Prev_Letter (Look_Back_Letter(1))
2022 /* Look two letters down. It makes sure you don't walk off the string. */
2023 #define After_Next_Letter (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) \
2025 #define Look_Ahead_Letter(n) ((char)toupper(Lookahead(word+w_idx, n)))
2027 /* Allows us to safely look ahead an arbitrary # of letters */
2028 /* I probably could have just used strlen... */
2029 static char Lookahead(unsigned char *word
, int how_far
) {
2030 char letter_ahead
= '\0'; /* null by default */
2032 for (idx
= 0; word
[idx
] != '\0' && idx
< how_far
; idx
++);
2033 /* Edge forward in the string... */
2035 letter_ahead
= (char)word
[idx
]; /* idx will be either == to how_far or
2036 * at the end of the string
2038 return letter_ahead
;
2041 /* phonize one letter
2042 * We don't know the buffer's size in advance. One way to solve this is to just
2043 * re-allocate the buffer size. We're using an extra of 2 characters (this
2044 * could be one though; or more too). */
2045 #define Phonize(c) { buffer.append(c); }
2046 /* How long is the phoned word? */
2047 #define Phone_Len (buffer.size())
2049 /* Note if a letter is a 'break' in the word */
2050 #define Isbreak(c) (!isalpha(c))
2052 String
string_metaphone(const char *input
, int word_len
, long max_phonemes
,
2054 unsigned char *word
= (unsigned char *)input
;
2056 int w_idx
= 0; /* point in the phonization we're at. */
2057 int max_buffer_len
= 0; /* maximum length of the destination buffer */
2059 /*-- Parameter checks --*/
2060 /* Negative phoneme length is meaningless */
2062 if (max_phonemes
< 0)
2065 /* Empty/null string is meaningless */
2066 /* Overly paranoid */
2067 /* always_assert(word != NULL && word[0] != '\0'); */
2069 if (word
== nullptr)
2072 /*-- Allocate memory for our phoned_phrase --*/
2073 if (max_phonemes
== 0) { /* Assume largest possible */
2074 max_buffer_len
= word_len
;
2076 max_buffer_len
= max_phonemes
;
2078 StringBuffer
buffer(max_buffer_len
);
2080 /*-- The first phoneme has to be processed specially. --*/
2081 /* Find our first letter */
2082 for (; !isalpha(Curr_Letter
); w_idx
++) {
2083 /* On the off chance we were given nothing but crap... */
2084 if (Curr_Letter
== '\0') {
2085 return buffer
.detach(); /* For testing */
2089 switch (Curr_Letter
) {
2092 if (Next_Letter
== 'E') {
2096 /* Remember, preserve vowels at the beginning */
2102 /* [GKP]N becomes N */
2106 if (Next_Letter
== 'N') {
2113 W if followed by a vowel */
2115 if (Next_Letter
== 'H' ||
2116 Next_Letter
== 'R') {
2117 Phonize(Next_Letter
);
2119 } else if (isvowel(Next_Letter
)) {
2130 /* Vowels are kept */
2139 Phonize(Curr_Letter
);
2147 /* On to the metaphoning */
2148 for (; Curr_Letter
!= '\0' &&
2149 (max_phonemes
== 0 || Phone_Len
< max_phonemes
);
2151 /* How many letters to skip because an earlier encoding handled
2152 * multiple letters */
2153 unsigned short int skip_letter
= 0;
2156 /* THOUGHT: It would be nice if, rather than having things like...
2157 * well, SCI. For SCI you encode the S, then have to remember
2158 * to skip the C. So the phoneme SCI invades both S and C. It would
2159 * be better, IMHO, to skip the C from the S part of the encoding.
2160 * Hell, I'm trying it.
2163 /* Ignore non-alphas */
2164 if (!isalpha(Curr_Letter
))
2167 /* Drop duplicates, except CC */
2168 if (Curr_Letter
== Prev_Letter
&&
2172 switch (Curr_Letter
) {
2173 /* B -> B unless in MB */
2175 if (Prev_Letter
!= 'M')
2178 /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
2179 * (SCHW is handled in S)
2180 * S if -CI-, -CE- or -CY-
2181 * dropped if -SCI-, -SCE-, -SCY- (handled in S)
2185 if (MAKESOFT(Next_Letter
)) { /* C[IEY] */
2186 if (After_Next_Letter
== 'A' &&
2187 Next_Letter
== 'I') { /* CIA */
2191 else if (Prev_Letter
== 'S') {
2196 } else if (Next_Letter
== 'H') {
2197 if ((!traditional
) && (After_Next_Letter
== 'R' ||
2198 Prev_Letter
== 'S')) { /* Christ, School */
2208 /* J if in -DGE-, -DGI- or -DGY-
2212 if (Next_Letter
== 'G' && MAKESOFT(After_Next_Letter
)) {
2218 /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
2219 * else dropped if -GNED, -GN,
2220 * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
2221 * else J if in -GE-, -GI, -GY and not GG
2225 if (Next_Letter
== 'H') {
2226 if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) {
2232 } else if (Next_Letter
== 'N') {
2233 if (Isbreak(After_Next_Letter
) ||
2234 (After_Next_Letter
== 'E' && Look_Ahead_Letter(3) == 'D')) {
2238 } else if (MAKESOFT(Next_Letter
) && Prev_Letter
!= 'G') {
2244 /* H if before a vowel and not after C,G,P,S,T */
2246 if (isvowel(Next_Letter
) && !AFFECTH(Prev_Letter
))
2249 /* dropped if after C
2253 if (Prev_Letter
!= 'C')
2260 if (Next_Letter
== 'H') {
2271 /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
2275 if (Next_Letter
== 'I' &&
2276 (After_Next_Letter
== 'O' || After_Next_Letter
== 'A')) {
2278 } else if (Next_Letter
== 'H') {
2281 } else if ((!traditional
) &&
2282 (Next_Letter
== 'C' && Look_Ahead_Letter(2) == 'H' &&
2283 Look_Ahead_Letter(3) == 'W')) {
2290 /* 'sh' in -TIA- or -TIO-
2291 * else 'th' before H
2295 if (Next_Letter
== 'I' &&
2296 (After_Next_Letter
== 'O' || After_Next_Letter
== 'A')) {
2298 } else if (Next_Letter
== 'H') {
2309 /* W before a vowel, else dropped */
2311 if (isvowel(Next_Letter
))
2319 /* Y if followed by a vowel */
2321 if (isvowel(Next_Letter
))
2328 /* No transformation */
2335 Phonize(Curr_Letter
);
2342 w_idx
+= skip_letter
;
2345 return buffer
.detach();
2348 ///////////////////////////////////////////////////////////////////////////////
2352 * These are code tables for different Cyrillic charsets (relative to koi8-r).
2353 * Each table contains data for 128-255 symbols from ASCII table.
2354 * First 256 symbols are for conversion from koi8-r to corresponding charset,
2355 * second 256 symbols are for reverse conversion, from charset to koi8-r.
2357 * Here we have the following tables:
2358 * _cyr_win1251 - for windows-1251 charset
2359 * _cyr_iso88595 - for iso8859-5 charset
2360 * _cyr_cp866 - for x-cp866 charset
2361 * _cyr_mac - for x-mac-cyrillic charset
2363 typedef unsigned char _cyr_charset_table
[512];
2365 static const _cyr_charset_table _cyr_win1251
= {
2366 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2367 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2368 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2369 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2370 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2371 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2372 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2373 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2374 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2375 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2376 154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
2377 46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
2378 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2379 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2380 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2381 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2382 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2383 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2384 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2385 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2386 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2387 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2388 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2389 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2390 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2391 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2392 32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
2393 32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
2394 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2395 239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2396 222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
2397 207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
2400 static const _cyr_charset_table _cyr_cp866
= {
2401 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2402 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2403 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2404 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2405 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2406 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2407 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2408 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2409 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2410 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2411 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2412 35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
2413 43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
2414 45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
2415 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2416 179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
2417 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2418 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2419 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2420 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2421 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2422 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2423 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2424 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2425 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2426 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2427 205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
2428 199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
2429 238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
2430 175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
2431 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2432 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2435 static const _cyr_charset_table _cyr_iso88595
= {
2436 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2437 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2438 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2439 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2440 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2441 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2442 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2443 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2444 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2445 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2446 32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2447 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2448 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2449 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2450 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2451 32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2452 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2453 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2454 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2455 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2456 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2457 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2458 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2459 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2460 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2461 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2462 32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
2463 32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
2464 238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
2465 223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
2466 206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
2467 191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
2470 static const _cyr_charset_table _cyr_mac
= {
2471 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2472 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2473 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2474 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2475 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2476 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2477 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2478 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2479 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2480 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2481 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2482 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2483 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2484 144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
2485 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2486 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
2487 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2488 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2489 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2490 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2491 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2492 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2493 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2494 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2495 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2496 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2497 160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
2498 176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
2499 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2500 239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2501 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2502 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2506 * This is the function that performs real in-place conversion of the string
2509 * str - string to be converted
2510 * from,to - one-symbol label of source and destination charset
2511 * The following symbols are used as labels:
2517 * m - x-mac-cyrillic
2519 String
string_convert_cyrillic_string(const String
& input
, char from
, char to
) {
2520 const unsigned char *from_table
, *to_table
;
2522 const unsigned char *uinput
= (unsigned char *)input
.slice().ptr
;
2523 String
retString(input
.size(), ReserveString
);
2524 unsigned char *str
= (unsigned char *)retString
.mutableData();
2526 from_table
= nullptr;
2529 switch (toupper((int)(unsigned char)from
)) {
2530 case 'W': from_table
= _cyr_win1251
; break;
2532 case 'D': from_table
= _cyr_cp866
; break;
2533 case 'I': from_table
= _cyr_iso88595
; break;
2534 case 'M': from_table
= _cyr_mac
; break;
2538 throw_invalid_argument("Unknown source charset: %c", from
);
2542 switch (toupper((int)(unsigned char)to
)) {
2543 case 'W': to_table
= _cyr_win1251
; break;
2545 case 'D': to_table
= _cyr_cp866
; break;
2546 case 'I': to_table
= _cyr_iso88595
; break;
2547 case 'M': to_table
= _cyr_mac
; break;
2551 throw_invalid_argument("Unknown destination charset: %c", to
);
2555 for (int i
= 0; i
< input
.size(); i
++) {
2556 tmp
= from_table
== nullptr ? uinput
[i
] : from_table
[uinput
[i
]];
2557 str
[i
] = to_table
== nullptr ? tmp
: to_table
[tmp
+ 256];
2559 retString
.setSize(input
.size());
2563 ///////////////////////////////////////////////////////////////////////////////
2566 #define HEB_BLOCK_TYPE_ENG 1
2567 #define HEB_BLOCK_TYPE_HEB 2
2570 (((((unsigned char) c) >= 224) && (((unsigned char) c) <= 250)) ? 1 : 0)
2571 #define _isblank(c) \
2572 (((((unsigned char) c) == ' ' || ((unsigned char) c) == '\t')) ? 1 : 0)
2573 #define _isnewline(c) \
2574 (((((unsigned char) c) == '\n' || ((unsigned char) c) == '\r')) ? 1 : 0)
2577 * Converts Logical Hebrew text (Hebrew Windows style) to Visual text
2578 * Cheers/complaints/flames - Zeev Suraski <zeev@php.net>
2580 String
string_convert_hebrew_string(const String
& inStr
,
2581 int max_chars_per_line
,
2582 int convert_newlines
) {
2583 assert(!inStr
.empty());
2584 auto str
= inStr
.data();
2585 auto str_len
= inStr
.size();
2587 char *heb_str
, *broken_str
;
2589 int block_start
, block_end
, block_type
, block_length
, i
;
2591 int begin
, end
, char_count
, orig_begin
;
2594 block_start
=block_end
=0;
2596 heb_str
= (char *) req::malloc(str_len
+ 1);
2597 SCOPE_EXIT
{ req::free(heb_str
); };
2598 target
= heb_str
+str_len
;
2605 block_type
= HEB_BLOCK_TYPE_HEB
;
2607 block_type
= HEB_BLOCK_TYPE_ENG
;
2611 if (block_type
== HEB_BLOCK_TYPE_HEB
) {
2612 while ((isheb((int)*(tmp
+1)) ||
2613 _isblank((int)*(tmp
+1)) ||
2614 ispunct((int)*(tmp
+1)) ||
2615 (int)*(tmp
+1)=='\n' ) && block_end
<str_len
-1) {
2620 for (i
= block_start
; i
<= block_end
; i
++) {
2623 case '(': *target
= ')'; break;
2624 case ')': *target
= '('; break;
2625 case '[': *target
= ']'; break;
2626 case ']': *target
= '['; break;
2627 case '{': *target
= '}'; break;
2628 case '}': *target
= '{'; break;
2629 case '<': *target
= '>'; break;
2630 case '>': *target
= '<'; break;
2631 case '\\': *target
= '/'; break;
2632 case '/': *target
= '\\'; break;
2638 block_type
= HEB_BLOCK_TYPE_ENG
;
2640 while (!isheb(*(tmp
+1)) &&
2641 (int)*(tmp
+1)!='\n' && block_end
< str_len
-1) {
2646 while ((_isblank((int)*tmp
) ||
2647 ispunct((int)*tmp
)) && *tmp
!='/' &&
2648 *tmp
!='-' && block_end
> block_start
) {
2652 for (i
= block_end
; i
>= block_start
; i
--) {
2656 block_type
= HEB_BLOCK_TYPE_HEB
;
2658 block_start
=block_end
+1;
2659 } while (block_end
< str_len
-1);
2661 String
brokenStr(str_len
, ReserveString
);
2662 broken_str
= brokenStr
.mutableData();
2663 begin
=end
=str_len
-1;
2664 target
= broken_str
;
2668 while ((!max_chars
|| char_count
< max_chars
) && begin
> 0) {
2671 if (begin
<= 0 || _isnewline(heb_str
[begin
])) {
2672 while (begin
> 0 && _isnewline(heb_str
[begin
-1])) {
2679 if (char_count
== max_chars
) { /* try to avoid breaking words */
2680 int new_char_count
=char_count
, new_begin
=begin
;
2682 while (new_char_count
> 0) {
2683 if (_isblank(heb_str
[new_begin
]) || _isnewline(heb_str
[new_begin
])) {
2689 if (new_char_count
> 0) {
2690 char_count
=new_char_count
;
2696 if (_isblank(heb_str
[begin
])) {
2697 heb_str
[begin
]='\n';
2699 while (begin
<= end
&& _isnewline(heb_str
[begin
])) {
2700 /* skip leading newlines */
2703 for (i
= begin
; i
<= end
; i
++) { /* copy content */
2704 *target
= heb_str
[i
];
2707 for (i
= orig_begin
; i
<= end
&& _isnewline(heb_str
[i
]); i
++) {
2708 *target
= heb_str
[i
];
2721 if (convert_newlines
) {
2723 auto ret
= string_replace(broken_str
, str_len
, "\n", strlen("\n"),
2724 "<br />\n", strlen("<br />\n"), count
, true);
2725 if (!ret
.isNull()) {
2729 brokenStr
.setSize(str_len
);
2733 #if defined(__APPLE__)
2735 void *memrchr(const void *s
, int c
, size_t n
) {
2736 for (const char *p
= (const char *)s
+ n
- 1; p
>= s
; p
--) {
2737 if (*p
== c
) return (void *)p
;
2744 ///////////////////////////////////////////////////////////////////////////////