2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 2.00 of the Zend license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.zend.com/license/2_00.txt. |
12 | If you did not receive a copy of the Zend license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@zend.com so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #include "hphp/runtime/base/zend-string.h"
19 #include "hphp/runtime/base/zend-printf.h"
20 #include "hphp/runtime/base/zend-math.h"
22 #include "hphp/util/lock.h"
23 #include "hphp/util/overflow.h"
30 #include "hphp/util/bstring.h"
31 #include "hphp/runtime/base/exceptions.h"
32 #include "hphp/runtime/base/string-buffer.h"
33 #include "hphp/runtime/base/runtime-error.h"
34 #include "hphp/runtime/base/string-util.h"
35 #include "hphp/runtime/base/builtin-functions.h"
37 #include <folly/portability/String.h>
39 #define PHP_QPRINT_MAXL 75
42 ///////////////////////////////////////////////////////////////////////////////
45 void string_charmask(const char *sinput
, int len
, char *mask
) {
46 const unsigned char *input
= (unsigned char *)sinput
;
47 const unsigned char *end
;
51 for (end
= input
+len
; input
< end
; input
++) {
53 if ((input
+3 < end
) && input
[1] == '.' && input
[2] == '.'
55 memset(mask
+c
, 1, input
[3] - c
+ 1);
57 } else if ((input
+1 < end
) && input
[0] == '.' && input
[1] == '.') {
58 /* Error, try to be as helpful as possible:
59 (a range ending/starting with '.' won't be captured here) */
60 if (end
-len
>= input
) { /* there was no 'left' char */
61 throw_invalid_argument
62 ("charlist: Invalid '..'-range, missing left of '..'");
65 if (input
+2 >= end
) { /* there is no 'right' char */
66 throw_invalid_argument
67 ("charlist: Invalid '..'-range, missing right of '..'");
70 if (input
[-1] > input
[2]) { /* wrong order */
71 throw_invalid_argument
72 ("charlist: '..'-range needs to be incrementing");
75 /* FIXME: better error (a..b..c is the only left possibility?) */
76 throw_invalid_argument("charlist: Invalid '..'-range");
84 int string_copy(char *dst
, const char *src
, int siz
) {
85 register char *d
= dst
;
86 register const char *s
= src
;
87 register size_t n
= siz
;
89 /* Copy as many bytes as will fit */
90 if (n
!= 0 && --n
!= 0) {
92 if ((*d
++ = *s
++) == 0)
97 /* Not enough room in dst, add NUL and traverse rest of src */
100 *d
= '\0'; /* NUL-terminate dst */
105 return(s
- src
- 1); /* count does not include NUL */
108 ///////////////////////////////////////////////////////////////////////////////
111 int string_ncmp(const char *s1
, const char *s2
, int len
) {
112 for (int i
= 0; i
< len
; i
++) {
115 if (c1
> c2
) return 1;
116 if (c1
< c2
) return -1;
121 static int compare_right(char const **a
, char const *aend
,
122 char const **b
, char const *bend
) {
125 /* The longest run of digits wins. That aside, the greatest
126 value wins, but we can't know that it will until we've scanned
127 both numbers to know that they have the same magnitude, so we
128 remember it in BIAS. */
129 for(;; (*a
)++, (*b
)++) {
130 if ((*a
== aend
|| !isdigit((int)(unsigned char)**a
)) &&
131 (*b
== bend
|| !isdigit((int)(unsigned char)**b
)))
133 else if (*a
== aend
|| !isdigit((int)(unsigned char)**a
))
135 else if (*b
== bend
|| !isdigit((int)(unsigned char)**b
))
137 else if (**a
< **b
) {
140 } else if (**a
> **b
) {
149 static int compare_left(char const **a
, char const *aend
,
150 char const **b
, char const *bend
) {
151 /* Compare two left-aligned numbers: the first to have a
152 different value wins. */
153 for(;; (*a
)++, (*b
)++) {
154 if ((*a
== aend
|| !isdigit((int)(unsigned char)**a
)) &&
155 (*b
== bend
|| !isdigit((int)(unsigned char)**b
)))
157 else if (*a
== aend
|| !isdigit((int)(unsigned char)**a
))
159 else if (*b
== bend
|| !isdigit((int)(unsigned char)**b
))
170 int string_natural_cmp(char const *a
, size_t a_len
,
171 char const *b
, size_t b_len
, int fold_case
) {
174 char const *aend
= a
+ a_len
, *bend
= b
+ b_len
;
175 int fractional
, result
;
177 if (a_len
== 0 || b_len
== 0)
178 return a_len
- b_len
;
185 /* skip over leading spaces or zeros */
186 while (isspace((int)(unsigned char)ca
))
189 while (isspace((int)(unsigned char)cb
))
192 /* process run of digits */
193 if (isdigit((int)(unsigned char)ca
) && isdigit((int)(unsigned char)cb
)) {
194 fractional
= (ca
== '0' || cb
== '0');
197 result
= compare_left(&ap
, aend
, &bp
, bend
);
199 result
= compare_right(&ap
, aend
, &bp
, bend
);
203 else if (ap
== aend
&& bp
== bend
)
204 /* End of the strings. Let caller sort them out. */
207 /* Keep on comparing from the current point. */
213 ca
= toupper((int)(unsigned char)ca
);
214 cb
= toupper((int)(unsigned char)cb
);
223 if (ap
>= aend
&& bp
>= bend
)
224 /* The strings compare the same. Perhaps the caller
225 will want to call strcmp to break the tie. */
234 ///////////////////////////////////////////////////////////////////////////////
236 void string_to_case(String
& s
, int (*tocase
)(int)) {
237 assertx(!s
.isNull());
239 auto data
= s
.mutableData();
241 for (int i
= 0; i
< len
; i
++) {
242 data
[i
] = tocase(data
[i
]);
246 ///////////////////////////////////////////////////////////////////////////////
248 #define STR_PAD_LEFT 0
249 #define STR_PAD_RIGHT 1
250 #define STR_PAD_BOTH 2
252 String
string_pad(const char *input
, int len
, int pad_length
,
253 const char *pad_string
, int pad_str_len
,
256 int num_pad_chars
= pad_length
- len
;
258 /* If resulting string turns out to be shorter than input string,
259 we simply copy the input and return. */
260 if (pad_length
< 0 || num_pad_chars
< 0) {
261 return String(input
, len
, CopyString
);
264 /* Setup the padding string values if specified. */
265 if (pad_str_len
== 0) {
266 throw_invalid_argument("pad_string: (empty)");
270 String
ret(pad_length
, ReserveString
);
271 char *result
= ret
.mutableData();
273 /* We need to figure out the left/right padding lengths. */
274 int left_pad
, right_pad
;
278 right_pad
= num_pad_chars
;
281 left_pad
= num_pad_chars
;
285 left_pad
= num_pad_chars
/ 2;
286 right_pad
= num_pad_chars
- left_pad
;
289 throw_invalid_argument("pad_type: %d", pad_type
);
293 /* First we pad on the left. */
295 for (int i
= 0; i
< left_pad
; i
++) {
296 result
[result_len
++] = pad_string
[i
% pad_str_len
];
299 /* Then we copy the input string. */
300 memcpy(result
+ result_len
, input
, len
);
303 /* Finally, we pad on the right. */
304 for (int i
= 0; i
< right_pad
; i
++) {
305 result
[result_len
++] = pad_string
[i
% pad_str_len
];
307 ret
.setSize(result_len
);
311 ///////////////////////////////////////////////////////////////////////////////
313 int string_find(const char *input
, int len
, char ch
, int pos
,
314 bool case_sensitive
) {
316 if (pos
< 0 || pos
> len
) {
320 if (case_sensitive
) {
321 ptr
= memchr(input
+ pos
, ch
, len
- pos
);
323 ptr
= bstrcasechr(input
+ pos
, ch
, len
- pos
);
325 if (ptr
!= nullptr) {
326 return (int)((const char *)ptr
- input
);
331 int string_rfind(const char *input
, int len
, char ch
, int pos
,
332 bool case_sensitive
) {
334 if (pos
< -len
|| pos
> len
) {
338 if (case_sensitive
) {
340 ptr
= memrchr(input
+ pos
, ch
, len
- pos
);
342 ptr
= memrchr(input
, ch
, len
+ pos
+ 1);
346 ptr
= bstrrcasechr(input
+ pos
, ch
, len
- pos
);
348 ptr
= bstrrcasechr(input
, ch
, len
+ pos
+ 1);
351 if (ptr
!= nullptr) {
352 return (int)((const char *)ptr
- input
);
357 int string_find(const char *input
, int len
, const char *s
, int s_len
,
358 int pos
, bool case_sensitive
) {
361 if (!s_len
|| pos
< 0 || pos
> len
) {
365 if (case_sensitive
) {
366 ptr
= (void*)string_memnstr(input
+ pos
, s
, s_len
, input
+ len
);
368 ptr
= bstrcasestr(input
+ pos
, len
- pos
, s
, s_len
);
370 if (ptr
!= nullptr) {
371 return (int)((const char *)ptr
- input
);
376 int string_rfind(const char *input
, int len
, const char *s
, int s_len
,
377 int pos
, bool case_sensitive
) {
380 if (!s_len
|| pos
< -len
|| pos
> len
) {
384 if (case_sensitive
) {
386 ptr
= bstrrstr(input
+ pos
, len
- pos
, s
, s_len
);
388 ptr
= bstrrstr(input
, len
+ pos
+ s_len
, s
, s_len
);
392 ptr
= bstrrcasestr(input
+ pos
, len
- pos
, s
, s_len
);
394 ptr
= bstrrcasestr(input
, len
+ pos
+ s_len
, s
, s_len
);
397 if (ptr
!= nullptr) {
398 return (int)((const char *)ptr
- input
);
403 const char *string_memnstr(const char *haystack
, const char *needle
,
404 int needle_len
, const char *end
) {
405 const char *p
= haystack
;
406 char ne
= needle
[needle_len
-1];
410 if ((p
= (char *)memchr(p
, *needle
, (end
-p
+1))) && ne
== p
[needle_len
-1]) {
411 if (!memcmp(needle
, p
, needle_len
-1)) {
423 String
string_replace(const char *s
, int len
, int start
, int length
,
424 const char *replacement
, int len_repl
) {
426 assertx(replacement
);
429 // if "start" position is negative, count start position from the end
440 // if "length" position is negative, set it to the length
441 // needed to stop that many chars from the end of the string
443 length
= (len
- start
) + length
;
448 // check if length is too large
452 // check if the length is too large adjusting for non-zero start
453 // Write this way instead of start + length > len to avoid overflow
454 if (length
> len
- start
) {
455 length
= len
- start
;
458 String
retString(len
+ len_repl
- length
, ReserveString
);
459 char *ret
= retString
.mutableData();
463 memcpy(ret
, s
, start
);
467 memcpy(ret
+ ret_len
, replacement
, len_repl
);
470 len
-= (start
+ length
);
472 memcpy(ret
+ ret_len
, s
+ start
+ length
, len
);
475 retString
.setSize(ret_len
);
479 String
string_replace(const char *input
, int len
,
480 const char *search
, int len_search
,
481 const char *replacement
, int len_replace
,
482 int &count
, bool case_sensitive
) {
484 assertx(search
&& len_search
);
486 assertx(len_search
>= 0);
487 assertx(len_replace
>= 0);
493 req::vector
<int> founds
;
495 if (len_search
== 1) {
496 for (int pos
= string_find(input
, len
, *search
, 0, case_sensitive
);
498 pos
= string_find(input
, len
, *search
, pos
+ len_search
,
500 founds
.push_back(pos
);
503 for (int pos
= string_find(input
, len
, search
, len_search
, 0,
506 pos
= string_find(input
, len
, search
, len_search
,
507 pos
+ len_search
, case_sensitive
)) {
508 founds
.push_back(pos
);
512 count
= founds
.size();
514 return String(); // not found
519 // Make sure the new size of the string wouldn't overflow int32_t. Don't
520 // bother if the replacement wouldn't make the string longer.
521 if (len_replace
> len_search
) {
522 auto raise
= [&] { raise_error("String too large"); };
523 if (mul_overflow(len_replace
- len_search
, count
)) {
526 int diff
= (len_replace
- len_search
) * count
;
527 if (add_overflow(len
, diff
)) {
530 reserve
= len
+ diff
;
532 reserve
= len
+ (len_replace
- len_search
) * count
;
535 String
retString(reserve
, ReserveString
);
536 char *ret
= retString
.mutableData();
538 int pos
= 0; // last position in input that hasn't been copied over yet
540 for (unsigned int i
= 0; i
< founds
.size(); i
++) {
550 memcpy(p
, replacement
, len_replace
);
562 retString
.setSize(p
- ret
);
566 ///////////////////////////////////////////////////////////////////////////////
568 String
string_chunk_split(const char *src
, int srclen
, const char *end
,
569 int endlen
, int chunklen
) {
570 int chunks
= srclen
/ chunklen
; // complete chunks!
571 int restlen
= srclen
- chunks
* chunklen
; /* srclen % chunklen */
581 char *dest
= ret
.mutableData();
583 const char *p
; char *q
;
584 const char *pMax
= src
+ srclen
- chunklen
+ 1;
585 for (p
= src
, q
= dest
; p
< pMax
; ) {
586 memcpy(q
, p
, chunklen
);
588 memcpy(q
, end
, endlen
);
594 memcpy(q
, p
, restlen
);
596 memcpy(q
, end
, endlen
);
600 ret
.setSize(q
- dest
);
604 ///////////////////////////////////////////////////////////////////////////////
606 #define PHP_TAG_BUF_SIZE 1023
609 * Check if tag is in a set of tags
614 * 1 first non-whitespace char seen
616 static int string_tag_find(const char *tag
, int len
, const char *set
) {
626 norm
= (char *)req::malloc_noptrs(len
+1);
627 SCOPE_EXIT
{ req::free(norm
); };
633 normalize the tag removing leading and trailing whitespace
634 and turn any <a whatever...> into just <a> and any </tag>
646 if (!isspace((int)c
)) {
663 if (strstr(set
, norm
)) {
672 * A simple little state-machine to strip out html and php tags
674 * State 0 is the output state, State 1 means we are inside a
675 * normal html tag and state 2 means we are inside a php tag.
677 * The state variable is passed in to allow a function like fgetss
678 * to maintain state across calls to the function.
680 * lc holds the last significant character read and br is a bracket
683 * When an allow string is passed in we keep track of the string
684 * in state 1 and when the tag is closed check it against the
685 * allow string to see if we should allow it.
687 * swm: Added ability to strip <?xml tags without assuming it PHP
690 String
string_strip_tags(const char *s
, const int len
,
691 const char *allow
, const int allow_len
,
692 bool allow_tag_spaces
) {
693 const char *abuf
, *p
;
694 char *rbuf
, *tbuf
, *tp
, *rp
, c
, lc
;
696 int br
, i
=0, depth
=0, in_q
= 0;
702 String
retString(s
, len
, CopyString
);
703 rbuf
= retString
.mutableData();
714 allowString
= String(allow_len
, ReserveString
);
715 char *atmp
= allowString
.mutableData();
716 for (const char *tmp
= allow
; *tmp
; tmp
++, atmp
++) {
717 *atmp
= tolower((int)*(const unsigned char *)tmp
);
719 allowString
.setSize(allow_len
);
720 abuf
= allowString
.data();
722 tbuf
= (char *)req::malloc_noptrs(PHP_TAG_BUF_SIZE
+1);
729 auto move
= [&pos
, &tbuf
, &tp
]() {
730 if (tp
- tbuf
>= PHP_TAG_BUF_SIZE
) {
732 tbuf
= (char*)req::realloc_noptrs(tbuf
,
733 (tp
- tbuf
) + PHP_TAG_BUF_SIZE
+ 1);
743 if (isspace(*(p
+ 1)) && !allow_tag_spaces
) {
753 } else if (state
== 1) {
760 if (lc
!= '"' && lc
!= '\'') {
764 } else if (allow_len
&& state
== 1) {
767 } else if (state
== 0) {
774 if (lc
!= '"' && lc
!= '\'') {
778 } else if (allow_len
&& state
== 1) {
781 } else if (state
== 0) {
797 case 1: /* HTML/XML */
804 if (string_tag_find(tbuf
, tp
-tbuf
, abuf
)) {
805 memcpy(rp
, tbuf
, tp
-tbuf
);
813 if (!br
&& lc
!= '\"' && *(p
-1) == '?') {
824 case 4: /* JavaScript/CSS/etc... */
825 if (p
>= s
+ 2 && *(p
-1) == '-' && *(p
-2) == '-') {
840 /* Inside <!-- comment --> */
842 } else if (state
== 2 && *(p
-1) != '\\') {
845 } else if (lc
!= '\\') {
848 } else if (state
== 0) {
850 } else if (allow_len
&& state
== 1) {
854 if (state
&& p
!= s
&& *(p
-1) != '\\' && (!in_q
|| *p
== in_q
)) {
864 /* JavaScript & Other HTML scripting languages */
865 if (state
== 1 && *(p
-1) == '<') {
871 } else if (allow_len
&& state
== 1) {
879 if (state
== 3 && p
>= s
+ 2 && *(p
-1) == '-' && *(p
-2) == '!') {
888 if (state
== 1 && *(p
-1) == '<') {
896 /* !DOCTYPE exception */
897 if (state
==3 && p
> s
+6
898 && tolower(*(p
-1)) == 'p'
899 && tolower(*(p
-2)) == 'y'
900 && tolower(*(p
-3)) == 't'
901 && tolower(*(p
-4)) == 'c'
902 && tolower(*(p
-5)) == 'o'
903 && tolower(*(p
-6)) == 'd') {
911 /* swm: If we encounter '<?xml' then we shouldn't be in
912 * state == 2 (PHP). Switch back to HTML.
915 if (state
== 2 && p
> s
+2 && *(p
-1) == 'm' && *(p
-2) == 'x') {
925 } else if (allow_len
&& state
== 1) {
934 if (rp
< rbuf
+ len
) {
941 retString
.setSize(rp
- rbuf
);
945 ///////////////////////////////////////////////////////////////////////////////
947 static char string_hex2int(int c
) {
951 if (c
>= 'A' && c
<= 'F') {
954 if (c
>= 'a' && c
<= 'f') {
960 String
string_quoted_printable_encode(const char *input
, int len
) {
962 const unsigned char *str
= (unsigned char*)input
;
964 unsigned long lp
= 0;
967 char *hex
= "0123456789ABCDEF";
972 length
+ ((safe_address(3, length
, 0)/(PHP_QPRINT_MAXL
-9)) + 1),
976 d
= buffer
= ret
.mutableData();
979 if (((c
= *str
++) == '\015') && (*str
== '\012') && length
> 0) {
985 if (iscntrl (c
) || (c
== 0x7f) || (c
& 0x80) ||
986 (c
== '=') || ((c
== ' ') && (*str
== '\015'))) {
987 if ((((lp
+= 3) > PHP_QPRINT_MAXL
) && (c
<= 0x7f))
988 || ((c
> 0x7f) && (c
<= 0xdf) && ((lp
+ 3) > PHP_QPRINT_MAXL
))
989 || ((c
> 0xdf) && (c
<= 0xef) && ((lp
+ 6) > PHP_QPRINT_MAXL
))
990 || ((c
> 0xef) && (c
<= 0xf4) && ((lp
+ 9) > PHP_QPRINT_MAXL
))) {
1000 if ((++lp
) > PHP_QPRINT_MAXL
) {
1016 String
string_quoted_printable_decode(const char *input
, int len
, bool is_q
) {
1022 int i
= 0, j
= 0, k
;
1023 const char *str_in
= input
;
1024 String
ret(len
, ReserveString
);
1025 char *str_out
= ret
.mutableData();
1026 while (i
< len
&& str_in
[i
]) {
1027 switch (str_in
[i
]) {
1029 if (i
+ 2 < len
&& str_in
[i
+ 1] && str_in
[i
+ 2] &&
1030 isxdigit((int) str_in
[i
+ 1]) && isxdigit((int) str_in
[i
+ 2]))
1032 str_out
[j
++] = (string_hex2int((int) str_in
[i
+ 1]) << 4)
1033 + string_hex2int((int) str_in
[i
+ 2]);
1035 } else /* check for soft line break according to RFC 2045*/ {
1037 while (str_in
[i
+ k
] &&
1038 ((str_in
[i
+ k
] == 32) || (str_in
[i
+ k
] == 9))) {
1039 /* Possibly, skip spaces/tabs at the end of line */
1042 if (!str_in
[i
+ k
]) {
1043 /* End of line reached */
1046 else if ((str_in
[i
+ k
] == 13) && (str_in
[i
+ k
+ 1] == 10)) {
1050 else if ((str_in
[i
+ k
] == 13) || (str_in
[i
+ k
] == 10)) {
1055 str_out
[j
++] = str_in
[i
++];
1064 str_out
[j
++] = str_in
[i
++];
1068 str_out
[j
++] = str_in
[i
++];
1075 Variant
string_base_to_numeric(const char *s
, int len
, int base
) {
1082 assertx(string_validate_base(base
));
1084 cutoff
= LONG_MAX
/ base
;
1085 cutlim
= LONG_MAX
% base
;
1087 for (int i
= len
; i
> 0; i
--) {
1090 /* might not work for EBCDIC */
1091 if (c
>= '0' && c
<= '9')
1093 else if (c
>= 'A' && c
<= 'Z')
1095 else if (c
>= 'a' && c
<= 'z')
1104 case 0: /* Integer */
1105 if (num
< cutoff
|| (num
== cutoff
&& c
<= cutlim
)) {
1106 num
= num
* base
+ c
;
1114 fnum
= fnum
* base
+ c
;
1124 String
string_long_to_base(unsigned long value
, int base
) {
1125 static char digits
[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1126 char buf
[(sizeof(unsigned long) << 3) + 1];
1129 assertx(string_validate_base(base
));
1131 end
= ptr
= buf
+ sizeof(buf
) - 1;
1134 *--ptr
= digits
[value
% base
];
1136 } while (ptr
> buf
&& value
);
1138 return String(ptr
, end
- ptr
, CopyString
);
1141 String
string_numeric_to_base(const Variant
& value
, int base
) {
1142 static char digits
[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1144 assertx(string_validate_base(base
));
1145 if ((!value
.isInteger() && !value
.isDouble())) {
1146 return empty_string();
1149 if (value
.isDouble()) {
1150 double fvalue
= floor(value
.toDouble()); /* floor it just in case */
1152 char buf
[(sizeof(double) << 3) + 1];
1154 /* Don't try to convert +/- infinity */
1155 if (fvalue
== HUGE_VAL
|| fvalue
== -HUGE_VAL
) {
1156 raise_warning("Number too large");
1157 return empty_string();
1160 end
= ptr
= buf
+ sizeof(buf
) - 1;
1163 *--ptr
= digits
[(int) fmod(fvalue
, base
)];
1165 } while (ptr
> buf
&& fabs(fvalue
) >= 1);
1167 return String(ptr
, end
- ptr
, CopyString
);
1170 return string_long_to_base(value
.toInt64(), base
);
1173 ///////////////////////////////////////////////////////////////////////////////
1176 #define PHP_UU_ENC(c) \
1177 ((c) ? ((c) & 077) + ' ' : '`')
1178 #define PHP_UU_ENC_C2(c) \
1179 PHP_UU_ENC(((*(c) << 4) & 060) | ((*((c) + 1) >> 4) & 017))
1180 #define PHP_UU_ENC_C3(c) \
1181 PHP_UU_ENC(((*(c + 1) << 2) & 074) | ((*((c) + 2) >> 6) & 03))
1182 #define PHP_UU_DEC(c) \
1185 String
string_uuencode(const char *src
, int src_len
) {
1191 const char *s
, *e
, *ee
;
1194 /* encoded length is ~ 38% greater than the original */
1195 String
ret((int)ceil(src_len
* 1.38) + 45, ReserveString
);
1196 p
= dest
= ret
.mutableData();
1200 while ((s
+ 3) < e
) {
1206 ee
= s
+ (int) (floor(len
/ 3) * 3);
1209 *p
++ = PHP_UU_ENC(len
);
1212 *p
++ = PHP_UU_ENC(*s
>> 2);
1213 *p
++ = PHP_UU_ENC_C2(s
);
1214 *p
++ = PHP_UU_ENC_C3(s
);
1215 *p
++ = PHP_UU_ENC(*(s
+ 2) & 077);
1227 *p
++ = PHP_UU_ENC(e
- s
);
1231 *p
++ = PHP_UU_ENC(*s
>> 2);
1232 *p
++ = PHP_UU_ENC_C2(s
);
1233 *p
++ = ((e
- s
) > 1) ? PHP_UU_ENC_C3(s
) : PHP_UU_ENC('\0');
1234 *p
++ = ((e
- s
) > 2) ? PHP_UU_ENC(*(s
+ 2) & 077) : PHP_UU_ENC('\0');
1241 *p
++ = PHP_UU_ENC('\0');
1245 ret
.setSize(p
- dest
);
1249 String
string_uudecode(const char *src
, int src_len
) {
1252 const char *s
, *e
, *ee
;
1255 String
ret(ceil(src_len
* 0.75), ReserveString
);
1256 p
= dest
= ret
.mutableData();
1261 if ((len
= PHP_UU_DEC(*s
++)) <= 0) {
1265 if (len
> src_len
) {
1271 ee
= s
+ (len
== 45 ? 60 : (int) floor(len
* 1.33));
1278 if (s
+ 4 > e
) goto err
;
1280 *p
++ = PHP_UU_DEC(*s
) << 2 | PHP_UU_DEC(*(s
+ 1)) >> 4;
1281 *p
++ = PHP_UU_DEC(*(s
+ 1)) << 4 | PHP_UU_DEC(*(s
+ 2)) >> 2;
1282 *p
++ = PHP_UU_DEC(*(s
+ 2)) << 6 | PHP_UU_DEC(*(s
+ 3));
1294 if ((len
= total_len
> (p
- dest
))) {
1295 *p
++ = PHP_UU_DEC(*s
) << 2 | PHP_UU_DEC(*(s
+ 1)) >> 4;
1297 *p
++ = PHP_UU_DEC(*(s
+ 1)) << 4 | PHP_UU_DEC(*(s
+ 2)) >> 2;
1299 *p
++ = PHP_UU_DEC(*(s
+ 2)) << 6 | PHP_UU_DEC(*(s
+ 3));
1304 ret
.setSize(total_len
);
1311 ///////////////////////////////////////////////////////////////////////////////
1316 const char base64_table
[] = {
1317 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
1318 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
1319 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
1320 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
1321 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', '\0'
1324 const char base64_pad
= '=';
1326 const short base64_reverse_table
[256] = {
1327 -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
1328 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1329 -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
1330 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
1331 -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1332 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
1333 -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
1334 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
1335 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1336 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1337 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1338 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1339 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1340 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1341 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
1342 -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
1345 folly::Optional
<int> maxEncodedSize(int length
) {
1346 if ((length
+ 2) < 0 || ((length
+ 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) {
1349 return ((length
+ 2) / 3) * 4;
1352 // outstr must be at least maxEncodedSize(length) bytes
1353 size_t php_base64_encode(const unsigned char *str
, int length
,
1354 unsigned char* outstr
) {
1355 const unsigned char *current
= str
;
1356 unsigned char *p
= outstr
;
1358 while (length
> 2) { /* keep going until we have less than 24 bits */
1359 *p
++ = base64_table
[current
[0] >> 2];
1360 *p
++ = base64_table
[((current
[0] & 0x03) << 4) + (current
[1] >> 4)];
1361 *p
++ = base64_table
[((current
[1] & 0x0f) << 2) + (current
[2] >> 6)];
1362 *p
++ = base64_table
[current
[2] & 0x3f];
1365 length
-= 3; /* we just handle 3 octets of data */
1368 /* now deal with the tail end of things */
1370 *p
++ = base64_table
[current
[0] >> 2];
1372 *p
++ = base64_table
[((current
[0] & 0x03) << 4) + (current
[1] >> 4)];
1373 *p
++ = base64_table
[(current
[1] & 0x0f) << 2];
1376 *p
++ = base64_table
[(current
[0] & 0x03) << 4];
1384 // outstr must be at least length bytes
1385 ssize_t
php_base64_decode(const char *str
, int length
, bool strict
,
1386 unsigned char* outstr
) {
1387 const unsigned char *current
= (unsigned char*)str
;
1388 int ch
, i
= 0, j
= 0, k
;
1389 /* this sucks for threaded environments */
1391 unsigned char* result
= outstr
;
1393 /* run through the whole string, converting as we go */
1394 while ((ch
= *current
++) != '\0' && length
-- > 0) {
1395 if (ch
== base64_pad
) {
1396 if (*current
!= '=' && ((i
% 4) == 1 || (strict
&& length
> 0))) {
1398 while (isspace(*(++current
))) {
1401 if (*current
== '\0') {
1410 ch
= base64_reverse_table
[ch
];
1411 if ((!strict
&& ch
< 0) || ch
== -1) {
1412 /* a space or some other separator character, we simply skip over */
1414 } else if (ch
== -2) {
1420 result
[j
] = ch
<< 2;
1423 result
[j
++] |= ch
>> 4;
1424 result
[j
] = (ch
& 0x0f) << 4;
1427 result
[j
++] |= ch
>>2;
1428 result
[j
] = (ch
& 0x03) << 6;
1438 /* mop things up if we ended on a boundary */
1439 if (ch
== base64_pad
) {
1454 String
string_base64_encode(const char* input
, int len
) {
1455 if (auto const wantedSize
= maxEncodedSize(len
)) {
1456 String
ret(*wantedSize
, ReserveString
);
1457 auto actualSize
= php_base64_encode((unsigned char*)input
, len
,
1458 (unsigned char*)ret
.mutableData());
1459 ret
.setSize(actualSize
);
1465 String
string_base64_decode(const char* input
, int len
, bool strict
) {
1466 String
ret(len
, ReserveString
);
1467 auto actualSize
= php_base64_decode(input
, len
, strict
,
1468 (unsigned char*)ret
.mutableData());
1469 if (actualSize
< 0) return String();
1471 ret
.setSize(actualSize
);
1475 std::string
base64_encode(const char* input
, int len
) {
1476 if (auto const wantedSize
= maxEncodedSize(len
)) {
1478 ret
.resize(*wantedSize
);
1479 auto actualSize
= php_base64_encode((unsigned char*)input
, len
,
1480 (unsigned char*)ret
.data());
1481 ret
.resize(actualSize
);
1484 return std::string();
1487 std::string
base64_decode(const char* input
, int len
, bool strict
) {
1488 if (!len
) return std::string();
1491 auto actualSize
= php_base64_decode(input
, len
, strict
,
1492 (unsigned char*)ret
.data());
1493 if (!actualSize
) return std::string();
1495 ret
.resize(actualSize
);
1499 ///////////////////////////////////////////////////////////////////////////////
1501 String
string_escape_shell_arg(const char *str
) {
1508 String
ret(safe_address(l
, 4, 3), ReserveString
); /* worst case */
1509 cmd
= ret
.mutableData();
1517 for (x
= 0; x
< l
; x
++) {
1537 if (y
> 0 && '\\' == cmd
[y
- 1]) {
1538 int k
= 0, n
= y
- 1;
1539 for (; n
>= 0 && '\\' == cmd
[n
]; n
--, k
++);
1553 String
string_escape_shell_cmd(const char *str
) {
1554 register int x
, y
, l
;
1559 String
ret(safe_address(l
, 2, 1), ReserveString
);
1560 cmd
= ret
.mutableData();
1562 for (x
= 0, y
= 0; x
< l
; x
++) {
1567 if (!p
&& (p
= (char *)memchr(str
+ x
+ 1, str
[x
], l
- x
- 1))) {
1569 } else if (p
&& *p
== str
[x
]) {
1577 /* % is Windows specific for environmental variables, ^%PATH% will
1578 output PATH while ^%PATH^% will not. escapeshellcmd->val will
1586 case '#': /* This is character-set independent */
1605 case '\x0A': /* excluding these two */
1621 ///////////////////////////////////////////////////////////////////////////////
1623 static void string_similar_str(const char *txt1
, int len1
,
1624 const char *txt2
, int len2
,
1625 int *pos1
, int *pos2
, int *max
) {
1627 const char *end1
= txt1
+ len1
;
1628 const char *end2
= txt2
+ len2
;
1632 for (p
= txt1
; p
< end1
; p
++) {
1633 for (q
= txt2
; q
< end2
; q
++) {
1634 for (l
= 0; (p
+ l
< end1
) && (q
+ l
< end2
) && (p
[l
] == q
[l
]); l
++);
1644 static int string_similar_char(const char *txt1
, int len1
,
1645 const char *txt2
, int len2
) {
1647 int pos1
= 0, pos2
= 0, max
;
1649 string_similar_str(txt1
, len1
, txt2
, len2
, &pos1
, &pos2
, &max
);
1652 sum
+= string_similar_char(txt1
, pos1
, txt2
, pos2
);
1654 if ((pos1
+ max
< len1
) && (pos2
+ max
< len2
)) {
1655 sum
+= string_similar_char(txt1
+ pos1
+ max
, len1
- pos1
- max
,
1656 txt2
+ pos2
+ max
, len2
- pos2
- max
);
1663 int string_similar_text(const char *t1
, int len1
,
1664 const char *t2
, int len2
, float *percent
) {
1665 if (len1
== 0 && len2
== 0) {
1666 if (percent
) *percent
= 0.0;
1670 int sim
= string_similar_char(t1
, len1
, t2
, len2
);
1671 if (percent
) *percent
= sim
* 200.0 / (len1
+ len2
);
1675 ///////////////////////////////////////////////////////////////////////////////
1677 #define LEVENSHTEIN_MAX_LENTH 255
1679 // reference implementation, only optimized for memory usage, not speed
1680 int string_levenshtein(const char *s1
, int l1
, const char *s2
, int l2
,
1681 int cost_ins
, int cost_rep
, int cost_del
) {
1683 int i1
, i2
, c0
, c1
, c2
;
1685 if (l1
==0) return l2
*cost_ins
;
1686 if (l2
==0) return l1
*cost_del
;
1688 if ((l1
>LEVENSHTEIN_MAX_LENTH
)||(l2
>LEVENSHTEIN_MAX_LENTH
)) {
1689 raise_warning("levenshtein(): Argument string(s) too long");
1693 p1
= (int*)req::malloc_noptrs((l2
+1) * sizeof(int));
1694 SCOPE_EXIT
{ req::free(p1
); };
1695 p2
= (int*)req::malloc_noptrs((l2
+1) * sizeof(int));
1696 SCOPE_EXIT
{ req::free(p2
); };
1698 for(i2
=0;i2
<=l2
;i2
++) {
1699 p1
[i2
] = i2
*cost_ins
;
1702 for(i1
=0;i1
<l1
;i1
++) {
1703 p2
[0]=p1
[0]+cost_del
;
1704 for(i2
=0;i2
<l2
;i2
++) {
1705 c0
=p1
[i2
]+((s1
[i1
]==s2
[i2
])?0:cost_rep
);
1706 c1
=p1
[i2
+1]+cost_del
; if (c1
<c0
) c0
=c1
;
1707 c2
=p2
[i2
]+cost_ins
; if (c2
<c0
) c0
=c2
;
1710 tmp
=p1
; p1
=p2
; p2
=tmp
;
1717 ///////////////////////////////////////////////////////////////////////////////
1719 String
string_money_format(const char *format
, double value
) {
1721 const char *p
= format
;
1722 while ((p
= strchr(p
, '%'))) {
1723 if (*(p
+ 1) == '%') {
1725 } else if (!check
) {
1729 throw_invalid_argument
1730 ("format: Only a single %%i or %%n token can be used");
1735 int format_len
= strlen(format
);
1736 int str_len
= safe_address(format_len
, 1, 1024);
1737 String
ret(str_len
, ReserveString
);
1738 char *str
= ret
.mutableData();
1739 if ((str_len
= strfmon(str
, str_len
, format
, value
)) < 0) {
1742 ret
.setSize(str_len
);
1746 ///////////////////////////////////////////////////////////////////////////////
// Formats `d` with `dec` decimals, using `dec_point` as the decimal separator
// and `thousand_sep` between groups of three integer digits.
1748 String
string_number_format(double d
, int dec
,
1749 const String
& dec_point
,
1750 const String
& thousand_sep
) {
1751 char *tmpbuf
= nullptr, *resbuf
;
1752 char *s
, *t
; /* source, target */
1755 int tmplen
, reslen
=0;
// A negative precision is treated as zero decimals.
1764 if (dec
< 0) dec
= 0;
// Round to the requested precision before formatting.
1765 d
= php_math_round(d
, dec
);
1767 // departure from PHP: we got rid of dependencies on spprintf() here.
// First attempt: format into a 63-byte reservation while telling snprintf the
// buffer holds 64 bytes.
// NOTE(review): presumably ReserveString leaves room for the terminator - confirm.
1768 String
tmpstr(63, ReserveString
);
1769 tmpbuf
= tmpstr
.mutableData();
1770 tmplen
= snprintf(tmpbuf
, 64, "%.*F", dec
, d
);
// Output that does not begin with a digit is not a plain number; its size is
// set to whatever snprintf reported.
1771 if (tmpbuf
== nullptr || !isdigit((int)tmpbuf
[0])) {
1772 tmpstr
.setSize(tmplen
);
1776 // Uncommon, asked for more than 64 chars worth of precision
// Retry with a reservation of exactly the length snprintf asked for.
1777 tmpstr
= String(tmplen
, ReserveString
);
1778 tmpbuf
= tmpstr
.mutableData();
1779 tmplen
= snprintf(tmpbuf
, tmplen
+ 1, "%.*F", dec
, d
);
1780 if (tmpbuf
== nullptr || !isdigit((int)tmpbuf
[0])) {
1781 tmpstr
.setSize(tmplen
);
1786 /* find decimal point, if expected */
// Either a dot or a comma is accepted as the decimal mark in the formatted text.
1788 dp
= strpbrk(tmpbuf
, ".,");
1793 /* calculate the length of the return buffer */
// The integer part length is the offset of the decimal mark.
1795 integral
= dp
- tmpbuf
;
1797 /* no decimal point was found */
1801 /* allow for thousand separators */
// One separator is added per complete group of three digits; the guard
// detects wrap-around of that addition.
1802 if (!thousand_sep
.empty()) {
1803 if (integral
+ thousand_sep
.size() * ((integral
-1) / 3) < integral
) {
1805 raise_error("String overflow");
1808 integral
+= ((integral
-1) / 3) * thousand_sep
.size();
// Reserve room for the decimal separator, again guarding against wrap-around.
1816 if (!dec_point
.empty()) {
1817 if (reslen
+ dec_point
.size() < dec_point
.size()) {
1819 raise_error("String overflow");
1821 reslen
+= dec_point
.size();
1825 /* add a byte for minus sign */
// Result buffer of the computed length.
1829 String
resstr(reslen
, ReserveString
);
1830 resbuf
= resstr
.mutableData();
// s and t start on the final byte of the source and result buffers.
1832 s
= tmpbuf
+tmplen
-1;
1833 t
= resbuf
+reslen
-1;
1835 /* copy the decimal places.
1836 * Take care, as the sprintf implementation may return less places than
1837 * we requested due to internal buffer limitations */
// declen: digits actually present after the mark (0 when no mark was found);
// topad: how many more digits were requested than are present.
1839 int declen
= dp
? s
- dp
: 0;
1840 int topad
= dec
> declen
? dec
- declen
: 0;
1848 s
-= declen
+ 1; /* +1 to skip the point */
1851 /* now copy the chars after the point */
1852 memcpy(t
+ 1, dp
+ 1, declen
);
1855 /* add decimal point */
// The separator is copied so that its last byte lands on t, then t moves left past it.
1856 if (!dec_point
.empty()) {
1857 memcpy(t
+ (1 - dec_point
.size()), dec_point
.data(), dec_point
.size());
1858 t
-= dec_point
.size();
1862 /* copy the numbers before the decimal point, adding thousand
1863 * separator every three digits */
1864 while(s
>= tmpbuf
) {
// `count` tracks digits copied; every third one is followed by a separator
// as long as source digits remain.
1866 if (thousand_sep
&& (++count
%3)==0 && s
>=tmpbuf
) {
1867 memcpy(t
+ (1 - thousand_sep
.size()),
1868 thousand_sep
.data(),
1869 thousand_sep
.size());
1870 t
-= thousand_sep
.size();
1874 /* and a minus sign, if needed */
// Publish the computed length on the result string.
1879 resstr
.setSize(reslen
);
1883 ///////////////////////////////////////////////////////////////////////////////
1886 /* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */
// Builds the 4-character soundex code of `str`. The caller must pass a
// non-empty string (asserted below).
1887 String
string_soundex(const String
& str
) {
1888 assertx(!str
.empty());
1889 int _small
, code
, last
;
// The result is always sized to 4 characters (see setSize at the end).
1890 String
retString(4, ReserveString
);
1891 char* soundex
= retString
.mutableData();
// Lookup table indexed by the uppercase letter offset from A.
1893 static char soundex_table
[26] = {
1922 /* build soundex string */
// `_small` counts output characters written so far; the scan stops after
// four of them or at the terminating NUL of the input.
1924 auto p
= str
.slice().data();
1925 for (_small
= 0; *p
&& _small
< 4; p
++) {
1926 /* convert chars to upper case and strip non-letter chars */
1927 /* BUG: should also map here accented letters used in non */
1928 /* English words or names (also found in English text!): */
1929 /* esstsett, thorn, n-tilde, c-cedilla, s-caron, ... */
1930 code
= toupper((int)(unsigned char)(*p
));
1931 if (code
>= 'A' && code
<= 'Z') {
1933 /* remember first valid char */
1934 soundex
[_small
++] = code
;
// `last` remembers the table code of the letter just emitted.
1935 last
= soundex_table
[code
- 'A'];
1937 /* ignore sequences of consonants with same soundex */
1938 /* code in trail, and vowels unless they separate */
1939 /* consonant letters */
1940 code
= soundex_table
[code
- 'A'];
1943 soundex
[_small
++] = code
;
1950 /* pad with '0' and terminate with 0 ;-) */
1951 while (_small
< 4) {
1952 soundex
[_small
++] = '0';
1954 retString
.setSize(4);
1958 ///////////////////////////////////////////////////////////////////////////////
1962 * this is now the original code by Michael G Schwern:
1963 * I've changed it just slightly (use emalloc,
1964 * get rid of includes etc)
1965 * - thies - 13.09.1999
1968 /*----------------------------- */
1969 /* this used to be "metaphone.h" */
1970 /*----------------------------- */
1972 /* Special encodings */
1976 /*----------------------------- */
1977 /* end of "metaphone.h" */
1978 /*----------------------------- */
1980 /*----------------------------- */
1981 /* this used to be "metachar.h" */
1982 /*----------------------------- */
1984 /* Metachar.h ... little bits about characters for metaphone */
1985 /*-- Character encoding array & accessing macros --*/
1986 /* Stolen directly out of the book... */
// Per-letter flag bits for A..Z, tested by the macros that follow:
// 1 vowel, 2 passed through unchanged, 4 forms a diphthong before H,
// 8 makes C and G soft, 16 prevents GH from becoming F.
1987 char _codes
[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};
// ENCODE yields the flag byte for a letter and 0 for anything that is not a letter.
1989 #define ENCODE(c) (isalpha(c) ? _codes[((toupper(c)) - 'A')] : 0)
1991 #define isvowel(c) (ENCODE(c) & 1) /* AEIOU */
1993 /* These letters are passed through unchanged */
1994 #define NOCHANGE(c) (ENCODE(c) & 2) /* FJMNR */
1996 /* These form diphthongs when preceding H */
1997 #define AFFECTH(c) (ENCODE(c) & 4) /* CGPST */
1999 /* These make C and G soft */
2000 #define MAKESOFT(c) (ENCODE(c) & 8) /* EIY */
2002 /* These prevent GH from becoming F */
2003 #define NOGHTOF(c) (ENCODE(c) & 16) /* BDH */
2005 /*----------------------------- */
2006 /* end of "metachar.h" */
2007 /*----------------------------- */
2009 /* I suppose I could have been using a character pointer instead of
2010 * accessing the array directly... */
/*
 * Letter-access helpers for the metaphone scanner.  They expect two names in
 * the enclosing scope: `word` (the input bytes, NUL-terminated) and `w_idx`
 * (the current index into it).  Each yields an upper-cased char.
 */

/* Look at the next letter in the word */
#define Next_Letter ((char)toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter ((char)toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would step before the start. */
#define Look_Back_Letter(n) \
  (w_idx >= (n) ? (char)toupper(word[w_idx-(n)]) : '\0')
/* Previous letter, or '\0' when already at the first position. */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  The Next_Letter test makes sure we never read past
 * the terminating NUL. */
#define After_Next_Letter \
  (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) : '\0')
/* Look n letters ahead through Lookahead(), which stops at the terminator.
 * The result is widened through unsigned char so toupper() never receives a
 * negative value. */
#define Look_Ahead_Letter(n) \
  ((char)toupper((unsigned char)Lookahead(word+w_idx, (n))))
2025 /* Allows us to safely look ahead an arbitrary # of letters */
2026 /* I probably could have just used strlen... */
/*
 * Returns the byte `how_far` positions past `word`, or the terminating '\0'
 * when the string ends first, so the caller never reads beyond the terminator.
 */
static char Lookahead(unsigned char *word, int how_far) {
  int idx = 0;

  /* Edge forward in the string, stopping early at the terminator. */
  while (word[idx] != '\0' && idx < how_far) {
    idx++;
  }

  /* idx is now either equal to how_far or at the end of the string. */
  return (char)word[idx];
}
2039 /* phonize one letter
2040 * We don't know the buffers size in advance. One way to solve this is to just
2041 * re-allocate the buffer size. We're using an extra of 2 characters (this
2042 * could be one though; or more too). */
// As written here, Phonize simply appends to the `buffer` object in scope.
2043 #define Phonize(c) { buffer.append(c); }
2044 /* How long is the phoned word? */
2045 #define Phone_Len (buffer.size())
2047 /* Note if a letter is a 'break' in the word */
2048 #define Isbreak(c) (!isalpha(c))
// Computes the metaphone key of `input`, collecting phonemes in a StringBuffer.
// A `max_phonemes` of 0 means no limit (see the main loop condition).
2050 String
string_metaphone(const char *input
, int word_len
, long max_phonemes
,
2052 unsigned char *word
= (unsigned char *)input
;
2054 int w_idx
= 0; /* point in the phonization we're at. */
2055 int max_buffer_len
= 0; /* maximum length of the destination buffer */
2057 /*-- Parameter checks --*/
2058 /* Negative phoneme length is meaningless */
2060 if (max_phonemes
< 0)
2063 /* Empty/null string is meaningless */
2064 /* Overly paranoid */
2065 /* always_assert(word != NULL && word[0] != '\0'); */
2067 if (word
== nullptr)
2070 /*-- Allocate memory for our phoned_phrase --*/
2071 if (max_phonemes
== 0) { /* Assume largest possible */
2072 max_buffer_len
= word_len
;
2074 max_buffer_len
= max_phonemes
;
// The phoneme output accumulates in this buffer through the Phonize macro.
2076 StringBuffer
buffer(max_buffer_len
);
2078 /*-- The first phoneme has to be processed specially. --*/
2079 /* Find our first letter */
2080 for (; !isalpha(Curr_Letter
); w_idx
++) {
2081 /* On the off chance we were given nothing but crap... */
2082 if (Curr_Letter
== '\0') {
2083 return buffer
.detach(); /* For testing */
2087 switch (Curr_Letter
) {
2090 if (Next_Letter
== 'E') {
2094 /* Remember, preserve vowels at the beginning */
2100 /* [GKP]N becomes N */
2104 if (Next_Letter
== 'N') {
2111 W if followed by a vowel */
2113 if (Next_Letter
== 'H' ||
2114 Next_Letter
== 'R') {
2115 Phonize(Next_Letter
);
2117 } else if (isvowel(Next_Letter
)) {
2128 /* Vowels are kept */
2137 Phonize(Curr_Letter
);
2145 /* On to the metaphoning */
// The main loop runs until the end of the word or until the phoneme limit is reached.
2146 for (; Curr_Letter
!= '\0' &&
2147 (max_phonemes
== 0 || Phone_Len
< max_phonemes
);
2149 /* How many letters to skip because an earlier encoding handled
2150 * multiple letters */
2151 unsigned short int skip_letter
= 0;
2154 /* THOUGHT: It would be nice if, rather than having things like...
2155 * well, SCI. For SCI you encode the S, then have to remember
2156 * to skip the C. So the phoneme SCI invades both S and C. It would
2157 * be better, IMHO, to skip the C from the S part of the encoding.
2158 * Hell, I'm trying it.
2161 /* Ignore non-alphas */
2162 if (!isalpha(Curr_Letter
))
2165 /* Drop duplicates, except CC */
2166 if (Curr_Letter
== Prev_Letter
&&
2170 switch (Curr_Letter
) {
2171 /* B -> B unless in MB */
2173 if (Prev_Letter
!= 'M')
2176 /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
2177 * (SCHW is handled in S)
2178 * S if -CI-, -CE- or -CY-
2179 * dropped if -SCI-, SCE-, -SCY- (handled in S)
2183 if (MAKESOFT(Next_Letter
)) { /* C[IEY] */
2184 if (After_Next_Letter
== 'A' &&
2185 Next_Letter
== 'I') { /* CIA */
2189 else if (Prev_Letter
== 'S') {
2194 } else if (Next_Letter
== 'H') {
2195 if ((!traditional
) && (After_Next_Letter
== 'R' ||
2196 Prev_Letter
== 'S')) { /* Christ, School */
2206 /* J if in -DGE-, -DGI- or -DGY-
2210 if (Next_Letter
== 'G' && MAKESOFT(After_Next_Letter
)) {
2216 /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
2217 * else dropped if -GNED, -GN,
2218 * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
2219 * else J if in -GE-, -GI, -GY and not GG
2223 if (Next_Letter
== 'H') {
2224 if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) {
2230 } else if (Next_Letter
== 'N') {
2231 if (Isbreak(After_Next_Letter
) ||
2232 (After_Next_Letter
== 'E' && Look_Ahead_Letter(3) == 'D')) {
2236 } else if (MAKESOFT(Next_Letter
) && Prev_Letter
!= 'G') {
2242 /* H if before a vowel and not after C,G,P,S,T */
2244 if (isvowel(Next_Letter
) && !AFFECTH(Prev_Letter
))
2247 /* dropped if after C
2251 if (Prev_Letter
!= 'C')
2258 if (Next_Letter
== 'H') {
2269 /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
2273 if (Next_Letter
== 'I' &&
2274 (After_Next_Letter
== 'O' || After_Next_Letter
== 'A')) {
2276 } else if (Next_Letter
== 'H') {
2279 } else if ((!traditional
) &&
2280 (Next_Letter
== 'C' && Look_Ahead_Letter(2) == 'H' &&
2281 Look_Ahead_Letter(3) == 'W')) {
2288 /* 'sh' in -TIA- or -TIO-
2289 * else 'th' before H
2293 if (Next_Letter
== 'I' &&
2294 (After_Next_Letter
== 'O' || After_Next_Letter
== 'A')) {
2296 } else if (Next_Letter
== 'H') {
2307 /* W before a vowel, else dropped */
2309 if (isvowel(Next_Letter
))
2317 /* Y if followed by a vowel */
2319 if (isvowel(Next_Letter
))
2326 /* No transformation */
2333 Phonize(Curr_Letter
);
2340 w_idx
+= skip_letter
;
2343 return buffer
.detach();
2346 ///////////////////////////////////////////////////////////////////////////////
2350 * These are code tables for different Cyrillic charsets (relative to koi8-r).
2351 * Each table holds 512 entries: two 256-entry maps covering every byte value.
2352 * The first 256 entries convert from the corresponding charset to koi8-r,
2353 * the second 256 entries perform the reverse conversion, from koi8-r to the charset.
2355 * Here we have the following tables:
2356 * _cyr_win1251 - for windows-1251 charset
2357 * _cyr_iso88595 - for iso8859-5 charset
2358 * _cyr_cp866 - for x-cp866 charset
2359 * _cyr_mac - for x-mac-cyrillic charset
// Storage type for one charset table: 512 bytes, read by
// string_convert_cyrillic_string as two 256-entry halves
// (offsets 0..255 and 256..511).
2361 typedef unsigned char _cyr_charset_table
[512];
// windows-1251 table. Entries 0..255 map a windows-1251 byte to koi8-r;
// entries 256..511 map a koi8-r byte back to windows-1251. Both halves are
// the identity for byte values 0..127.
2363 static const _cyr_charset_table _cyr_win1251
= {
2364 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2365 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2366 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2367 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2368 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2369 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2370 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2371 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2372 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2373 46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2374 154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
2375 46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
2376 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2377 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2378 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2379 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2380 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2381 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2382 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2383 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2384 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2385 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2386 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2387 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2388 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2389 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2390 32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
2391 32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
2392 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2393 239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2394 222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
2395 207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
// x-cp866 table. Entries 0..255 map a cp866 byte to koi8-r; entries 256..511
// map a koi8-r byte back to cp866. Both halves are the identity for byte
// values 0..127.
2398 static const _cyr_charset_table _cyr_cp866
= {
2399 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2400 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2401 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2402 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2403 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2404 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2405 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2406 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2407 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2408 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2409 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2410 35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
2411 43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
2412 45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
2413 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2414 179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
2415 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2416 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2417 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2418 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2419 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2420 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2421 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2422 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2423 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2424 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2425 205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
2426 199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
2427 238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
2428 175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
2429 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2430 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
// iso8859-5 table. Entries 0..255 map an iso8859-5 byte to koi8-r; entries
// 256..511 map a koi8-r byte back to iso8859-5. Both halves are the identity
// for byte values 0..127.
2433 static const _cyr_charset_table _cyr_iso88595
= {
2434 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2435 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2436 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2437 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2438 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2439 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2440 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2441 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2442 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2443 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2444 32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2445 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2446 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2447 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2448 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2449 32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2450 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2451 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2452 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2453 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2454 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2455 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2456 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2457 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2458 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2459 32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2460 32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
2461 32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
2462 238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
2463 223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
2464 206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
2465 191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
// x-mac-cyrillic table. Entries 0..255 map a mac-cyrillic byte to koi8-r;
// entries 256..511 map a koi8-r byte back to mac-cyrillic. Both halves are
// the identity for byte values 0..127.
2468 static const _cyr_charset_table _cyr_mac
= {
2469 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2470 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2471 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2472 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2473 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2474 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2475 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2476 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2477 225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2478 242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2479 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2480 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2481 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2482 144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
2483 193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2484 210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
2485 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
2486 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2487 32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2488 48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2489 64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2490 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2491 96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2492 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2493 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2494 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2495 160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
2496 176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
2497 254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2498 239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2499 158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2500 143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2504 * This is the function that performs the real conversion of the string
2507 * input - string to be converted (the result is written to a new string)
2508 * from,to - one-symbol label of source and destination charset
2509 * The following symbols are used as labels:
2515 * m - x-mac-cyrillic
// Converts `input` between Cyrillic charsets one byte at a time. Each byte is
// first mapped through the source table (when one is set) and then through
// the second half of the destination table (when one is set). The output is
// written to a newly reserved String of the same size as the input.
2517 String
string_convert_cyrillic_string(const String
& input
, char from
, char to
) {
2518 const unsigned char *from_table
, *to_table
;
2520 auto uinput
= (unsigned char*)input
.slice().data();
2521 String
retString(input
.size(), ReserveString
);
2522 unsigned char *str
= (unsigned char *)retString
.mutableData();
// A null table pointer means that side of the conversion is a pass-through
// (see the null checks in the copy loop).
2524 from_table
= nullptr;
// The source charset label is matched case-insensitively.
2527 switch (toupper((int)(unsigned char)from
)) {
2528 case 'W': from_table
= _cyr_win1251
; break;
2530 case 'D': from_table
= _cyr_cp866
; break;
2531 case 'I': from_table
= _cyr_iso88595
; break;
2532 case 'M': from_table
= _cyr_mac
; break;
2536 throw_invalid_argument("Unknown source charset: %c", from
);
// The destination charset label is matched the same way.
2540 switch (toupper((int)(unsigned char)to
)) {
2541 case 'W': to_table
= _cyr_win1251
; break;
2543 case 'D': to_table
= _cyr_cp866
; break;
2544 case 'I': to_table
= _cyr_iso88595
; break;
2545 case 'M': to_table
= _cyr_mac
; break;
2549 throw_invalid_argument("Unknown destination charset: %c", to
);
// Step 1 uses entries 0..255 of the source table; step 2 uses entries
// 256..511 of the destination table.
2553 for (int i
= 0; i
< input
.size(); i
++) {
2554 tmp
= from_table
== nullptr ? uinput
[i
] : from_table
[uinput
[i
]];
2555 str
[i
] = to_table
== nullptr ? tmp
: to_table
[tmp
+ 256];
// The output has exactly as many bytes as the input.
2557 retString
.setSize(input
.size());
2561 ///////////////////////////////////////////////////////////////////////////////
// Block kinds used while splitting the input into runs.
2564 #define HEB_BLOCK_TYPE_ENG 1
2565 #define HEB_BLOCK_TYPE_HEB 2
// Range test for byte values 224..250 after a cast to unsigned char.
2568 (((((unsigned char) c) >= 224) && (((unsigned char) c) <= 250)) ? 1 : 0)
// _isblank: space or tab. _isnewline: line feed or carriage return.
2569 #define _isblank(c) \
2570 (((((unsigned char) c) == ' ' || ((unsigned char) c) == '\t')) ? 1 : 0)
2571 #define _isnewline(c) \
2572 (((((unsigned char) c) == '\n' || ((unsigned char) c) == '\r')) ? 1 : 0)
2575 * Converts Logical Hebrew text (Hebrew Windows style) to Visual text
2576 * Cheers/complaints/flames - Zeev Suraski <zeev@php.net>
// Converts logical-order Hebrew text to visual order. The input is split
// into Hebrew and non-Hebrew blocks that are written to a scratch buffer
// (heb_str); that buffer is then broken into lines and copied to brokenStr.
// When `convert_newlines` is set, newlines are replaced with <br /> followed
// by a newline. The caller must pass a non-empty string (asserted below).
2579 string_convert_hebrew_string(const String
& inStr
, int /*max_chars_per_line*/,
2580 int convert_newlines
) {
2581 assertx(!inStr
.empty());
2582 auto str
= inStr
.data();
2583 auto str_len
= inStr
.size();
2585 char *heb_str
, *broken_str
;
2587 int block_start
, block_end
, block_type
, block_length
, i
;
2589 int begin
, end
, char_count
, orig_begin
;
2592 block_start
=block_end
=0;
// Scratch buffer allocated with req::malloc_noptrs and released by the
// SCOPE_EXIT block below.
2594 heb_str
= (char *) req::malloc_noptrs(str_len
+ 1);
2595 SCOPE_EXIT
{ req::free(heb_str
); };
// NOTE(review): target starts at heb_str + str_len, which suggests the
// scratch buffer is filled back to front - confirm.
2596 target
= heb_str
+str_len
;
2603 block_type
= HEB_BLOCK_TYPE_HEB
;
2605 block_type
= HEB_BLOCK_TYPE_ENG
;
2609 if (block_type
== HEB_BLOCK_TYPE_HEB
) {
// Extend the Hebrew block while the next byte is Hebrew, blank, punctuation
// or a line feed, and the end of the input has not been reached.
2610 while ((isheb((int)*(tmp
+1)) ||
2611 _isblank((int)*(tmp
+1)) ||
2612 ispunct((int)*(tmp
+1)) ||
2613 (int)*(tmp
+1)=='\n' ) && block_end
<str_len
-1) {
2618 for (i
= block_start
; i
<= block_end
; i
++) {
// Paired punctuation is mirrored when written to the target.
2621 case '(': *target
= ')'; break;
2622 case ')': *target
= '('; break;
2623 case '[': *target
= ']'; break;
2624 case ']': *target
= '['; break;
2625 case '{': *target
= '}'; break;
2626 case '}': *target
= '{'; break;
2627 case '<': *target
= '>'; break;
2628 case '>': *target
= '<'; break;
2629 case '\\': *target
= '/'; break;
2630 case '/': *target
= '\\'; break;
2636 block_type
= HEB_BLOCK_TYPE_ENG
;
// Extend the non-Hebrew block until a Hebrew byte, a line feed or the end of
// the input.
2638 while (!isheb(*(tmp
+1)) &&
2639 (int)*(tmp
+1)!='\n' && block_end
< str_len
-1) {
// Trailing blanks and punctuation, other than slash and dash, are tested
// here while the block still holds more than one byte.
2644 while ((_isblank((int)*tmp
) ||
2645 ispunct((int)*tmp
)) && *tmp
!='/' &&
2646 *tmp
!='-' && block_end
> block_start
) {
2650 for (i
= block_end
; i
>= block_start
; i
--) {
2654 block_type
= HEB_BLOCK_TYPE_HEB
;
2656 block_start
=block_end
+1;
2657 } while (block_end
< str_len
-1);
// Second phase: break heb_str into lines and copy them into brokenStr.
2659 String
brokenStr(str_len
, ReserveString
);
2660 broken_str
= brokenStr
.mutableData();
2661 begin
=end
=str_len
-1;
2662 target
= broken_str
;
// A max_chars of zero disables the per-line character limit in this loop.
2666 while ((!max_chars
|| char_count
< max_chars
) && begin
> 0) {
2669 if (begin
<= 0 || _isnewline(heb_str
[begin
])) {
2670 while (begin
> 0 && _isnewline(heb_str
[begin
-1])) {
2677 if (char_count
== max_chars
) { /* try to avoid breaking words */
2678 int new_char_count
=char_count
, new_begin
=begin
;
2680 while (new_char_count
> 0) {
2681 if (_isblank(heb_str
[new_begin
]) || _isnewline(heb_str
[new_begin
])) {
2687 if (new_char_count
> 0) {
2688 char_count
=new_char_count
;
// A blank at the chosen break position is overwritten with a newline.
2694 if (_isblank(heb_str
[begin
])) {
2695 heb_str
[begin
]='\n';
2697 while (begin
<= end
&& _isnewline(heb_str
[begin
])) {
2698 /* skip leading newlines */
2701 for (i
= begin
; i
<= end
; i
++) { /* copy content */
2702 *target
= heb_str
[i
];
// Newlines found from orig_begin onward are copied after the line content.
2705 for (i
= orig_begin
; i
<= end
&& _isnewline(heb_str
[i
]); i
++) {
2706 *target
= heb_str
[i
];
// Optional pass that rewrites each newline as <br /> plus a newline.
2719 if (convert_newlines
) {
2721 auto ret
= string_replace(broken_str
, str_len
, "\n", strlen("\n"),
2722 "<br />\n", strlen("<br />\n"), count
, true);
2723 if (!ret
.isNull()) {
// The broken-up text has the same length as the input.
2727 brokenStr
.setSize(str_len
);
2731 ///////////////////////////////////////////////////////////////////////////////