hphp/runtime/base/zend-string.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   6    | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
   7    +----------------------------------------------------------------------+
   8    | This source file is subject to version 2.00 of the Zend license,     |
   9    | that is bundled with this package in the file LICENSE, and is        |
  10    | available through the world-wide-web at the following url:           |
  11    | http://www.zend.com/license/2_00.txt.                                |
  12    | If you did not receive a copy of the Zend license and are unable to  |
  13    | obtain it through the world-wide-web, please send a note to          |
  14    | license@zend.com so we can mail you a copy immediately.              |
  15    +----------------------------------------------------------------------+
  16 */
  17
  18 #include "hphp/runtime/base/zend-string.h"
  19 #include "hphp/runtime/base/zend-printf.h"
  20 #include "hphp/runtime/base/zend-math.h"
  21
  22 #include "hphp/util/lock.h"
  23 #include "hphp/util/overflow.h"
  24 #include <cmath>
  25
  26 #ifndef _MSC_VER
  27 #include <monetary.h>
  28 #endif
  29
  30 #include "hphp/util/bstring.h"
  31 #include "hphp/runtime/base/exceptions.h"
  32 #include "hphp/runtime/base/string-buffer.h"
  33 #include "hphp/runtime/base/runtime-error.h"
  34 #include "hphp/runtime/base/string-util.h"
  35 #include "hphp/runtime/base/builtin-functions.h"
  36
  37 #include <folly/portability/String.h>
  38
// Maximum length of one encoded output line. string_quoted_printable_encode()
// inserts a soft line break ("=\r\n") before a line would grow past this.
#define PHP_QPRINT_MAXL 75
  40
  41 namespace HPHP {
  42 ///////////////////////////////////////////////////////////////////////////////
  43 // helpers
  44
  45 void string_charmask(const char *sinput, int len, char *mask) {
  46   const unsigned char *input = (unsigned char *)sinput;
  47   const unsigned char *end;
  48   unsigned char c;
  49
  50   memset(mask, 0, 256);
  51   for (end = input+len; input < end; input++) {
  52     c=*input;
  53     if ((input+3 < end) && input[1] == '.' && input[2] == '.'
  54         && input[3] >= c) {
  55       memset(mask+c, 1, input[3] - c + 1);
  56       input+=3;
  57     } else if ((input+1 < end) && input[0] == '.' && input[1] == '.') {
  58       /* Error, try to be as helpful as possible:
  59          (a range ending/starting with '.' won't be captured here) */
  60       if (end-len >= input) { /* there was no 'left' char */
  61         throw_invalid_argument
  62           ("charlist: Invalid '..'-range, missing left of '..'");
  63         continue;
  64       }
  65       if (input+2 >= end) { /* there is no 'right' char */
  66         throw_invalid_argument
  67           ("charlist: Invalid '..'-range, missing right of '..'");
  68         continue;
  69       }
  70       if (input[-1] > input[2]) { /* wrong order */
  71         throw_invalid_argument
  72           ("charlist: '..'-range needs to be incrementing");
  73         continue;
  74       }
  75       /* FIXME: better error (a..b..c is the only left possibility?) */
  76       throw_invalid_argument("charlist: Invalid '..'-range");
  77       continue;
  78     } else {
  79       mask[c]=1;
  80     }
  81   }
  82 }
  83
  84 int string_copy(char *dst, const char *src, int siz) {
  85   register char *d = dst;
  86   register const char *s = src;
  87   register size_t n = siz;
  88
  89   /* Copy as many bytes as will fit */
  90   if (n != 0 && --n != 0) {
  91     do {
  92       if ((*d++ = *s++) == 0)
  93         break;
  94     } while (--n != 0);
  95   }
  96
  97   /* Not enough room in dst, add NUL and traverse rest of src */
  98   if (n == 0) {
  99     if (siz != 0)
 100       *d = '\0';    /* NUL-terminate dst */
 101     while (*s++)
 102       ;
 103   }
 104
 105   return(s - src - 1);  /* count does not include NUL */
 106 }
 107
 108 ///////////////////////////////////////////////////////////////////////////////
 109 // comparisons
 110
 111 int string_ncmp(const char *s1, const char *s2, int len) {
 112   for (int i = 0; i < len; i++) {
 113     char c1 = s1[i];
 114     char c2 = s2[i];
 115     if (c1 > c2) return 1;
 116     if (c1 < c2) return -1;
 117   }
 118   return 0;
 119 }
 120
 121 static int compare_right(char const **a, char const *aend,
 122                          char const **b, char const *bend) {
 123   int bias = 0;
 124
 125   /* The longest run of digits wins.  That aside, the greatest
 126      value wins, but we can't know that it will until we've scanned
 127      both numbers to know that they have the same magnitude, so we
 128      remember it in BIAS. */
 129   for(;; (*a)++, (*b)++) {
 130     if ((*a == aend || !isdigit((int)(unsigned char)**a)) &&
 131         (*b == bend || !isdigit((int)(unsigned char)**b)))
 132       return bias;
 133     else if (*a == aend || !isdigit((int)(unsigned char)**a))
 134       return -1;
 135     else if (*b == bend || !isdigit((int)(unsigned char)**b))
 136       return +1;
 137     else if (**a < **b) {
 138       if (!bias)
 139         bias = -1;
 140     } else if (**a > **b) {
 141       if (!bias)
 142         bias = +1;
 143     }
 144   }
 145
 146   return 0;
 147 }
 148
 149 static int compare_left(char const **a, char const *aend,
 150                         char const **b, char const *bend) {
 151   /* Compare two left-aligned numbers: the first to have a
 152      different value wins. */
 153   for(;; (*a)++, (*b)++) {
 154     if ((*a == aend || !isdigit((int)(unsigned char)**a)) &&
 155         (*b == bend || !isdigit((int)(unsigned char)**b)))
 156       return 0;
 157     else if (*a == aend || !isdigit((int)(unsigned char)**a))
 158       return -1;
 159     else if (*b == bend || !isdigit((int)(unsigned char)**b))
 160       return +1;
 161     else if (**a < **b)
 162       return -1;
 163     else if (**a > **b)
 164       return +1;
 165   }
 166
 167   return 0;
 168 }
 169
/**
 * "Natural order" comparison of two byte strings: runs of digits are compared
 * as numbers rather than character by character.
 *
 * If either string is empty the result is a_len - b_len converted to int.
 * Otherwise the strings are walked in parallel: whitespace is skipped, a pair
 * of digit runs is handed to compare_left() (when either run starts with '0')
 * or compare_right(), and any other pair of characters is compared directly,
 * after toupper() when fold_case is non-zero.
 *
 * @param a, a_len   first string and its length in bytes
 * @param b, b_len   second string and its length in bytes
 * @param fold_case  non-zero to ignore case for non-digit characters
 * @return           < 0, 0 or > 0 (always -1/0/+1 when both are non-empty)
 */
int string_natural_cmp(char const *a, size_t a_len,
                       char const *b, size_t b_len, int fold_case) {
  char ca, cb;
  char const *ap, *bp;
  char const *aend = a + a_len, *bend = b + b_len;
  int fractional, result;

  /* An empty string only compares equal to another empty string. */
  if (a_len == 0 || b_len == 0)
    return a_len - b_len;

  ap = a;
  bp = b;
  while (1) {
    ca = *ap; cb = *bp;

    /* skip over leading spaces */
    /* NOTE(review): these loops do not test against aend/bend, so they appear
       to rely on a non-space byte (e.g. a NUL terminator) following the
       data -- confirm with callers. */
    while (isspace((int)(unsigned char)ca))
      ca = *++ap;

    while (isspace((int)(unsigned char)cb))
      cb = *++bp;

    /* process run of digits */
    if (isdigit((int)(unsigned char)ca)  &&  isdigit((int)(unsigned char)cb)) {
      /* A leading zero on either side selects the left-aligned comparison. */
      fractional = (ca == '0' || cb == '0');

      if (fractional)
        result = compare_left(&ap, aend, &bp, bend);
      else
        result = compare_right(&ap, aend, &bp, bend);

      if (result != 0)
        return result;
      else if (ap == aend && bp == bend)
        /* End of the strings. Let caller sort them out. */
        return 0;
      else {
        /* Keep on comparing from the current point. */
        ca = *ap; cb = *bp;
      }
    }

    if (fold_case) {
      ca = toupper((int)(unsigned char)ca);
      cb = toupper((int)(unsigned char)cb);
    }

    /* Note: this compares plain char values. */
    if (ca < cb)
      return -1;
    else if (ca > cb)
      return +1;

    ++ap; ++bp;
    if (ap >= aend && bp >= bend)
      /* The strings compare the same.  Perhaps the caller
         will want to call strcmp to break the tie. */
      return 0;
    else if (ap >= aend)
      /* a is a proper prefix of b */
      return -1;
    else if (bp >= bend)
      /* b is a proper prefix of a */
      return 1;
  }
}
 233
 234 ///////////////////////////////////////////////////////////////////////////////
 235
 236 void string_to_case(String& s, int (*tocase)(int)) {
 237   assertx(!s.isNull());
 238   assertx(tocase);
 239   auto data = s.mutableData();
 240   auto len = s.size();
 241   for (int i = 0; i < len; i++) {
 242     data[i] = tocase(data[i]);
 243   }
 244 }
 245
 246 ///////////////////////////////////////////////////////////////////////////////
 247
 248 #define STR_PAD_LEFT            0
 249 #define STR_PAD_RIGHT           1
 250 #define STR_PAD_BOTH            2
 251
 252 String string_pad(const char *input, int len, int pad_length,
 253                   const char *pad_string, int pad_str_len,
 254                   int pad_type) {
 255   assertx(input);
 256   int num_pad_chars = pad_length - len;
 257
 258   /* If resulting string turns out to be shorter than input string,
 259      we simply copy the input and return. */
 260   if (pad_length < 0 || num_pad_chars < 0) {
 261     return String(input, len, CopyString);
 262   }
 263
 264   /* Setup the padding string values if specified. */
 265   if (pad_str_len == 0) {
 266     throw_invalid_argument("pad_string: (empty)");
 267     return String();
 268   }
 269
 270   String ret(pad_length, ReserveString);
 271   char *result = ret.mutableData();
 272
 273   /* We need to figure out the left/right padding lengths. */
 274   int left_pad, right_pad;
 275   switch (pad_type) {
 276   case STR_PAD_RIGHT:
 277     left_pad = 0;
 278     right_pad = num_pad_chars;
 279     break;
 280   case STR_PAD_LEFT:
 281     left_pad = num_pad_chars;
 282     right_pad = 0;
 283     break;
 284   case STR_PAD_BOTH:
 285     left_pad = num_pad_chars / 2;
 286     right_pad = num_pad_chars - left_pad;
 287     break;
 288   default:
 289     throw_invalid_argument("pad_type: %d", pad_type);
 290     return String();
 291   }
 292
 293   /* First we pad on the left. */
 294   int result_len = 0;
 295   for (int i = 0; i < left_pad; i++) {
 296     result[result_len++] = pad_string[i % pad_str_len];
 297   }
 298
 299   /* Then we copy the input string. */
 300   memcpy(result + result_len, input, len);
 301   result_len += len;
 302
 303   /* Finally, we pad on the right. */
 304   for (int i = 0; i < right_pad; i++) {
 305     result[result_len++] = pad_string[i % pad_str_len];
 306   }
 307   ret.setSize(result_len);
 308   return ret;
 309 }
 310
 311 ///////////////////////////////////////////////////////////////////////////////
 312
 313 int string_find(const char *input, int len, char ch, int pos,
 314                 bool case_sensitive) {
 315   assertx(input);
 316   if (pos < 0 || pos > len) {
 317     return -1;
 318   }
 319   const void *ptr;
 320   if (case_sensitive) {
 321     ptr = memchr(input + pos, ch, len - pos);
 322   } else {
 323     ptr = bstrcasechr(input + pos, ch, len - pos);
 324   }
 325   if (ptr != nullptr) {
 326     return (int)((const char *)ptr - input);
 327   }
 328   return -1;
 329 }
 330
 331 int string_rfind(const char *input, int len, char ch, int pos,
 332                  bool case_sensitive) {
 333   assertx(input);
 334   if (pos < -len || pos > len) {
 335     return -1;
 336   }
 337   const void *ptr;
 338   if (case_sensitive) {
 339     if (pos >= 0) {
 340       ptr = memrchr(input + pos, ch, len - pos);
 341     } else {
 342       ptr = memrchr(input, ch, len + pos + 1);
 343     }
 344   } else {
 345     if (pos >= 0) {
 346       ptr = bstrrcasechr(input + pos, ch, len - pos);
 347     } else {
 348       ptr = bstrrcasechr(input, ch, len + pos + 1);
 349     }
 350   }
 351   if (ptr != nullptr) {
 352     return (int)((const char *)ptr - input);
 353   }
 354   return -1;
 355 }
 356
 357 int string_find(const char *input, int len, const char *s, int s_len,
 358                 int pos, bool case_sensitive) {
 359   assertx(input);
 360   assertx(s);
 361   if (!s_len || pos < 0 || pos > len) {
 362     return -1;
 363   }
 364   void *ptr;
 365   if (case_sensitive) {
 366     ptr = (void*)string_memnstr(input + pos, s, s_len, input + len);
 367   } else {
 368     ptr = bstrcasestr(input + pos, len - pos, s, s_len);
 369   }
 370   if (ptr != nullptr) {
 371     return (int)((const char *)ptr - input);
 372   }
 373   return -1;
 374 }
 375
 376 int string_rfind(const char *input, int len, const char *s, int s_len,
 377                  int pos, bool case_sensitive) {
 378   assertx(input);
 379   assertx(s);
 380   if (!s_len || pos < -len || pos > len) {
 381     return -1;
 382   }
 383   void *ptr;
 384   if (case_sensitive) {
 385     if (pos >= 0) {
 386       ptr = bstrrstr(input + pos, len - pos, s, s_len);
 387     } else {
 388       ptr = bstrrstr(input, len + pos + s_len, s, s_len);
 389     }
 390   } else {
 391     if (pos >= 0) {
 392       ptr = bstrrcasestr(input + pos, len - pos, s, s_len);
 393     } else {
 394       ptr = bstrrcasestr(input, len + pos + s_len, s, s_len);
 395     }
 396   }
 397   if (ptr != nullptr) {
 398     return (int)((const char *)ptr - input);
 399   }
 400   return -1;
 401 }
 402
 403 const char *string_memnstr(const char *haystack, const char *needle,
 404                            int needle_len, const char *end) {
 405   const char *p = haystack;
 406   char ne = needle[needle_len-1];
 407
 408   end -= needle_len;
 409   while (p <= end) {
 410     if ((p = (char *)memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) {
 411       if (!memcmp(needle, p, needle_len-1)) {
 412         return p;
 413       }
 414     }
 415     if (p == nullptr) {
 416       return nullptr;
 417     }
 418     p++;
 419   }
 420   return nullptr;
 421 }
 422
 423 String string_replace(const char *s, int len, int start, int length,
 424                       const char *replacement, int len_repl) {
 425   assertx(s);
 426   assertx(replacement);
 427   assertx(len >= 0);
 428
 429   // if "start" position is negative, count start position from the end
 430   // of the string
 431   if (start < 0) {
 432     start = len + start;
 433     if (start < 0) {
 434       start = 0;
 435     }
 436   }
 437   if (start > len) {
 438     start = len;
 439   }
 440   // if "length" position is negative, set it to the length
 441   // needed to stop that many chars from the end of the string
 442   if (length < 0) {
 443     length = (len - start) + length;
 444     if (length < 0) {
 445       length = 0;
 446     }
 447   }
 448   // check if length is too large
 449   if (length > len) {
 450     length = len;
 451   }
 452   // check if the length is too large adjusting for non-zero start
 453   // Write this way instead of start + length > len to avoid overflow
 454   if (length > len - start) {
 455     length = len - start;
 456   }
 457
 458   String retString(len + len_repl - length, ReserveString);
 459   char *ret = retString.mutableData();
 460
 461   int ret_len = 0;
 462   if (start) {
 463     memcpy(ret, s, start);
 464     ret_len += start;
 465   }
 466   if (len_repl) {
 467     memcpy(ret + ret_len, replacement, len_repl);
 468     ret_len += len_repl;
 469   }
 470   len -= (start + length);
 471   if (len) {
 472     memcpy(ret + ret_len, s + start + length, len);
 473     ret_len += len;
 474   }
 475   retString.setSize(ret_len);
 476   return retString;
 477 }
 478
 479 String string_replace(const char *input, int len,
 480                       const char *search, int len_search,
 481                       const char *replacement, int len_replace,
 482                       int &count, bool case_sensitive) {
 483   assertx(input);
 484   assertx(search && len_search);
 485   assertx(len >= 0);
 486   assertx(len_search >= 0);
 487   assertx(len_replace >= 0);
 488
 489   if (len == 0) {
 490     return String();
 491   }
 492
 493   req::vector<int> founds;
 494   founds.reserve(16);
 495   if (len_search == 1) {
 496     for (int pos = string_find(input, len, *search, 0, case_sensitive);
 497          pos >= 0;
 498          pos = string_find(input, len, *search, pos + len_search,
 499                            case_sensitive)) {
 500       founds.push_back(pos);
 501     }
 502   } else {
 503     for (int pos = string_find(input, len, search, len_search, 0,
 504                                case_sensitive);
 505          pos >= 0;
 506          pos = string_find(input, len, search, len_search,
 507                            pos + len_search, case_sensitive)) {
 508       founds.push_back(pos);
 509     }
 510   }
 511
 512   count = founds.size();
 513   if (count == 0) {
 514     return String(); // not found
 515   }
 516
 517   int reserve;
 518
 519   // Make sure the new size of the string wouldn't overflow int32_t. Don't
 520   // bother if the replacement wouldn't make the string longer.
 521   if (len_replace > len_search) {
 522     auto raise = [&] { raise_error("String too large"); };
 523     if (mul_overflow(len_replace - len_search, count)) {
 524       raise();
 525     }
 526     int diff = (len_replace - len_search) * count;
 527     if (add_overflow(len, diff)) {
 528       raise();
 529     }
 530     reserve = len + diff;
 531   } else {
 532     reserve = len + (len_replace - len_search) * count;
 533   }
 534
 535   String retString(reserve, ReserveString);
 536   char *ret = retString.mutableData();
 537   char *p = ret;
 538   int pos = 0; // last position in input that hasn't been copied over yet
 539   int n;
 540   for (unsigned int i = 0; i < founds.size(); i++) {
 541     n = founds[i];
 542     if (n > pos) {
 543       n -= pos;
 544       memcpy(p, input, n);
 545       p += n;
 546       input += n;
 547       pos += n;
 548     }
 549     if (len_replace) {
 550       memcpy(p, replacement, len_replace);
 551       p += len_replace;
 552     }
 553     input += len_search;
 554     pos += len_search;
 555   }
 556   n = len;
 557   if (n > pos) {
 558     n -= pos;
 559     memcpy(p, input, n);
 560     p += n;
 561   }
 562   retString.setSize(p - ret);
 563   return retString;
 564 }
 565
 566 ///////////////////////////////////////////////////////////////////////////////
 567
 568 String string_chunk_split(const char *src, int srclen, const char *end,
 569                           int endlen, int chunklen) {
 570   int chunks = srclen / chunklen; // complete chunks!
 571   int restlen = srclen - chunks * chunklen; /* srclen % chunklen */
 572
 573   String ret(
 574     safe_address(
 575       chunks + 1,
 576       endlen,
 577       srclen
 578     ),
 579     ReserveString
 580   );
 581   char *dest = ret.mutableData();
 582
 583   const char *p; char *q;
 584   const char *pMax = src + srclen - chunklen + 1;
 585   for (p = src, q = dest; p < pMax; ) {
 586     memcpy(q, p, chunklen);
 587     q += chunklen;
 588     memcpy(q, end, endlen);
 589     q += endlen;
 590     p += chunklen;
 591   }
 592
 593   if (restlen) {
 594     memcpy(q, p, restlen);
 595     q += restlen;
 596     memcpy(q, end, endlen);
 597     q += endlen;
 598   }
 599
 600   ret.setSize(q - dest);
 601   return ret;
 602 }
 603
 604 ///////////////////////////////////////////////////////////////////////////////
 605
 606 #define PHP_TAG_BUF_SIZE 1023
 607
 608 /**
 609  * Check if tag is in a set of tags
 610  *
 611  * states:
 612  *
 613  * 0 start tag
 614  * 1 first non-whitespace char seen
 615  */
 616 static int string_tag_find(const char *tag, int len, const char *set) {
 617   char c, *n;
 618   const char *t;
 619   int state=0, done=0;
 620   char *norm;
 621
 622   if (len <= 0) {
 623     return 0;
 624   }
 625
 626   norm = (char *)req::malloc_noptrs(len+1);
 627   SCOPE_EXIT { req::free(norm); };
 628
 629   n = norm;
 630   t = tag;
 631   c = tolower(*t);
 632   /*
 633     normalize the tag removing leading and trailing whitespace
 634     and turn any <a whatever...> into just <a> and any </tag>
 635     into <tag>
 636   */
 637   while (!done) {
 638     switch (c) {
 639     case '<':
 640       *(n++) = c;
 641       break;
 642     case '>':
 643       done =1;
 644       break;
 645     default:
 646       if (!isspace((int)c)) {
 647         if (state == 0) {
 648           state=1;
 649         }
 650         if (c != '/') {
 651           *(n++) = c;
 652         }
 653       } else {
 654         if (state == 1)
 655           done=1;
 656       }
 657       break;
 658     }
 659     c = tolower(*(++t));
 660   }
 661   *(n++) = '>';
 662   *n = '\0';
 663   if (strstr(set, norm)) {
 664     done=1;
 665   } else {
 666     done=0;
 667   }
 668   return done;
 669 }
 670
/**
 * A simple little state-machine to strip out html and php tags.
 *
 * States:
 *   0  output state: characters are copied to the result
 *   1  inside a normal html tag
 *   2  inside a php tag ("<?")
 *   3  after "<!" (e.g. a <!DOCTYPE ...> declaration)
 *   4  inside a "<!--" comment, which only ends at "-->"
 *
 * Unlike the function this was derived from, the state is a local variable
 * here, so it is not carried over between calls.
 *
 * lc holds the last significant character read and br is a bracket
 * counter for parentheses inside a php tag. depth counts '<' characters
 * seen while already inside an html tag, and in_q holds the quote character
 * that is currently open inside a tag (0 if none).
 *
 * When an allow string is passed in we keep track of the tag text
 * in state 1 and when the tag is closed check it against the
 * allow string to see if we should allow it.
 *
 * swm: Added ability to strip <?xml tags without assuming it is PHP
 * code.
 *
 * @param s                 input bytes; must not be null
 * @param len               length of s
 * @param allow             list of tags to keep; must not be null even when
 *                          allow_len is 0
 * @param allow_len         length of allow; 0 disables the allow list
 * @param allow_tag_spaces  when false, a '<' followed by whitespace is
 *                          treated as an ordinary character
 * @return                  the input with tags removed
 */
String string_strip_tags(const char *s, const int len,
                         const char *allow, const int allow_len,
                         bool allow_tag_spaces) {
  const char *abuf, *p;
  char *rbuf, *tbuf, *tp, *rp, c, lc;

  int br, i=0, depth=0, in_q = 0;
  int state = 0, pos;

  assertx(s);
  assertx(allow);

  /* The result starts as a copy of the input and is overwritten in place
     through rp. */
  String retString(s, len, CopyString);
  rbuf = retString.mutableData();
  String allowString;

  c = *s;
  lc = '\0';
  p = s;
  rp = rbuf;
  br = 0;
  if (allow_len) {
    assertx(allow);

    /* Lower-cased copy of the allow list, used by string_tag_find(). */
    /* NOTE(review): the loop below stops at the NUL terminator of allow, not
       at allow_len, while only allow_len bytes are reserved -- this assumes
       strlen(allow) <= allow_len; confirm with callers. */
    allowString = String(allow_len, ReserveString);
    char *atmp = allowString.mutableData();
    for (const char *tmp = allow; *tmp; tmp++, atmp++) {
      *atmp = tolower((int)*(const unsigned char *)tmp);
    }
    allowString.setSize(allow_len);
    abuf = allowString.data();

    /* tbuf collects the text of the tag currently being read; tp is the
       write cursor into it. */
    tbuf = (char *)req::malloc_noptrs(PHP_TAG_BUF_SIZE+1);
    tp = tbuf;
  } else {
    abuf = nullptr;
    tbuf = tp = nullptr;
  }

  /* Grow tbuf once the cursor has reached PHP_TAG_BUF_SIZE bytes, keeping tp
     at the same offset in the reallocated buffer. */
  auto move = [&pos, &tbuf, &tp]() {
    if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
      pos = tp - tbuf;
      tbuf = (char*)req::realloc_noptrs(tbuf,
                                        (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
      tp = tbuf + pos;
    }
  };

  /* c is the byte at p; i counts the bytes consumed so far. */
  while (i < len) {
    switch (c) {
    case '\0':
      /* NUL bytes are dropped in every state. */
      break;
    case '<':
      /* NOTE(review): this reads p[1], which for the last input byte is the
         byte at s[len] -- presumably a NUL terminator; confirm. */
      if (isspace(*(p + 1)) && !allow_tag_spaces) {
        goto reg_char;
      }
      if (state == 0) {
        /* A tag starts here. */
        lc = '<';
        state = 1;
        if (allow_len) {
          move();
          *(tp++) = '<';
        }
      } else if (state == 1) {
        /* '<' inside a tag: remember that one more '>' must be skipped. */
        depth++;
      }
      break;

    case '(':
      if (state == 2) {
        /* Only count brackets that are not inside a quoted string. */
        if (lc != '"' && lc != '\'') {
          lc = '(';
          br++;
        }
      } else if (allow_len && state == 1) {
        move();
        *(tp++) = c;
      } else if (state == 0) {
        *(rp++) = c;
      }
      break;

    case ')':
      if (state == 2) {
        if (lc != '"' && lc != '\'') {
          lc = ')';
          br--;
        }
      } else if (allow_len && state == 1) {
        move();
        *(tp++) = c;
      } else if (state == 0) {
        *(rp++) = c;
      }
      break;

    case '>':
      /* First unwind any nested '<' seen inside the tag. */
      if (depth) {
        depth--;
        break;
      }

      /* A '>' inside a quoted attribute value does not close the tag. */
      if (in_q) {
        break;
      }

      switch (state) {
      case 1: /* HTML/XML */
        lc = '>';
        in_q = state = 0;
        if (allow_len) {
          /* Finish the collected tag and copy it to the output if it is on
             the allow list. */
          move();
          *(tp++) = '>';
          *tp='\0';
          if (string_tag_find(tbuf, tp-tbuf, abuf)) {
            memcpy(rp, tbuf, tp-tbuf);
            rp += tp-tbuf;
          }
          tp = tbuf;
        }
        break;

      case 2: /* PHP */
        /* Only "?>" outside of brackets and double quotes ends the tag. */
        if (!br && lc != '\"' && *(p-1) == '?') {
          in_q = state = 0;
          tp = tbuf;
        }
        break;

      case 3:
        /* End of a "<!" declaration. */
        in_q = state = 0;
        tp = tbuf;
        break;

      case 4: /* JavaScript/CSS/etc... */
        /* A comment only ends at "-->". */
        if (p >= s + 2 && *(p-1) == '-' && *(p-2) == '-') {
          in_q = state = 0;
          tp = tbuf;
        }
        break;

      default:
        /* State 0: a stray '>' is ordinary text. */
        *(rp++) = c;
        break;
      }
      break;

    case '"':
    case '\'':
      if (state == 4) {
        /* Inside <!-- comment --> */
        break;
      } else if (state == 2 && *(p-1) != '\\') {
        /* Track the open quote of a php string in lc. */
        if (lc == c) {
          lc = '\0';
        } else if (lc != '\\') {
          lc = c;
        }
      } else if (state == 0) {
        *(rp++) = c;
      } else if (allow_len && state == 1) {
        move();
        *(tp++) = c;
      }
      /* Inside any tag, an unescaped quote opens a quoted section, and the
         matching quote character closes it again. */
      if (state && p != s && *(p-1) != '\\' && (!in_q || *p == in_q)) {
        if (in_q) {
          in_q = 0;
        } else {
          in_q = *p;
        }
      }
      break;

    case '!':
      /* JavaScript & Other HTML scripting languages */
      if (state == 1 && *(p-1) == '<') {
        /* "<!" switches to the declaration state. */
        state = 3;
        lc = c;
      } else {
        if (state == 0) {
          *(rp++) = c;
        } else if (allow_len && state == 1) {
          move();
          *(tp++) = c;
        }
      }
      break;

    case '-':
      /* "<!--" switches from the declaration state to the comment state. */
      if (state == 3 && p >= s + 2 && *(p-1) == '-' && *(p-2) == '!') {
        state = 4;
      } else {
        goto reg_char;
      }
      break;

    case '?':

      /* "<?" switches to the php state. */
      if (state == 1 && *(p-1) == '<') {
        br=0;
        state=2;
        break;
      }
      /* fall-through */

    case 'E':
    case 'e':
      /* !DOCTYPE exception */
      /* The six bytes before this one spell "doctyp" (any case), so the
         declaration is handled like a normal html tag from here on. */
      if (state==3 && p > s+6
          && tolower(*(p-1)) == 'p'
          && tolower(*(p-2)) == 'y'
          && tolower(*(p-3)) == 't'
          && tolower(*(p-4)) == 'c'
          && tolower(*(p-5)) == 'o'
          && tolower(*(p-6)) == 'd') {
        state = 1;
        break;
      }
      /* fall-through */

    case 'l':

      /* swm: If we encounter '<?xml' then we shouldn't be in
       * state == 2 (PHP). Switch back to HTML.
       */

      if (state == 2 && p > s+2 && *(p-1) == 'm' && *(p-2) == 'x') {
        state = 1;
        break;
      }

      /* fall-through */
    default:
    reg_char:
      /* Ordinary character: output it, or collect it as part of the tag. */
      if (state == 0) {
        *(rp++) = c;
      } else if (allow_len && state == 1) {
        move();
        *(tp++) = c;
      }
      break;
    }
    c = *(++p);
    i++;
  }
  /* Terminate the result when it ended up shorter than the input. */
  if (rp < rbuf + len) {
    *rp = '\0';
  }
  if (allow_len) {
    req::free(tbuf);
  }

  retString.setSize(rp - rbuf);
  return retString;
}
 944
 945 ///////////////////////////////////////////////////////////////////////////////
 946
 947 static char string_hex2int(int c) {
 948   if (isdigit(c)) {
 949     return c - '0';
 950   }
 951   if (c >= 'A' && c <= 'F') {
 952     return c - 'A' + 10;
 953   }
 954   if (c >= 'a' && c <= 'f') {
 955     return c - 'a' + 10;
 956   }
 957   return -1;
 958 }
 959
 960 String string_quoted_printable_encode(const char *input, int len) {
 961   size_t length = len;
 962   const unsigned char *str = (unsigned char*)input;
 963
 964   unsigned long lp = 0;
 965   unsigned char c;
 966   char *d, *buffer;
 967   char *hex = "0123456789ABCDEF";
 968
 969   String ret(
 970     safe_address(
 971       3,
 972       length + ((safe_address(3, length, 0)/(PHP_QPRINT_MAXL-9)) + 1),
 973       1),
 974     ReserveString
 975   );
 976   d = buffer = ret.mutableData();
 977
 978   while (length--) {
 979     if (((c = *str++) == '\015') && (*str == '\012') && length > 0) {
 980       *d++ = '\015';
 981       *d++ = *str++;
 982       length--;
 983       lp = 0;
 984     } else {
 985       if (iscntrl (c) || (c == 0x7f) || (c & 0x80) ||
 986           (c == '=') || ((c == ' ') && (*str == '\015'))) {
 987         if ((((lp+= 3) > PHP_QPRINT_MAXL) && (c <= 0x7f))
 988             || ((c > 0x7f) && (c <= 0xdf) && ((lp + 3) > PHP_QPRINT_MAXL))
 989             || ((c > 0xdf) && (c <= 0xef) && ((lp + 6) > PHP_QPRINT_MAXL))
 990             || ((c > 0xef) && (c <= 0xf4) && ((lp + 9) > PHP_QPRINT_MAXL))) {
 991           *d++ = '=';
 992           *d++ = '\015';
 993           *d++ = '\012';
 994           lp = 3;
 995         }
 996         *d++ = '=';
 997         *d++ = hex[c >> 4];
 998         *d++ = hex[c & 0xf];
 999       } else {
1000         if ((++lp) > PHP_QPRINT_MAXL) {
1001           *d++ = '=';
1002           *d++ = '\015';
1003           *d++ = '\012';
1004           lp = 1;
1005         }
1006         *d++ = c;
1007       }
1008     }
1009   }
1010   len = d - buffer;
1011
1012   ret.setSize(len);
1013   return ret;
1014 }
1015
1016 String string_quoted_printable_decode(const char *input, int len, bool is_q) {
1017   assertx(input);
1018   if (len == 0) {
1019     return String();
1020   }
1021
1022   int i = 0, j = 0, k;
1023   const char *str_in = input;
1024   String ret(len, ReserveString);
1025   char *str_out = ret.mutableData();
1026   while (i < len && str_in[i]) {
1027     switch (str_in[i]) {
1028     case '=':
1029       if (i + 2 < len && str_in[i + 1] && str_in[i + 2] &&
1030           isxdigit((int) str_in[i + 1]) && isxdigit((int) str_in[i + 2]))
1031         {
1032           str_out[j++] = (string_hex2int((int) str_in[i + 1]) << 4)
1033             + string_hex2int((int) str_in[i + 2]);
1034           i += 3;
1035         } else  /* check for soft line break according to RFC 2045*/ {
1036         k = 1;
1037         while (str_in[i + k] &&
1038                ((str_in[i + k] == 32) || (str_in[i + k] == 9))) {
1039           /* Possibly, skip spaces/tabs at the end of line */
1040           k++;
1041         }
1042         if (!str_in[i + k]) {
1043           /* End of line reached */
1044           i += k;
1045         }
1046         else if ((str_in[i + k] == 13) && (str_in[i + k + 1] == 10)) {
1047           /* CRLF */
1048           i += k + 2;
1049         }
1050         else if ((str_in[i + k] == 13) || (str_in[i + k] == 10)) {
1051           /* CR or LF */
1052           i += k + 1;
1053         }
1054         else {
1055           str_out[j++] = str_in[i++];
1056         }
1057       }
1058       break;
1059     case '_':
1060       if (is_q) {
1061         str_out[j++] = ' ';
1062         i++;
1063       } else {
1064         str_out[j++] = str_in[i++];
1065       }
1066       break;
1067     default:
1068       str_out[j++] = str_in[i++];
1069     }
1070   }
1071   ret.setSize(j);
1072   return ret;
1073 }
1074
1075 Variant string_base_to_numeric(const char *s, int len, int base) {
1076   int64_t num = 0;
1077   double fnum = 0;
1078   int mode = 0;
1079   int64_t cutoff;
1080   int cutlim;
1081
1082   assertx(string_validate_base(base));
1083
1084   cutoff = LONG_MAX / base;
1085   cutlim = LONG_MAX % base;
1086
1087   for (int i = len; i > 0; i--) {
1088     char c = *s++;
1089
1090     /* might not work for EBCDIC */
1091     if (c >= '0' && c <= '9')
1092       c -= '0';
1093     else if (c >= 'A' && c <= 'Z')
1094       c -= 'A' - 10;
1095     else if (c >= 'a' && c <= 'z')
1096       c -= 'a' - 10;
1097     else
1098       continue;
1099
1100     if (c >= base)
1101       continue;
1102
1103     switch (mode) {
1104     case 0: /* Integer */
1105       if (num < cutoff || (num == cutoff && c <= cutlim)) {
1106         num = num * base + c;
1107         break;
1108       } else {
1109         fnum = num;
1110         mode = 1;
1111       }
1112       /* fall-through */
1113     case 1: /* Float */
1114       fnum = fnum * base + c;
1115     }
1116   }
1117
1118   if (mode == 1) {
1119     return fnum;
1120   }
1121   return num;
1122 }
1123
1124 String string_long_to_base(unsigned long value, int base) {
1125   static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1126   char buf[(sizeof(unsigned long) << 3) + 1];
1127   char *ptr, *end;
1128
1129   assertx(string_validate_base(base));
1130
1131   end = ptr = buf + sizeof(buf) - 1;
1132
1133   do {
1134     *--ptr = digits[value % base];
1135     value /= base;
1136   } while (ptr > buf && value);
1137
1138   return String(ptr, end - ptr, CopyString);
1139 }
1140
1141 String string_numeric_to_base(const Variant& value, int base) {
1142   static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1143
1144   assertx(string_validate_base(base));
1145   if ((!value.isInteger() && !value.isDouble())) {
1146     return empty_string();
1147   }
1148
1149   if (value.isDouble()) {
1150     double fvalue = floor(value.toDouble()); /* floor it just in case */
1151     char *ptr, *end;
1152     char buf[(sizeof(double) << 3) + 1];
1153
1154     /* Don't try to convert +/- infinity */
1155     if (fvalue == HUGE_VAL || fvalue == -HUGE_VAL) {
1156       raise_warning("Number too large");
1157       return empty_string();
1158     }
1159
1160     end = ptr = buf + sizeof(buf) - 1;
1161
1162     do {
1163       *--ptr = digits[(int) fmod(fvalue, base)];
1164       fvalue /= base;
1165     } while (ptr > buf && fabs(fvalue) >= 1);
1166
1167     return String(ptr, end - ptr, CopyString);
1168   }
1169
1170   return string_long_to_base(value.toInt64(), base);
1171 }
1172
1173 ///////////////////////////////////////////////////////////////////////////////
1174 // uuencode
1175
/* Map a 6-bit value to its uuencode character; zero is written as '`'. */
#define PHP_UU_ENC(c) \
  ((c) ? ((c) & 077) + ' ' : '`')
/* Second output character of the 3-byte group at c: the low 2 bits of
 * byte 0 followed by the high 4 bits of byte 1. */
#define PHP_UU_ENC_C2(c) \
  PHP_UU_ENC(((*(c) << 4) & 060) | ((*((c) + 1) >> 4) & 017))
/* Third output character of the 3-byte group at c: the low 4 bits of
 * byte 1 followed by the high 2 bits of byte 2. The argument is fully
 * parenthesized so that pointer expressions expand correctly. */
#define PHP_UU_ENC_C3(c) \
  PHP_UU_ENC(((*((c) + 1) << 2) & 074) | ((*((c) + 2) >> 6) & 03))
/* Map a uuencode character back to its 6-bit value. */
#define PHP_UU_DEC(c) \
  (((c) - ' ') & 077)
1184
/**
 * Uuencode src_len bytes starting at src.
 *
 * The output is a sequence of lines, each made of one length character,
 * the encoded data (four output characters per three input bytes) and a
 * '\n'. A full line carries 45 input bytes. The result always ends with an
 * encoded zero length followed by '\n'.
 *
 * Both src and src_len must be non-zero (asserted).
 */
String string_uuencode(const char *src, int src_len) {
  assertx(src);
  assertx(src_len);

  int len = 45;   /* input bytes carried by the line being written */
  char *p;        /* write cursor into the output */
  const char *s, *e, *ee;   /* read cursor, end of input, end of line */
  char *dest;

  /* encoded length is ~ 38% greater than the original */
  String ret((int)ceil(src_len * 1.38) + 45, ReserveString);
  p = dest = ret.mutableData();
  s = src;
  e = src + src_len;

  /* Emit lines for as long as more than three input bytes remain. */
  while ((s + 3) < e) {
    ee = s + len;
    if (ee > e) {
      /* Short (last) line: only whole 3-byte groups are encoded here; the
       * leftover 1 or 2 bytes are handled after the loop. */
      ee = e;
      len = ee - s;
      if (len % 3) {
        ee = s + (int) (floor(len / 3) * 3);
      }
    }
    /* Length character; note it announces len, including any leftover
     * bytes that are only written after the loop. */
    *p++ = PHP_UU_ENC(len);

    while (s < ee) {
      *p++ = PHP_UU_ENC(*s >> 2);
      *p++ = PHP_UU_ENC_C2(s);
      *p++ = PHP_UU_ENC_C3(s);
      *p++ = PHP_UU_ENC(*(s + 2) & 077);

      s += 3;
    }

    /* Only full lines are terminated here; a short line gets its newline
     * after its tail group has been written below. */
    if (len == 45) {
      *p++ = '\n';
    }
  }

  /* Between one and three input bytes are left. */
  if (s < e) {
    if (len == 45) {
      /* The previous line (if any) was full, so the tail starts a line of
       * its own and needs its own length character. */
      *p++ = PHP_UU_ENC(e - s);
      len = 0;
    }

    /* NOTE(review): PHP_UU_ENC_C2 reads s[1] and PHP_UU_ENC_C3 reads s[2],
     * which can be the byte at e when fewer than three bytes remain. This
     * presumably relies on src being NUL-terminated -- confirm with
     * callers. */
    *p++ = PHP_UU_ENC(*s >> 2);
    *p++ = PHP_UU_ENC_C2(s);
    *p++ = ((e - s) > 1) ? PHP_UU_ENC_C3(s) : PHP_UU_ENC('\0');
    *p++ = ((e - s) > 2) ? PHP_UU_ENC(*(s + 2) & 077) : PHP_UU_ENC('\0');
  }

  /* Terminate the pending short line. */
  if (len < 45) {
    *p++ = '\n';
  }

  /* Trailing zero-length line, then a NUL that is not counted in the
   * size. */
  *p++ = PHP_UU_ENC('\0');
  *p++ = '\n';
  *p = '\0';

  ret.setSize(p - dest);
  return ret;
}
1248
1249 String string_uudecode(const char *src, int src_len) {
1250   int total_len = 0;
1251   int len;
1252   const char *s, *e, *ee;
1253   char *p, *dest;
1254
1255   String ret(ceil(src_len * 0.75), ReserveString);
1256   p = dest = ret.mutableData();
1257   s = src;
1258   e = src + src_len;
1259
1260   while (s < e) {
1261     if ((len = PHP_UU_DEC(*s++)) <= 0) {
1262       break;
1263     }
1264     /* sanity check */
1265     if (len > src_len) {
1266       goto err;
1267     }
1268
1269     total_len += len;
1270
1271     ee = s + (len == 45 ? 60 : (int) floor(len * 1.33));
1272     /* sanity check */
1273     if (ee > e) {
1274       goto err;
1275     }
1276
1277     while (s < ee) {
1278       if (s + 4 > e) goto err;
1279
1280       *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
1281       *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
1282       *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
1283       s += 4;
1284     }
1285
1286     if (len < 45) {
1287       break;
1288     }
1289
1290     /* skip \n */
1291     s++;
1292   }
1293
1294   if ((len = total_len > (p - dest))) {
1295     *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
1296     if (len > 1) {
1297       *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
1298       if (len > 2) {
1299         *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
1300       }
1301     }
1302   }
1303
1304   ret.setSize(total_len);
1305   return ret;
1306
1307  err:
1308   return String();
1309 }
1310
1311 ///////////////////////////////////////////////////////////////////////////////
1312 // base64
1313
1314 namespace {
1315
/* Encoding alphabet: the character for each 6-bit value 0..63, followed by
 * a terminating NUL. */
const char base64_table[] = {
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', '\0'
};

/* Character used to pad the final group of encoded output. */
const char base64_pad = '=';

/* Decoding table, indexed by input byte value:
 *   0..63  the 6-bit value of an alphabet character
 *   -1     tab, LF, CR and space; the decoder always skips these
 *   -2     any other byte; the decoder skips it in non-strict mode and
 *          fails on it in strict mode
 */
const short base64_reverse_table[256] = {
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
  52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
  -2,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
  15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
  -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
};
1344
1345 folly::Optional<int> maxEncodedSize(int length) {
1346   if ((length + 2) < 0 || ((length + 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) {
1347     return folly::none;
1348   }
1349   return ((length + 2) / 3) * 4;
1350 }
1351
1352 // outstr must be at least maxEncodedSize(length) bytes
1353 size_t php_base64_encode(const unsigned char *str, int length,
1354                          unsigned char* outstr) {
1355   const unsigned char *current = str;
1356   unsigned char *p = outstr;
1357
1358   while (length > 2) { /* keep going until we have less than 24 bits */
1359     *p++ = base64_table[current[0] >> 2];
1360     *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
1361     *p++ = base64_table[((current[1] & 0x0f) << 2) + (current[2] >> 6)];
1362     *p++ = base64_table[current[2] & 0x3f];
1363
1364     current += 3;
1365     length -= 3; /* we just handle 3 octets of data */
1366   }
1367
1368   /* now deal with the tail end of things */
1369   if (length != 0) {
1370     *p++ = base64_table[current[0] >> 2];
1371     if (length > 1) {
1372       *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
1373       *p++ = base64_table[(current[1] & 0x0f) << 2];
1374       *p++ = base64_pad;
1375     } else {
1376       *p++ = base64_table[(current[0] & 0x03) << 4];
1377       *p++ = base64_pad;
1378       *p++ = base64_pad;
1379     }
1380   }
1381   return p - outstr;
1382 }
1383
/**
 * Base64-decode str into outstr.
 *
 * Input is consumed until a NUL byte is read or `length` bytes have been
 * processed. Tab, LF, CR and space are always skipped. Bytes outside the
 * base64 alphabet are skipped in non-strict mode and make the call fail in
 * strict mode.
 *
 * outstr must be at least length bytes.
 *
 * @return the number of decoded bytes written to outstr, or -1 on failure
 */
ssize_t php_base64_decode(const char *str, int length, bool strict,
                         unsigned char* outstr) {
  const unsigned char *current = (unsigned char*)str;
  /* i counts accepted alphabet characters, j is the output write index */
  int ch, i = 0, j = 0, k;
  /* this sucks for threaded environments */

  unsigned char* result = outstr;

  /* run through the whole string, converting as we go */
  /* Note the byte is read before length is tested, so the byte at
   * str[length] is read once the count runs out. */
  while ((ch = *current++) != '\0' && length-- > 0) {
    if (ch == base64_pad) {
      /* A pad that is not followed by another pad is suspicious when it
       * comes right after the first character of a group, or (in strict
       * mode) when more input follows. */
      if (*current != '=' && ((i % 4) == 1 || (strict && length > 0))) {
        if ((i % 4) != 1) {
          /* Tolerate the pad if only whitespace follows it. */
          while (isspace(*(++current))) {
            continue;
          }
          if (*current == '\0') {
            /* Back to the outer loop, which then reads the NUL and stops. */
            continue;
          }
        }
        return -1;
      }
      continue;
    }

    ch = base64_reverse_table[ch];
    if ((!strict && ch < 0) || ch == -1) {
      /* a space or some other separator character, we simply skip over */
      continue;
    } else if (ch == -2) {
      return -1;
    }

    /* Pack four 6-bit values into three output bytes. */
    switch(i % 4) {
    case 0:
      result[j] = ch << 2;
      break;
    case 1:
      result[j++] |= ch >> 4;
      result[j] = (ch & 0x0f) << 4;
      break;
    case 2:
      result[j++] |= ch >>2;
      result[j] = (ch & 0x03) << 6;
      break;
    case 3:
      result[j++] |= ch;
      break;
    }
    i++;
  }

  k = j;
  /* mop things up if we ended on a boundary */
  /* ch holds the last byte read by the loop condition above. */
  if (ch == base64_pad) {
    switch(i % 4) {
    case 1:
      return -1;
    case 2:
      k++;
      /* fall-through */
    case 3:
      result[k] = 0;
    }
  }
  return j;
}
1451
1452 }
1453
1454 String string_base64_encode(const char* input, int len) {
1455   if (auto const wantedSize = maxEncodedSize(len)) {
1456     String ret(*wantedSize, ReserveString);
1457     auto actualSize = php_base64_encode((unsigned char*)input, len,
1458                                         (unsigned char*)ret.mutableData());
1459     ret.setSize(actualSize);
1460     return ret;
1461   }
1462   return String();
1463 }
1464
1465 String string_base64_decode(const char* input, int len, bool strict) {
1466   String ret(len, ReserveString);
1467   auto actualSize = php_base64_decode(input, len, strict,
1468                                       (unsigned char*)ret.mutableData());
1469   if (actualSize < 0) return String();
1470
1471   ret.setSize(actualSize);
1472   return ret;
1473 }
1474
1475 std::string base64_encode(const char* input, int len) {
1476   if (auto const wantedSize = maxEncodedSize(len)) {
1477     std::string ret;
1478     ret.resize(*wantedSize);
1479     auto actualSize = php_base64_encode((unsigned char*)input, len,
1480                                         (unsigned char*)ret.data());
1481     ret.resize(actualSize);
1482     return ret;
1483   }
1484   return std::string();
1485 }
1486
1487 std::string base64_decode(const char* input, int len, bool strict) {
1488   if (!len) return std::string();
1489   std::string ret;
1490   ret.resize(len);
1491   auto actualSize = php_base64_decode(input, len, strict,
1492                                       (unsigned char*)ret.data());
1493   if (!actualSize) return std::string();
1494
1495   ret.resize(actualSize);
1496   return ret;
1497 }
1498
1499 ///////////////////////////////////////////////////////////////////////////////
1500
1501 String string_escape_shell_arg(const char *str) {
1502   int x, y, l;
1503   char *cmd;
1504
1505   y = 0;
1506   l = strlen(str);
1507
1508   String ret(safe_address(l, 4, 3), ReserveString); /* worst case */
1509   cmd = ret.mutableData();
1510
1511 #ifdef _MSC_VER
1512   cmd[y++] = '"';
1513 #else
1514   cmd[y++] = '\'';
1515 #endif
1516
1517   for (x = 0; x < l; x++) {
1518     switch (str[x]) {
1519 #ifdef _MSC_VER
1520     case '"':
1521     case '%':
1522     case '!':
1523       cmd[y++] = ' ';
1524       break;
1525 #else
1526     case '\'':
1527       cmd[y++] = '\'';
1528       cmd[y++] = '\\';
1529       cmd[y++] = '\'';
1530 #endif
1531       /* fall-through */
1532     default:
1533       cmd[y++] = str[x];
1534     }
1535   }
1536 #ifdef _MSC_VER
1537   if (y > 0 && '\\' == cmd[y - 1]) {
1538     int k = 0, n = y - 1;
1539     for (; n >= 0 && '\\' == cmd[n]; n--, k++);
1540     if (k % 2) {
1541       cmd[y++] = '\\';
1542     }
1543   }
1544
1545   cmd[y++] = '"';
1546 #else
1547   cmd[y++] = '\'';
1548 #endif
1549   ret.setSize(y);
1550   return ret;
1551 }
1552
1553 String string_escape_shell_cmd(const char *str) {
1554   register int x, y, l;
1555   char *cmd;
1556   char *p = nullptr;
1557
1558   l = strlen(str);
1559   String ret(safe_address(l, 2, 1), ReserveString);
1560   cmd = ret.mutableData();
1561
1562   for (x = 0, y = 0; x < l; x++) {
1563     switch (str[x]) {
1564 #ifndef _MSC_VER
1565     case '"':
1566     case '\'':
1567       if (!p && (p = (char *)memchr(str + x + 1, str[x], l - x - 1))) {
1568         /* noop */
1569       } else if (p && *p == str[x]) {
1570         p = nullptr;
1571       } else {
1572         cmd[y++] = '\\';
1573       }
1574       cmd[y++] = str[x];
1575       break;
1576 #else
1577     /* % is Windows specific for environmental variables, ^%PATH% will
1578     output PATH while ^%PATH^% will not. escapeshellcmd->val will
1579     escape all % and !.
1580     */
1581     case '%':
1582     case '!':
1583     case '"':
1584     case '\'':
1585 #endif
1586     case '#': /* This is character-set independent */
1587     case '&':
1588     case ';':
1589     case '`':
1590     case '|':
1591     case '*':
1592     case '?':
1593     case '~':
1594     case '<':
1595     case '>':
1596     case '^':
1597     case '(':
1598     case ')':
1599     case '[':
1600     case ']':
1601     case '{':
1602     case '}':
1603     case '$':
1604     case '\\':
1605     case '\x0A': /* excluding these two */
1606     case '\xFF':
1607 #ifdef _MSC_VER
1608       cmd[y++] = '^';
1609 #else
1610       cmd[y++] = '\\';
1611 #endif
1612       /* fall-through */
1613     default:
1614       cmd[y++] = str[x];
1615     }
1616   }
1617   ret.setSize(y);
1618   return ret;
1619 }
1620
1621 ///////////////////////////////////////////////////////////////////////////////
1622
/*
 * Find the longest run of bytes common to txt1[0..len1) and txt2[0..len2).
 *
 * *max receives the run length (0 when nothing matches). When a run is
 * found, *pos1 and *pos2 receive its offsets in txt1 and txt2; otherwise
 * they are left untouched. Among equally long runs the first one met while
 * scanning txt1, then txt2, from the left is reported.
 */
static void string_similar_str(const char *txt1, int len1,
                               const char *txt2, int len2,
                               int *pos1, int *pos2, int *max) {
  *max = 0;
  for (int i = 0; i < len1; i++) {
    for (int j = 0; j < len2; j++) {
      /* length of the match starting at txt1[i] / txt2[j] */
      int run = 0;
      while (i + run < len1 && j + run < len2 &&
             txt1[i + run] == txt2[j + run]) {
        run++;
      }
      if (run > *max) {
        *max = run;
        *pos1 = i;
        *pos2 = j;
      }
    }
  }
}
1643
1644 static int string_similar_char(const char *txt1, int len1,
1645                                const char *txt2, int len2) {
1646   int sum;
1647   int pos1 = 0, pos2 = 0, max;
1648
1649   string_similar_str(txt1, len1, txt2, len2, &pos1, &pos2, &max);
1650   if ((sum = max)) {
1651     if (pos1 && pos2) {
1652       sum += string_similar_char(txt1, pos1, txt2, pos2);
1653     }
1654     if ((pos1 + max < len1) && (pos2 + max < len2)) {
1655       sum += string_similar_char(txt1 + pos1 + max, len1 - pos1 - max,
1656                                  txt2 + pos2 + max, len2 - pos2 - max);
1657     }
1658   }
1659
1660   return sum;
1661 }
1662
1663 int string_similar_text(const char *t1, int len1,
1664                         const char *t2, int len2, float *percent) {
1665   if (len1 == 0 && len2 == 0) {
1666     if (percent) *percent = 0.0;
1667     return 0;
1668   }
1669
1670   int sim = string_similar_char(t1, len1, t2, len2);
1671   if (percent) *percent = sim * 200.0 / (len1 + len2);
1672   return sim;
1673 }
1674
1675 ///////////////////////////////////////////////////////////////////////////////
1676
1677 #define LEVENSHTEIN_MAX_LENTH 255
1678
1679 // reference implementation, only optimized for memory usage, not speed
1680 int string_levenshtein(const char *s1, int l1, const char *s2, int l2,
1681                        int cost_ins, int cost_rep, int cost_del ) {
1682   int *p1, *p2, *tmp;
1683   int i1, i2, c0, c1, c2;
1684
1685   if (l1==0) return l2*cost_ins;
1686   if (l2==0) return l1*cost_del;
1687
1688   if ((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) {
1689     raise_warning("levenshtein(): Argument string(s) too long");
1690     return -1;
1691   }
1692
1693   p1 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
1694   SCOPE_EXIT { req::free(p1); };
1695   p2 = (int*)req::malloc_noptrs((l2+1) * sizeof(int));
1696   SCOPE_EXIT { req::free(p2); };
1697
1698   for(i2=0;i2<=l2;i2++) {
1699     p1[i2] = i2*cost_ins;
1700   }
1701
1702   for(i1=0;i1<l1;i1++) {
1703     p2[0]=p1[0]+cost_del;
1704     for(i2=0;i2<l2;i2++) {
1705       c0=p1[i2]+((s1[i1]==s2[i2])?0:cost_rep);
1706       c1=p1[i2+1]+cost_del; if (c1<c0) c0=c1;
1707       c2=p2[i2]+cost_ins; if (c2<c0) c0=c2;
1708       p2[i2+1]=c0;
1709     }
1710     tmp=p1; p1=p2; p2=tmp;
1711   }
1712
1713   c0=p1[l2];
1714   return c0;
1715 }
1716
1717 ///////////////////////////////////////////////////////////////////////////////
1718
1719 String string_money_format(const char *format, double value) {
1720   bool check = false;
1721   const char *p = format;
1722   while ((p = strchr(p, '%'))) {
1723     if (*(p + 1) == '%') {
1724       p += 2;
1725     } else if (!check) {
1726       check = true;
1727       p++;
1728     } else {
1729       throw_invalid_argument
1730         ("format: Only a single %%i or %%n token can be used");
1731       return String();
1732     }
1733   }
1734
1735   int format_len = strlen(format);
1736   int str_len = safe_address(format_len, 1, 1024);
1737   String ret(str_len, ReserveString);
1738   char *str = ret.mutableData();
1739   if ((str_len = strfmon(str, str_len, format, value)) < 0) {
1740     return String();
1741   }
1742   ret.setSize(str_len);
1743   return ret;
1744 }
1745
1746 ///////////////////////////////////////////////////////////////////////////////
1747
/**
 * Format d with `dec` decimals, grouping the integral digits in threes.
 *
 * The number is rounded with php_math_round() and printed with "%.*F".
 * The result is then assembled right to left: decimals first, then
 * dec_point, then the integral digits with thousand_sep between groups of
 * three, and finally a '-' if d was negative.
 *
 * If the printed text does not start with a digit it is returned as
 * printed, without grouping and without a sign.
 *
 * @param d             value to format
 * @param dec           number of decimals; negative values are treated as 0
 * @param dec_point     text placed before the decimals (may be empty)
 * @param thousand_sep  text placed between digit groups (may be empty)
 */
String string_number_format(double d, int dec,
                            const String& dec_point,
                            const String& thousand_sep) {
  char *tmpbuf = nullptr, *resbuf;
  char *s, *t;  /* source, target */
  char *dp;     /* position of the decimal point in tmpbuf, if any */
  int integral; /* length of the integral part, separators included */
  int tmplen, reslen=0;
  int count=0;  /* integral digits copied so far */
  int is_negative=0;

  /* Work on the magnitude; the sign is added back at the very end. */
  if (d < 0) {
    is_negative = 1;
    d = -d;
  }

  if (dec < 0) dec = 0;
  d = php_math_round(d, dec);

  // departure from PHP: we got rid of dependencies on spprintf() here.
  String tmpstr(63, ReserveString);
  tmpbuf = tmpstr.mutableData();
  tmplen = snprintf(tmpbuf, 64, "%.*F", dec, d);
  if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
    tmpstr.setSize(tmplen);
    return tmpstr;
  }
  if (tmplen >= 64) {
    // Uncommon, asked for more than 64 chars worth of precision
    // snprintf reported the full length it needs; print again into a
    // buffer of that size.
    tmpstr = String(tmplen, ReserveString);
    tmpbuf = tmpstr.mutableData();
    tmplen = snprintf(tmpbuf, tmplen + 1, "%.*F", dec, d);
    if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
      tmpstr.setSize(tmplen);
      return tmpstr;
    }
  }

  /* find decimal point, if expected */
  if (dec) {
    dp = strpbrk(tmpbuf, ".,");
  } else {
    dp = nullptr;
  }

  /* calculate the length of the return buffer */
  if (dp) {
    integral = dp - tmpbuf;
  } else {
    /* no decimal point was found */
    integral = tmplen;
  }

  /* allow for thousand separators */
  if (!thousand_sep.empty()) {
    if (integral + thousand_sep.size() * ((integral-1) / 3) < integral) {
      /* overflow */
      raise_error("String overflow");
    }

    integral += ((integral-1) / 3) * thousand_sep.size();
  }

  reslen = integral;

  if (dec) {
    reslen += dec;

    if (!dec_point.empty()) {
      if (reslen + dec_point.size() < dec_point.size()) {
        /* overflow */
        raise_error("String overflow");
      }
      reslen += dec_point.size();
    }
  }

  /* add a byte for minus sign */
  if (is_negative) {
    reslen++;
  }
  String resstr(reslen, ReserveString);
  resbuf = resstr.mutableData();

  /* Both cursors start on the last character and move towards the front. */
  s = tmpbuf+tmplen-1;
  t = resbuf+reslen-1;

  /* copy the decimal places.
   * Take care, as the sprintf implementation may return less places than
   * we requested due to internal buffer limitations */
  if (dec) {
    int declen = dp ? s - dp : 0;
    int topad = dec > declen ? dec - declen : 0;

    /* pad with '0's */
    while (topad--) {
      *t-- = '0';
    }

    if (dp) {
      s -= declen + 1; /* +1 to skip the point */
      t -= declen;

      /* now copy the chars after the point */
      memcpy(t + 1, dp + 1, declen);
    }

    /* add decimal point */
    if (!dec_point.empty()) {
      memcpy(t + (1 - dec_point.size()), dec_point.data(), dec_point.size());
      t -= dec_point.size();
    }
  }

  /* copy the numbers before the decimal point, adding thousand
   * separator every three digits */
  /* NOTE(review): the size computation above tests !thousand_sep.empty(),
   * while this loop tests thousand_sep in a boolean context; confirm the
   * two conditions agree for an empty separator. */
  while(s >= tmpbuf) {
    *t-- = *s--;
    if (thousand_sep && (++count%3)==0 && s>=tmpbuf) {
      memcpy(t + (1 - thousand_sep.size()),
             thousand_sep.data(),
             thousand_sep.size());
      t -= thousand_sep.size();
    }
  }

  /* and a minus sign, if needed */
  if (is_negative) {
    *t-- = '-';
  }

  resstr.setSize(reslen);
  return resstr;
}
1882
1883 ///////////////////////////////////////////////////////////////////////////////
1884 // soundex
1885
1886 /* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */
1887 String string_soundex(const String& str) {
1888   assertx(!str.empty());
1889   int _small, code, last;
1890   String retString(4, ReserveString);
1891   char* soundex = retString.mutableData();
1892
1893   static char soundex_table[26] = {
1894     0,              /* A */
1895     '1',            /* B */
1896     '2',            /* C */
1897     '3',            /* D */
1898     0,              /* E */
1899     '1',            /* F */
1900     '2',            /* G */
1901     0,              /* H */
1902     0,              /* I */
1903     '2',            /* J */
1904     '2',            /* K */
1905     '4',            /* L */
1906     '5',            /* M */
1907     '5',            /* N */
1908     0,              /* O */
1909     '1',            /* P */
1910     '2',            /* Q */
1911     '6',            /* R */
1912     '2',            /* S */
1913     '3',            /* T */
1914     0,              /* U */
1915     '1',            /* V */
1916     0,              /* W */
1917     '2',            /* X */
1918     0,              /* Y */
1919     '2'             /* Z */
1920   };
1921
1922   /* build soundex string */
1923   last = -1;
1924   auto p = str.slice().data();
1925   for (_small = 0; *p && _small < 4; p++) {
1926     /* convert chars to upper case and strip non-letter chars */
1927     /* BUG: should also map here accented letters used in non */
1928     /* English words or names (also found in English text!): */
1929     /* esstsett, thorn, n-tilde, c-cedilla, s-caron, ... */
1930     code = toupper((int)(unsigned char)(*p));
1931     if (code >= 'A' && code <= 'Z') {
1932       if (_small == 0) {
1933         /* remember first valid char */
1934         soundex[_small++] = code;
1935         last = soundex_table[code - 'A'];
1936       } else {
1937         /* ignore sequences of consonants with same soundex */
1938         /* code in trail, and vowels unless they separate */
1939         /* consonant letters */
1940         code = soundex_table[code - 'A'];
1941         if (code != last) {
1942           if (code != 0) {
1943             soundex[_small++] = code;
1944           }
1945           last = code;
1946         }
1947       }
1948     }
1949   }
1950   /* pad with '0' and terminate with 0 ;-) */
1951   while (_small < 4) {
1952     soundex[_small++] = '0';
1953   }
1954   retString.setSize(4);
1955   return retString;
1956 }
1957
1958 ///////////////////////////////////////////////////////////////////////////////
1959 // metaphone
1960
1961 /**
1962  * this is now the original code by Michael G Schwern:
1963  * i've changed it just a slightly bit (use emalloc,
1964  * get rid of includes etc)
1965  * - thies - 13.09.1999
1966  */
1967
1968 /*-----------------------------  */
1969 /* this used to be "metaphone.h" */
1970 /*-----------------------------  */
1971
/* Special encodings: output characters emitted in place of a letter pair.
 * SH is what the metaphone code emits for the 'sh' sound; TH presumably
 * plays the same role for 'th' (note it is the digit zero, not the
 * letter O). */
#define  SH   'X'
#define  TH   '0'
1975
1976 /*-----------------------------  */
1977 /* end of "metaphone.h"          */
1978 /*-----------------------------  */
1979
1980 /*----------------------------- */
1981 /* this used to be "metachar.h" */
1982 /*----------------------------- */
1983
1984 /* Metachar.h ... little bits about characters for metaphone */
1985 /*-- Character encoding array & accessing macros --*/
1986 /* Stolen directly out of the book... */
/* Flag bits for each letter, indexed by (uppercase letter - 'A'):
 *    1  vowel                           (AEIOU)
 *    2  passed through unchanged        (FJLMNR)
 *    4  forms a diphthong before H      (CGPST)
 *    8  makes C and G soft              (EIY)
 *   16  prevents GH from becoming F     (BDH)
 */
char _codes[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};

/* Look up the flag bits for one byte; anything that does not upper-case to
 * A-Z yields 0. The byte is taken as unsigned char because <ctype.h>
 * functions are undefined for negative values, and the explicit range check
 * keeps a locale-specific letter from indexing outside _codes. */
static inline int metaphone_encode(unsigned char c) {
  const int upper = toupper(c);
  return (upper >= 'A' && upper <= 'Z') ? _codes[upper - 'A'] : 0;
}

#define ENCODE(c) (metaphone_encode((unsigned char)(c)))

#define isvowel(c)  (ENCODE(c) & 1)    /* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)    /* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)    /* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)    /* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)  /* BDH */
2004
2005 /*----------------------------- */
2006 /* end of "metachar.h"          */
2007 /*----------------------------- */
2008
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* The macros below expect `word` (the input bytes) and `w_idx` (the current
 * index into word) to be in scope wherever they are expanded. */

/* Look at the next letter in the word */
#define Next_Letter ((char)toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter ((char)toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would be before the start. */
#define Look_Back_Letter(n)  (w_idx >= n ? (char)toupper(word[w_idx-n]) : '\0')
/* Previous letter.  I dunno, should this return null on failure? */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string:
 * word[w_idx+2] is only read when the next letter is not the terminator. */
#define After_Next_Letter  (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) \
                           : '\0')
/* Upper-cased letter n positions ahead, as found by Lookahead() */
#define Look_Ahead_Letter(n) ((char)toupper(Lookahead(word+w_idx, n)))
2024
/* Allows us to safely look ahead an arbitrary # of letters.
 * Walks forward from word by at most how_far bytes, stopping early at the
 * terminating NUL, and returns the byte it ends up on. So the result is
 * '\0' whenever the string is shorter than how_far. */
static char Lookahead(unsigned char *word, int how_far) {
  const unsigned char *cursor = word;
  int steps_left = how_far;

  /* Edge forward in the string... */
  while (*cursor != '\0' && steps_left > 0) {
    cursor++;
    steps_left--;
  }

  return (char)*cursor;
}
2038
/* phonize one letter: append it to the output.
 * Expects an object named `buffer` with an append() method to be in scope
 * at the expansion site; no explicit resizing is done here. */
#define Phonize(c)  { buffer.append(c); }
/* How long is the phoned word? */
#define Phone_Len  (buffer.size())

/* Note if a letter is a 'break' in the word (anything non-alphabetic) */
#define Isbreak(c)  (!isalpha(c))
2049
2050 String string_metaphone(const char *input, int word_len, long max_phonemes,
2051                         int traditional) {
2052   unsigned char *word = (unsigned char *)input;
2053
2054   int w_idx = 0;        /* point in the phonization we're at. */
2055   int max_buffer_len = 0;    /* maximum length of the destination buffer */
2056
2057   /*-- Parameter checks --*/
2058   /* Negative phoneme length is meaningless */
2059
2060   if (max_phonemes < 0)
2061     return String();
2062
2063   /* Empty/null string is meaningless */
2064   /* Overly paranoid */
2065   /* always_assert(word != NULL && word[0] != '\0'); */
2066
2067   if (word == nullptr)
2068     return String();
2069
2070   /*-- Allocate memory for our phoned_phrase --*/
2071   if (max_phonemes == 0) {  /* Assume largest possible */
2072     max_buffer_len = word_len;
2073   } else {
2074     max_buffer_len = max_phonemes;
2075   }
2076   StringBuffer buffer(max_buffer_len);
2077
2078   /*-- The first phoneme has to be processed specially. --*/
2079   /* Find our first letter */
2080   for (; !isalpha(Curr_Letter); w_idx++) {
2081     /* On the off chance we were given nothing but crap... */
2082     if (Curr_Letter == '\0') {
2083       return buffer.detach();  /* For testing */
2084     }
2085   }
2086
2087   switch (Curr_Letter) {
2088     /* AE becomes E */
2089   case 'A':
2090     if (Next_Letter == 'E') {
2091       Phonize('E');
2092       w_idx += 2;
2093     }
2094     /* Remember, preserve vowels at the beginning */
2095     else {
2096       Phonize('A');
2097       w_idx++;
2098     }
2099     break;
2100     /* [GKP]N becomes N */
2101   case 'G':
2102   case 'K':
2103   case 'P':
2104     if (Next_Letter == 'N') {
2105       Phonize('N');
2106       w_idx += 2;
2107     }
2108     break;
2109     /* WH becomes H,
2110        WR becomes R
2111        W if followed by a vowel */
2112   case 'W':
2113     if (Next_Letter == 'H' ||
2114       Next_Letter == 'R') {
2115       Phonize(Next_Letter);
2116       w_idx += 2;
2117     } else if (isvowel(Next_Letter)) {
2118       Phonize('W');
2119       w_idx += 2;
2120     }
2121     /* else ignore */
2122     break;
2123     /* X becomes S */
2124   case 'X':
2125     Phonize('S');
2126     w_idx++;
2127     break;
2128     /* Vowels are kept */
2129     /* We did A already
2130        case 'A':
2131        case 'a':
2132      */
2133   case 'E':
2134   case 'I':
2135   case 'O':
2136   case 'U':
2137     Phonize(Curr_Letter);
2138     w_idx++;
2139     break;
2140   default:
2141     /* do nothing */
2142     break;
2143   }
2144
2145   /* On to the metaphoning */
2146   for (; Curr_Letter != '\0' &&
2147          (max_phonemes == 0 || Phone_Len < max_phonemes);
2148        w_idx++) {
2149     /* How many letters to skip because an eariler encoding handled
2150      * multiple letters */
2151     unsigned short int skip_letter = 0;
2152
2153
2154     /* THOUGHT:  It would be nice if, rather than having things like...
2155      * well, SCI.  For SCI you encode the S, then have to remember
2156      * to skip the C.  So the phonome SCI invades both S and C.  It would
2157      * be better, IMHO, to skip the C from the S part of the encoding.
2158      * Hell, I'm trying it.
2159      */
2160
2161     /* Ignore non-alphas */
2162     if (!isalpha(Curr_Letter))
2163       continue;
2164
2165     /* Drop duplicates, except CC */
2166     if (Curr_Letter == Prev_Letter &&
2167       Curr_Letter != 'C')
2168       continue;
2169
2170     switch (Curr_Letter) {
2171       /* B -> B unless in MB */
2172     case 'B':
2173       if (Prev_Letter != 'M')
2174         Phonize('B');
2175       break;
2176       /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
2177        * (SCHW is handled in S)
2178        *  S if -CI-, -CE- or -CY-
2179        *  dropped if -SCI-, SCE-, -SCY- (handed in S)
2180        *  else K
2181        */
2182     case 'C':
2183       if (MAKESOFT(Next_Letter)) {  /* C[IEY] */
2184         if (After_Next_Letter == 'A' &&
2185           Next_Letter == 'I') {  /* CIA */
2186           Phonize(SH);
2187         }
2188         /* SC[IEY] */
2189         else if (Prev_Letter == 'S') {
2190           /* Dropped */
2191         } else {
2192           Phonize('S');
2193         }
2194       } else if (Next_Letter == 'H') {
2195         if ((!traditional) && (After_Next_Letter == 'R' ||
2196                                Prev_Letter == 'S')) {  /* Christ, School */
2197           Phonize('K');
2198         } else {
2199           Phonize(SH);
2200         }
2201         skip_letter++;
2202       } else {
2203         Phonize('K');
2204       }
2205       break;
2206       /* J if in -DGE-, -DGI- or -DGY-
2207        * else T
2208        */
2209     case 'D':
2210       if (Next_Letter == 'G' && MAKESOFT(After_Next_Letter)) {
2211         Phonize('J');
2212         skip_letter++;
2213       } else
2214         Phonize('T');
2215       break;
2216       /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
2217        * else dropped if -GNED, -GN,
2218        * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
2219        * else J if in -GE-, -GI, -GY and not GG
2220        * else K
2221        */
2222     case 'G':
2223       if (Next_Letter == 'H') {
2224         if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) {
2225           Phonize('F');
2226           skip_letter++;
2227         } else {
2228           /* silent */
2229         }
2230       } else if (Next_Letter == 'N') {
2231         if (Isbreak(After_Next_Letter) ||
2232             (After_Next_Letter == 'E' && Look_Ahead_Letter(3) == 'D')) {
2233           /* dropped */
2234         } else
2235           Phonize('K');
2236       } else if (MAKESOFT(Next_Letter) && Prev_Letter != 'G') {
2237         Phonize('J');
2238       } else {
2239         Phonize('K');
2240       }
2241       break;
2242       /* H if before a vowel and not after C,G,P,S,T */
2243     case 'H':
2244       if (isvowel(Next_Letter) && !AFFECTH(Prev_Letter))
2245         Phonize('H');
2246       break;
2247       /* dropped if after C
2248        * else K
2249        */
2250     case 'K':
2251       if (Prev_Letter != 'C')
2252         Phonize('K');
2253       break;
2254       /* F if before H
2255        * else P
2256        */
2257     case 'P':
2258       if (Next_Letter == 'H') {
2259         Phonize('F');
2260       } else {
2261         Phonize('P');
2262       }
2263       break;
2264       /* K
2265        */
2266     case 'Q':
2267       Phonize('K');
2268       break;
2269       /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
2270        * else S
2271        */
2272     case 'S':
2273       if (Next_Letter == 'I' &&
2274           (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
2275         Phonize(SH);
2276       } else if (Next_Letter == 'H') {
2277         Phonize(SH);
2278         skip_letter++;
2279       } else if ((!traditional) &&
2280                  (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' &&
2281                   Look_Ahead_Letter(3) == 'W')) {
2282         Phonize(SH);
2283         skip_letter += 2;
2284       } else {
2285         Phonize('S');
2286       }
2287       break;
2288       /* 'sh' in -TIA- or -TIO-
2289        * else 'th' before H
2290        * else T
2291        */
2292     case 'T':
2293       if (Next_Letter == 'I' &&
2294         (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
2295         Phonize(SH);
2296       } else if (Next_Letter == 'H') {
2297         Phonize(TH);
2298         skip_letter++;
2299       } else {
2300         Phonize('T');
2301       }
2302       break;
2303       /* F */
2304     case 'V':
2305       Phonize('F');
2306       break;
2307       /* W before a vowel, else dropped */
2308     case 'W':
2309       if (isvowel(Next_Letter))
2310         Phonize('W');
2311       break;
2312       /* KS */
2313     case 'X':
2314       Phonize('K');
2315       Phonize('S');
2316       break;
2317       /* Y if followed by a vowel */
2318     case 'Y':
2319       if (isvowel(Next_Letter))
2320         Phonize('Y');
2321       break;
2322       /* S */
2323     case 'Z':
2324       Phonize('S');
2325       break;
2326       /* No transformation */
2327     case 'F':
2328     case 'J':
2329     case 'L':
2330     case 'M':
2331     case 'N':
2332     case 'R':
2333       Phonize(Curr_Letter);
2334       break;
2335     default:
2336       /* nothing */
2337       break;
2338     } /* END SWITCH */
2339
2340     w_idx += skip_letter;
2341   } /* END FOR */
2342
2343   return buffer.detach();
2344 }
2345
2346 ///////////////////////////////////////////////////////////////////////////////
2347 // Cyrillic
2348
2349 /**
2350  * Code tables for the supported Cyrillic charsets (relative to koi8-r).
2351  * Each table holds two 256-entry maps, each indexed by a byte value 0-255.
2352  * Entries 0-255 convert from the corresponding charset to koi8-r,
2353  * entries 256-511 do the reverse conversion, from koi8-r to the charset.
2354  *
2355  * Here we have the following tables:
2356  * _cyr_win1251   - for windows-1251 charset
2357  * _cyr_iso88595  - for iso8859-5 charset
2358  * _cyr_cp866     - for x-cp866 charset
2359  * _cyr_mac       - for x-mac-cyrillic charset
2360  */
2361 typedef unsigned char _cyr_charset_table[512];
2362
2363 static const _cyr_charset_table _cyr_win1251 = { /* windows-1251 <-> koi8-r */
2364   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* 0-255: windows-1251 -> koi8-r */
2365   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2366   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2367   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2368   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2369   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2370   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2371   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2372   46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2373   46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2374   154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
2375   46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
2376   225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2377   242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2378   193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2379   210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2380   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* 256-511: koi8-r -> windows-1251 */
2381   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2382   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2383   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2384   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2385   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2386   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2387   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2388   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2389   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2390   32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
2391   32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
2392   254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2393   239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2394   222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
2395   207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
2396 };
2397
2398 static const _cyr_charset_table _cyr_cp866 = { /* x-cp866 <-> koi8-r */
2399   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* 0-255: x-cp866 -> koi8-r */
2400   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2401   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2402   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2403   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2404   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2405   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2406   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2407   225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2408   242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2409   193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2410   35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
2411   43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
2412   45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
2413   210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2414   179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
2415   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* 256-511: koi8-r -> x-cp866 */
2416   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2417   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2418   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2419   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2420   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2421   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2422   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2423   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2424   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2425   205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
2426   199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
2427   238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
2428   175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
2429   158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2430   143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2431 };
2432
2433 static const _cyr_charset_table _cyr_iso88595 = { /* iso8859-5 <-> koi8-r */
2434   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* 0-255: iso8859-5 -> koi8-r */
2435   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2436   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2437   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2438   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2439   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2440   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2441   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2442   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2443   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2444   32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2445   225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2446   242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2447   193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2448   210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2449   32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2450   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* 256-511: koi8-r -> iso8859-5 */
2451   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2452   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2453   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2454   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2455   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2456   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2457   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2458   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2459   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2460   32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
2461   32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
2462   238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
2463   223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
2464   206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
2465   191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
2466 };
2467
2468 static const _cyr_charset_table _cyr_mac = { /* x-mac-cyrillic <-> koi8-r */
2469   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* 0-255: x-mac-cyrillic -> koi8-r */
2470   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2471   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2472   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2473   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2474   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2475   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2476   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2477   225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2478   242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2479   160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2480   176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2481   128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2482   144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
2483   193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2484   210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
2485   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, /* 256-511: koi8-r -> x-mac-cyrillic */
2486   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2487   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2488   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2489   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2490   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2491   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2492   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2493   192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2494   208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2495   160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
2496   176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
2497   254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2498   239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2499   158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2500   143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2501 };
2502
2503 /**
2504  * This is the function that performs real in-place conversion of the string
2505  * between charsets.
2506  * Parameters:
2507  *    str - string to be converted
2508  *    from,to - one-symbol label of source and destination charset
2509  * The following symbols are used as labels:
2510  *    k - koi8-r
2511  *    w - windows-1251
2512  *    i - iso8859-5
2513  *    a - x-cp866
2514  *    d - x-cp866
2515  *    m - x-mac-cyrillic
2516  */
2517 String string_convert_cyrillic_string(const String& input, char from, char to) {
2518   const unsigned char *from_table, *to_table;
2519   unsigned char tmp;
2520   auto uinput = (unsigned char*)input.slice().data();
2521   String retString(input.size(), ReserveString);
2522   unsigned char *str = (unsigned char *)retString.mutableData();
2523
2524   from_table = nullptr;
2525   to_table   = nullptr;
2526
2527   switch (toupper((int)(unsigned char)from)) {
2528   case 'W': from_table = _cyr_win1251;  break;
2529   case 'A':
2530   case 'D': from_table = _cyr_cp866;    break;
2531   case 'I': from_table = _cyr_iso88595; break;
2532   case 'M': from_table = _cyr_mac;      break;
2533   case 'K':
2534     break;
2535   default:
2536     throw_invalid_argument("Unknown source charset: %c", from);
2537     break;
2538   }
2539
2540   switch (toupper((int)(unsigned char)to)) {
2541   case 'W': to_table = _cyr_win1251;    break;
2542   case 'A':
2543   case 'D': to_table = _cyr_cp866;      break;
2544   case 'I': to_table = _cyr_iso88595;   break;
2545   case 'M': to_table = _cyr_mac;        break;
2546   case 'K':
2547     break;
2548   default:
2549     throw_invalid_argument("Unknown destination charset: %c", to);
2550     break;
2551   }
2552
2553   for (int i = 0; i < input.size(); i++) {
2554     tmp = from_table == nullptr ? uinput[i] : from_table[uinput[i]];
2555     str[i] = to_table == nullptr ? tmp : to_table[tmp + 256];
2556   }
2557   retString.setSize(input.size());
2558   return retString;
2559 }
2560
2561 ///////////////////////////////////////////////////////////////////////////////
2562 // Hebrew
2563
/* Tags for the two kinds of text run handled by the Hebrew conversion:
 * a run that starts on a non-Hebrew byte, or one that starts on a Hebrew
 * byte. */
#define HEB_BLOCK_TYPE_ENG 1
#define HEB_BLOCK_TYPE_HEB 2

/*
 * Byte classifiers. Each one narrows its argument to unsigned char first, so
 * plain (possibly signed) char values can be passed directly, and yields 1
 * or 0. The argument is parenthesized so the cast covers the whole
 * expression. It is expanded more than once, so it must not have side
 * effects.
 */

/* 1 if c lies in the byte range 224..250, the range treated as Hebrew. */
#define isheb(c)                                                        \
  (((((unsigned char) (c)) >= 224) && (((unsigned char) (c)) <= 250)) ? 1 : 0)
/* 1 if c is a space or a horizontal tab. */
#define _isblank(c)                                                     \
  (((((unsigned char) (c)) == ' '  || ((unsigned char) (c)) == '\t')) ? 1 : 0)
/* 1 if c is a line feed or a carriage return. */
#define _isnewline(c)                                                   \
  (((((unsigned char) (c)) == '\n' || ((unsigned char) (c)) == '\r')) ? 1 : 0)
2573
2574 /**
2575  * Converts Logical Hebrew text (Hebrew Windows style) to Visual text
2576  * Cheers/complaints/flames - Zeev Suraski <zeev@php.net>
      *
      * Parameters:
      *    inStr            - text to convert; must not be empty (asserted).
      *    (unnamed int)    - line-length limit. It is ignored here: the local
      *                       max_chars is fixed at 0, so the result is only
      *                       split at newline characters already in the text.
      *    convert_newlines - when non-zero, the result is passed through
      *                       string_replace() with "\n" -> "<br />\n".
      *
      * Returns a new String. The work is done in two passes: pass 1 copies the
      * input block by block into the scratch buffer heb_str, filling it from
      * its end towards its start; pass 2 walks heb_str from its end towards
      * its start, one line at a time, and appends each line to the result.
2577  */
2578 String
2579 string_convert_hebrew_string(const String& inStr, int /*max_chars_per_line*/,
2580                              int convert_newlines) {
       /* Pass 1 below reads str[0] and writes one byte below heb_str+str_len
        * unconditionally, so an empty input is not supported. */
2581   assertx(!inStr.empty());
2582   auto str = inStr.data();
2583   auto str_len = inStr.size();
2584   const char *tmp;
2585   char *heb_str, *broken_str;
2586   char *target;
2587   int block_start, block_end, block_type, block_length, i;
2588   long max_chars=0; /* never changed: no line-length limit is applied */
2589   int begin, end, char_count, orig_begin;
2590
2591   tmp = str;
2592   block_start=block_end=0;
2593
       /* Scratch buffer for the reordered text; freed when the scope exits. */
2594   heb_str = (char *) req::malloc_noptrs(str_len + 1);
2595   SCOPE_EXIT { req::free(heb_str); };
       /* Terminate the buffer, then point target at its last data byte; from
        * here on target only moves towards the front. */
2596   target = heb_str+str_len;
2597   *target = 0;
2598   target--;
2599
2600   block_length=0; /* only ever incremented below, never read */
2601
       /* The first byte decides the type of the first block. */
2602   if (isheb(*tmp)) {
2603     block_type = HEB_BLOCK_TYPE_HEB;
2604   } else {
2605     block_type = HEB_BLOCK_TYPE_ENG;
2606   }
2607
       /* Pass 1: split the input into alternating blocks and copy each one. */
2608   do {
2609     if (block_type == HEB_BLOCK_TYPE_HEB) {
           /* Extend the block while the next byte is Hebrew, blank,
            * punctuation or '\n'. */
2610       while ((isheb((int)*(tmp+1)) ||
2611               _isblank((int)*(tmp+1)) ||
2612               ispunct((int)*(tmp+1)) ||
2613               (int)*(tmp+1)=='\n' ) && block_end<str_len-1) {
2614         tmp++;
2615         block_end++;
2616         block_length++;
2617       }
           /* Source is read front to back while target moves back to front,
            * so the block ends up reversed in heb_str. Paired brackets and
            * the two slashes are swapped for their mirror image. */
2618       for (i = block_start; i<= block_end; i++) {
2619         *target = str[i];
2620         switch (*target) {
2621         case '(':  *target = ')';  break;
2622         case ')':  *target = '(';  break;
2623         case '[':  *target = ']';  break;
2624         case ']':  *target = '[';  break;
2625         case '{':  *target = '}';  break;
2626         case '}':  *target = '{';  break;
2627         case '<':  *target = '>';  break;
2628         case '>':  *target = '<';  break;
2629         case '\\': *target = '/';  break;
2630         case '/':  *target = '\\'; break;
2631         default:
2632           break;
2633         }
2634         target--;
2635       }
2636       block_type = HEB_BLOCK_TYPE_ENG;
2637     } else {
           /* Extend the block while the next byte is neither Hebrew nor
            * '\n'. */
2638       while (!isheb(*(tmp+1)) &&
2639              (int)*(tmp+1)!='\n' && block_end < str_len-1) {
2640         tmp++;
2641         block_end++;
2642         block_length++;
2643       }
           /* Give trailing blanks and punctuation (except '/' and '-') back,
            * so that they become part of the following block. */
2644       while ((_isblank((int)*tmp) ||
2645               ispunct((int)*tmp)) && *tmp!='/' &&
2646              *tmp!='-' && block_end > block_start) {
2647         tmp--;
2648         block_end--;
2649       }
           /* Source and target both move back to front here, so the block
            * keeps its original byte order in heb_str. */
2650       for (i = block_end; i >= block_start; i--) {
2651         *target = str[i];
2652         target--;
2653       }
2654       block_type = HEB_BLOCK_TYPE_HEB;
2655     }
2656     block_start=block_end+1;
2657   } while (block_end < str_len-1);
2658
       /* Pass 2: build the result from heb_str, taking lines from its end. */
2659   String brokenStr(str_len, ReserveString);
2660   broken_str = brokenStr.mutableData();
2661   begin=end=str_len-1;
2662   target = broken_str;
2663
2664   while (1) {
2665     char_count=0;
         /* Move begin towards the front until it sits on a newline or on
          * index 0; a run of adjacent newlines is swallowed as a whole. */
2666     while ((!max_chars || char_count < max_chars) && begin > 0) {
2667       char_count++;
2668       begin--;
2669       if (begin <= 0 || _isnewline(heb_str[begin])) {
2670         while (begin > 0 && _isnewline(heb_str[begin-1])) {
2671           begin--;
2672           char_count++;
2673         }
2674         break;
2675       }
2676     }
         /* With max_chars fixed at 0 this is only entered when char_count is
          * 0, and in that case nothing inside it changes any state. */
2677     if (char_count == max_chars) { /* try to avoid breaking words */
2678       int new_char_count=char_count, new_begin=begin;
2679
2680       while (new_char_count > 0) {
2681         if (_isblank(heb_str[new_begin]) || _isnewline(heb_str[new_begin])) {
2682           break;
2683         }
2684         new_begin++;
2685         new_char_count--;
2686       }
2687       if (new_char_count > 0) {
2688         char_count=new_char_count;
2689         begin=new_begin;
2690       }
2691     }
2692     orig_begin=begin;
2693
         /* A blank at the break position is rewritten as a newline. */
2694     if (_isblank(heb_str[begin])) {
2695       heb_str[begin]='\n';
2696     }
2697     while (begin <= end && _isnewline(heb_str[begin])) {
2698       /* skip leading newlines */
2699       begin++;
2700     }
2701     for (i = begin; i <= end; i++) { /* copy content */
2702       *target = heb_str[i];
2703       target++;
2704     }
         /* Then append the newlines that preceded this line in heb_str. */
2705     for (i = orig_begin; i <= end && _isnewline(heb_str[i]); i++) {
2706       *target = heb_str[i];
2707       target++;
2708     }
2709     begin=orig_begin;
2710
         /* Reaching the front of heb_str means every line has been emitted. */
2711     if (begin <= 0) {
2712       *target = 0;
2713       break;
2714     }
         /* Continue with the part of heb_str in front of this line. */
2715     begin--;
2716     end=begin;
2717   }
2718
2719   if (convert_newlines) {
2720     int count;
2721     auto ret = string_replace(broken_str, str_len, "\n", strlen("\n"),
2722                               "<br />\n", strlen("<br />\n"), count, true);
         /* A null result falls through to the unmodified text below;
          * NOTE(review): presumably null means nothing was replaced -
          * confirm against string_replace(). */
2723     if (!ret.isNull()) {
2724       return ret;
2725     }
2726   }
2727   brokenStr.setSize(str_len);
2728   return brokenStr;
2729 }
2730
2731 ///////////////////////////////////////////////////////////////////////////////
2732 }