hphp/runtime/base/zend-string.cpp

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com)     |
   6    | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) |
   7    +----------------------------------------------------------------------+
   8    | This source file is subject to version 2.00 of the Zend license,     |
   9    | that is bundled with this package in the file LICENSE, and is        |
  10    | available through the world-wide-web at the following url:           |
  11    | http://www.zend.com/license/2_00.txt.                                |
  12    | If you did not receive a copy of the Zend license and are unable to  |
  13    | obtain it through the world-wide-web, please send a note to          |
  14    | license@zend.com so we can mail you a copy immediately.              |
  15    +----------------------------------------------------------------------+
  16 */
  17
  18 #include "hphp/runtime/base/zend-string.h"
  19 #include "hphp/runtime/base/zend-printf.h"
  20 #include "hphp/runtime/base/zend-math.h"
  21
  22 #include "hphp/util/lock.h"
  23 #include "hphp/util/overflow.h"
  24 #include <math.h>
  25 #include <monetary.h>
  26
  27 #include "hphp/util/bstring.h"
  28 #include "hphp/runtime/base/exceptions.h"
  29 #include "hphp/runtime/base/string-buffer.h"
  30 #include "hphp/runtime/base/runtime-error.h"
  31 #include "hphp/runtime/base/type-conversions.h"
  32 #include "hphp/runtime/base/string-util.h"
  33 #include "hphp/runtime/base/builtin-functions.h"
  34
   35 #ifdef __APPLE__  // Apple builds only: supply isnan/isinf macros when none are defined yet
   36 #ifndef isnan  // pick the __inline_isnan* helper that matches sizeof(x)
   37 #define isnan(x)  \
   38   ( sizeof (x) == sizeof(float )  ? __inline_isnanf((float)(x)) \
   39   : sizeof (x) == sizeof(double)  ? __inline_isnand((double)(x))  \
   40   : __inline_isnan ((long double)(x)))
   41 #endif  // isnan
   42
   43 #ifndef isinf  // pick the __inline_isinf* helper that matches sizeof(x)
   44 #define isinf(x)  \
   45   ( sizeof (x) == sizeof(float )  ? __inline_isinff((float)(x)) \
   46   : sizeof (x) == sizeof(double)  ? __inline_isinfd((double)(x))  \
   47   : __inline_isinf ((long double)(x)))
   48 #endif  // isinf
   49 #endif  // __APPLE__
  50
  51
   52 #define PHP_QPRINT_MAXL 75  // line-length limit checked by string_quoted_printable_encode before it emits a "=\r\n" soft break
  53
  54 namespace HPHP {
  55 ///////////////////////////////////////////////////////////////////////////////
  56 // helpers
  57
  58 bool string_substr_check(int len, int &f, int &l) {
  59   if (l < 0 && -l > len) {
  60     return false;
  61   } else if (l > len) {
  62     l = len;
  63   }
  64
  65   if (f > len) {
  66     return false;
  67   } else if (f < 0 && -f > len) {
  68     f = 0;
  69   }
  70
  71   if (l < 0 && (l + len - f) < 0) {
  72     return false;
  73   }
  74
  75   // if "from" position is negative, count start position from the end
  76   if (f < 0) {
  77     f += len;
  78     if (f < 0) {
  79       f = 0;
  80     }
  81   }
  82   if (f >= len) {
  83     return false;
  84   }
  85
  86   // if "length" position is negative, set it to the length
  87   // needed to stop that many chars from the end of the string
  88   if (l < 0) {
  89     l += len - f;
  90     if (l < 0) {
  91       l = 0;
  92     }
  93   }
  94   if ((unsigned int)f + (unsigned int)l > (unsigned int)len) {
  95     l = len - f;
  96   }
  97   return true;
  98 }
  99
 100 void string_charmask(const char *sinput, int len, char *mask) {
 101   const unsigned char *input = (unsigned char *)sinput;
 102   const unsigned char *end;
 103   unsigned char c;
 104
 105   memset(mask, 0, 256);
 106   for (end = input+len; input < end; input++) {
 107     c=*input;
 108     if ((input+3 < end) && input[1] == '.' && input[2] == '.'
 109         && input[3] >= c) {
 110       memset(mask+c, 1, input[3] - c + 1);
 111       input+=3;
 112     } else if ((input+1 < end) && input[0] == '.' && input[1] == '.') {
 113       /* Error, try to be as helpful as possible:
 114          (a range ending/starting with '.' won't be captured here) */
 115       if (end-len >= input) { /* there was no 'left' char */
 116         throw_invalid_argument
 117           ("charlist: Invalid '..'-range, missing left of '..'");
 118         continue;
 119       }
 120       if (input+2 >= end) { /* there is no 'right' char */
 121         throw_invalid_argument
 122           ("charlist: Invalid '..'-range, missing right of '..'");
 123         continue;
 124       }
 125       if (input[-1] > input[2]) { /* wrong order */
 126         throw_invalid_argument
 127           ("charlist: '..'-range needs to be incrementing");
 128         continue;
 129       }
 130       /* FIXME: better error (a..b..c is the only left possibility?) */
 131       throw_invalid_argument("charlist: Invalid '..'-range");
 132       continue;
 133     } else {
 134       mask[c]=1;
 135     }
 136   }
 137 }
 138
 139 int string_copy(char *dst, const char *src, int siz) {
 140   register char *d = dst;
 141   register const char *s = src;
 142   register size_t n = siz;
 143
 144   /* Copy as many bytes as will fit */
 145   if (n != 0 && --n != 0) {
 146     do {
 147       if ((*d++ = *s++) == 0)
 148         break;
 149     } while (--n != 0);
 150   }
 151
 152   /* Not enough room in dst, add NUL and traverse rest of src */
 153   if (n == 0) {
 154     if (siz != 0)
 155       *d = '\0';    /* NUL-terminate dst */
 156     while (*s++)
 157       ;
 158   }
 159
 160   return(s - src - 1);  /* count does not include NUL */
 161 }
 162
 163 ///////////////////////////////////////////////////////////////////////////////
 164 // comparisons
 165
 166 int string_ncmp(const char *s1, const char *s2, int len) {
 167   for (int i = 0; i < len; i++) {
 168     char c1 = s1[i];
 169     char c2 = s2[i];
 170     if (c1 > c2) return 1;
 171     if (c1 < c2) return -1;
 172   }
 173   return 0;
 174 }
 175
 176 static int compare_right(char const **a, char const *aend,
 177                          char const **b, char const *bend) {
 178   int bias = 0;
 179
 180   /* The longest run of digits wins.  That aside, the greatest
 181      value wins, but we can't know that it will until we've scanned
 182      both numbers to know that they have the same magnitude, so we
 183      remember it in BIAS. */
 184   for(;; (*a)++, (*b)++) {
 185     if ((*a == aend || !isdigit((int)(unsigned char)**a)) &&
 186         (*b == bend || !isdigit((int)(unsigned char)**b)))
 187       return bias;
 188     else if (*a == aend || !isdigit((int)(unsigned char)**a))
 189       return -1;
 190     else if (*b == bend || !isdigit((int)(unsigned char)**b))
 191       return +1;
 192     else if (**a < **b) {
 193       if (!bias)
 194         bias = -1;
 195     } else if (**a > **b) {
 196       if (!bias)
 197         bias = +1;
 198     }
 199   }
 200
 201   return 0;
 202 }
 203
 204 static int compare_left(char const **a, char const *aend,
 205                         char const **b, char const *bend) {
 206   /* Compare two left-aligned numbers: the first to have a
 207      different value wins. */
 208   for(;; (*a)++, (*b)++) {
 209     if ((*a == aend || !isdigit((int)(unsigned char)**a)) &&
 210         (*b == bend || !isdigit((int)(unsigned char)**b)))
 211       return 0;
 212     else if (*a == aend || !isdigit((int)(unsigned char)**a))
 213       return -1;
 214     else if (*b == bend || !isdigit((int)(unsigned char)**b))
 215       return +1;
 216     else if (**a < **b)
 217       return -1;
 218     else if (**a > **b)
 219       return +1;
 220   }
 221
 222   return 0;
 223 }
 224
 225 int string_natural_cmp(char const *a, size_t a_len,
 226                        char const *b, size_t b_len, int fold_case) {
 227   char ca, cb;
 228   char const *ap, *bp;
 229   char const *aend = a + a_len, *bend = b + b_len;
 230   int fractional, result;
 231
 232   if (a_len == 0 || b_len == 0)
 233     return a_len - b_len;
 234
 235   ap = a;
 236   bp = b;
 237   while (1) {
 238     ca = *ap; cb = *bp;
 239
 240     /* skip over leading spaces or zeros */
 241     while (isspace((int)(unsigned char)ca))
 242       ca = *++ap;
 243
 244     while (isspace((int)(unsigned char)cb))
 245       cb = *++bp;
 246
 247     /* process run of digits */
 248     if (isdigit((int)(unsigned char)ca)  &&  isdigit((int)(unsigned char)cb)) {
 249       fractional = (ca == '0' || cb == '0');
 250
 251       if (fractional)
 252         result = compare_left(&ap, aend, &bp, bend);
 253       else
 254         result = compare_right(&ap, aend, &bp, bend);
 255
 256       if (result != 0)
 257         return result;
 258       else if (ap == aend && bp == bend)
 259         /* End of the strings. Let caller sort them out. */
 260         return 0;
 261       else {
 262         /* Keep on comparing from the current point. */
 263         ca = *ap; cb = *bp;
 264       }
 265     }
 266
 267     if (fold_case) {
 268       ca = toupper((int)(unsigned char)ca);
 269       cb = toupper((int)(unsigned char)cb);
 270     }
 271
 272     if (ca < cb)
 273       return -1;
 274     else if (ca > cb)
 275       return +1;
 276
 277     ++ap; ++bp;
 278     if (ap >= aend && bp >= bend)
 279       /* The strings compare the same.  Perhaps the caller
 280          will want to call strcmp to break the tie. */
 281       return 0;
 282     else if (ap >= aend)
 283       return -1;
 284     else if (bp >= bend)
 285       return 1;
 286   }
 287 }
 288
 289 ///////////////////////////////////////////////////////////////////////////////
 290
 291 void string_to_case(String& s, int (*tocase)(int)) {
 292   assert(!s.isNull());
 293   assert(tocase);
 294   auto data = s.mutableData();
 295   auto len = s.size();
 296   for (int i = 0; i < len; i++) {
 297     data[i] = tocase(data[i]);
 298   }
 299 }
 300
 301 ///////////////////////////////////////////////////////////////////////////////
 302
  303 #define STR_PAD_LEFT            0  // string_pad pad_type: all padding goes before the input
  304 #define STR_PAD_RIGHT           1  // string_pad pad_type: all padding goes after the input
  305 #define STR_PAD_BOTH            2  // string_pad pad_type: padding split, left side gets num_pad_chars / 2
 306
 307 String string_pad(const char *input, int len, int pad_length,
 308                   const char *pad_string, int pad_str_len,
 309                   int pad_type) {
 310   assert(input);
 311   int num_pad_chars = pad_length - len;
 312
 313   /* If resulting string turns out to be shorter than input string,
 314      we simply copy the input and return. */
 315   if (pad_length < 0 || num_pad_chars < 0) {
 316     return String(input, len, CopyString);
 317   }
 318
 319   /* Setup the padding string values if specified. */
 320   if (pad_str_len == 0) {
 321     throw_invalid_argument("pad_string: (empty)");
 322     return String();
 323   }
 324
 325   String ret(pad_length, ReserveString);
 326   char *result = ret.mutableData();
 327
 328   /* We need to figure out the left/right padding lengths. */
 329   int left_pad, right_pad;
 330   switch (pad_type) {
 331   case STR_PAD_RIGHT:
 332     left_pad = 0;
 333     right_pad = num_pad_chars;
 334     break;
 335   case STR_PAD_LEFT:
 336     left_pad = num_pad_chars;
 337     right_pad = 0;
 338     break;
 339   case STR_PAD_BOTH:
 340     left_pad = num_pad_chars / 2;
 341     right_pad = num_pad_chars - left_pad;
 342     break;
 343   default:
 344     throw_invalid_argument("pad_type: %d", pad_type);
 345     return String();
 346   }
 347
 348   /* First we pad on the left. */
 349   int result_len = 0;
 350   for (int i = 0; i < left_pad; i++) {
 351     result[result_len++] = pad_string[i % pad_str_len];
 352   }
 353
 354   /* Then we copy the input string. */
 355   memcpy(result + result_len, input, len);
 356   result_len += len;
 357
 358   /* Finally, we pad on the right. */
 359   for (int i = 0; i < right_pad; i++) {
 360     result[result_len++] = pad_string[i % pad_str_len];
 361   }
 362   ret.setSize(result_len);
 363   return ret;
 364 }
 365
 366 ///////////////////////////////////////////////////////////////////////////////
 367
 368 int string_find(const char *input, int len, char ch, int pos,
 369                 bool case_sensitive) {
 370   assert(input);
 371   if (pos < 0 || pos > len) {
 372     return -1;
 373   }
 374   const void *ptr;
 375   if (case_sensitive) {
 376     ptr = memchr(input + pos, ch, len - pos);
 377   } else {
 378     ptr = bstrcasechr(input + pos, ch, len - pos);
 379   }
 380   if (ptr != nullptr) {
 381     return (int)((const char *)ptr - input);
 382   }
 383   return -1;
 384 }
 385
 386 int string_rfind(const char *input, int len, char ch, int pos,
 387                  bool case_sensitive) {
 388   assert(input);
 389   if (pos < -len || pos > len) {
 390     return -1;
 391   }
 392   const void *ptr;
 393   if (case_sensitive) {
 394     if (pos >= 0) {
 395       ptr = memrchr(input + pos, ch, len - pos);
 396     } else {
 397       ptr = memrchr(input, ch, len + pos + 1);
 398     }
 399   } else {
 400     if (pos >= 0) {
 401       ptr = bstrrcasechr(input + pos, ch, len - pos);
 402     } else {
 403       ptr = bstrrcasechr(input, ch, len + pos + 1);
 404     }
 405   }
 406   if (ptr != nullptr) {
 407     return (int)((const char *)ptr - input);
 408   }
 409   return -1;
 410 }
 411
 412 int string_find(const char *input, int len, const char *s, int s_len,
 413                 int pos, bool case_sensitive) {
 414   assert(input);
 415   assert(s);
 416   if (!s_len || pos < 0 || pos > len) {
 417     return -1;
 418   }
 419   void *ptr;
 420   if (case_sensitive) {
 421     ptr = (void*)string_memnstr(input + pos, s, s_len, input + len);
 422   } else {
 423     ptr = bstrcasestr(input + pos, len - pos, s, s_len);
 424   }
 425   if (ptr != nullptr) {
 426     return (int)((const char *)ptr - input);
 427   }
 428   return -1;
 429 }
 430
 431 int string_rfind(const char *input, int len, const char *s, int s_len,
 432                  int pos, bool case_sensitive) {
 433   assert(input);
 434   assert(s);
 435   if (!s_len || pos < -len || pos > len) {
 436     return -1;
 437   }
 438   void *ptr;
 439   if (case_sensitive) {
 440     if (pos >= 0) {
 441       ptr = bstrrstr(input + pos, len - pos, s, s_len);
 442     } else {
 443       ptr = bstrrstr(input, len + pos + s_len, s, s_len);
 444     }
 445   } else {
 446     if (pos >= 0) {
 447       ptr = bstrrcasestr(input + pos, len - pos, s, s_len);
 448     } else {
 449       ptr = bstrrcasestr(input, len + pos + s_len, s, s_len);
 450     }
 451   }
 452   if (ptr != nullptr) {
 453     return (int)((const char *)ptr - input);
 454   }
 455   return -1;
 456 }
 457
 458 const char *string_memnstr(const char *haystack, const char *needle,
 459                            int needle_len, const char *end) {
 460   const char *p = haystack;
 461   char ne = needle[needle_len-1];
 462
 463   end -= needle_len;
 464   while (p <= end) {
 465     if ((p = (char *)memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) {
 466       if (!memcmp(needle, p, needle_len-1)) {
 467         return p;
 468       }
 469     }
 470     if (p == nullptr) {
 471       return nullptr;
 472     }
 473     p++;
 474   }
 475   return nullptr;
 476 }
 477
 478 String string_replace(const char *s, int len, int start, int length,
 479                       const char *replacement, int len_repl) {
 480   assert(s);
 481   assert(replacement);
 482   assert(len >= 0);
 483
 484   // if "start" position is negative, count start position from the end
 485   // of the string
 486   if (start < 0) {
 487     start = len + start;
 488     if (start < 0) {
 489       start = 0;
 490     }
 491   }
 492   if (start > len) {
 493     start = len;
 494   }
 495   // if "length" position is negative, set it to the length
 496   // needed to stop that many chars from the end of the string
 497   if (length < 0) {
 498     length = (len - start) + length;
 499     if (length < 0) {
 500       length = 0;
 501     }
 502   }
 503   // check if length is too large
 504   if (length > len) {
 505     length = len;
 506   }
 507   // check if the length is too large adjusting for non-zero start
 508   // Write this way instead of start + length > len to avoid overflow
 509   if (length > len - start) {
 510     length = len - start;
 511   }
 512
 513   String retString(len + len_repl - length, ReserveString);
 514   char *ret = retString.mutableData();
 515
 516   int ret_len = 0;
 517   if (start) {
 518     memcpy(ret, s, start);
 519     ret_len += start;
 520   }
 521   if (len_repl) {
 522     memcpy(ret + ret_len, replacement, len_repl);
 523     ret_len += len_repl;
 524   }
 525   len -= (start + length);
 526   if (len) {
 527     memcpy(ret + ret_len, s + start + length, len);
 528     ret_len += len;
 529   }
 530   retString.setSize(ret_len);
 531   return retString;
 532 }
 533
 534 String string_replace(const char *input, int len,
 535                       const char *search, int len_search,
 536                       const char *replacement, int len_replace,
 537                       int &count, bool case_sensitive) {
 538   assert(input);
 539   assert(search && len_search);
 540   assert(len >= 0);
 541   assert(len_search >= 0);
 542   assert(len_replace >= 0);
 543
 544   if (len == 0) {
 545     return String();
 546   }
 547
 548   req::vector<int> founds;
 549   founds.reserve(16);
 550   if (len_search == 1) {
 551     for (int pos = string_find(input, len, *search, 0, case_sensitive);
 552          pos >= 0;
 553          pos = string_find(input, len, *search, pos + len_search,
 554                            case_sensitive)) {
 555       founds.push_back(pos);
 556     }
 557   } else {
 558     for (int pos = string_find(input, len, search, len_search, 0,
 559                                case_sensitive);
 560          pos >= 0;
 561          pos = string_find(input, len, search, len_search,
 562                            pos + len_search, case_sensitive)) {
 563       founds.push_back(pos);
 564     }
 565   }
 566
 567   count = founds.size();
 568   if (count == 0) {
 569     return String(); // not found
 570   }
 571
 572   int reserve;
 573
 574   // Make sure the new size of the string wouldn't overflow int32_t. Don't
 575   // bother if the replacement wouldn't make the string longer.
 576   if (len_replace > len_search) {
 577     auto raise = [&] { raise_error("String too large"); };
 578     if (mul_overflow(len_replace - len_search, count)) {
 579       raise();
 580     }
 581     int diff = (len_replace - len_search) * count;
 582     if (add_overflow(len, diff)) {
 583       raise();
 584     }
 585     reserve = len + diff;
 586   } else {
 587     reserve = len + (len_replace - len_search) * count;
 588   }
 589
 590   String retString(reserve, ReserveString);
 591   char *ret = retString.mutableData();
 592   char *p = ret;
 593   int pos = 0; // last position in input that hasn't been copied over yet
 594   int n;
 595   for (unsigned int i = 0; i < founds.size(); i++) {
 596     n = founds[i];
 597     if (n > pos) {
 598       n -= pos;
 599       memcpy(p, input, n);
 600       p += n;
 601       input += n;
 602       pos += n;
 603     }
 604     if (len_replace) {
 605       memcpy(p, replacement, len_replace);
 606       p += len_replace;
 607     }
 608     input += len_search;
 609     pos += len_search;
 610   }
 611   n = len;
 612   if (n > pos) {
 613     n -= pos;
 614     memcpy(p, input, n);
 615     p += n;
 616   }
 617   retString.setSize(p - ret);
 618   return retString;
 619 }
 620
 621 ///////////////////////////////////////////////////////////////////////////////
 622
 623 String string_chunk_split(const char *src, int srclen, const char *end,
 624                           int endlen, int chunklen) {
 625   int chunks = srclen / chunklen; // complete chunks!
 626   int restlen = srclen - chunks * chunklen; /* srclen % chunklen */
 627
 628   String ret(
 629     safe_address(
 630       chunks + 1,
 631       endlen,
 632       srclen
 633     ),
 634     ReserveString
 635   );
 636   char *dest = ret.mutableData();
 637
 638   const char *p; char *q;
 639   const char *pMax = src + srclen - chunklen + 1;
 640   for (p = src, q = dest; p < pMax; ) {
 641     memcpy(q, p, chunklen);
 642     q += chunklen;
 643     memcpy(q, end, endlen);
 644     q += endlen;
 645     p += chunklen;
 646   }
 647
 648   if (restlen) {
 649     memcpy(q, p, restlen);
 650     q += restlen;
 651     memcpy(q, end, endlen);
 652     q += endlen;
 653   }
 654
 655   ret.setSize(q - dest);
 656   return ret;
 657 }
 658
 659 ///////////////////////////////////////////////////////////////////////////////
 660
  661 #define PHP_TAG_BUF_SIZE 1023  // initial size and growth step of the tag buffer in string_strip_tags
 662
 663 /**
 664  * Check if tag is in a set of tags
 665  *
 666  * states:
 667  *
 668  * 0 start tag
 669  * 1 first non-whitespace char seen
 670  */
 671 static int string_tag_find(const char *tag, int len, const char *set) {
 672   char c, *n;
 673   const char *t;
 674   int state=0, done=0;
 675   char *norm;
 676
 677   if (len <= 0) {
 678     return 0;
 679   }
 680
 681   norm = (char *)req::malloc(len+1);
 682
 683   n = norm;
 684   t = tag;
 685   c = tolower(*t);
 686   /*
 687     normalize the tag removing leading and trailing whitespace
 688     and turn any <a whatever...> into just <a> and any </tag>
 689     into <tag>
 690   */
 691   while (!done) {
 692     switch (c) {
 693     case '<':
 694       *(n++) = c;
 695       break;
 696     case '>':
 697       done =1;
 698       break;
 699     default:
 700       if (!isspace((int)c)) {
 701         if (state == 0) {
 702           state=1;
 703         }
 704         if (c != '/') {
 705           *(n++) = c;
 706         }
 707       } else {
 708         if (state == 1)
 709           done=1;
 710       }
 711       break;
 712     }
 713     c = tolower(*(++t));
 714   }
 715   *(n++) = '>';
 716   *n = '\0';
 717   if (strstr(set, norm)) {
 718     done=1;
 719   } else {
 720     done=0;
 721   }
 722   req::free(norm);
 723   return done;
 724 }
 725
 726 /**
 727  * A simple little state-machine to strip out html and php tags
 728  *
 729  * State 0 is the output state, State 1 means we are inside a
 730  * normal html tag and state 2 means we are inside a php tag.
 731  *
 732  * The state variable is passed in to allow a function like fgetss
 733  * to maintain state across calls to the function.
 734  *
 735  * lc holds the last significant character read and br is a bracket
 736  * counter.
 737  *
 738  * When an allow string is passed in we keep track of the string
 739  * in state 1 and when the tag is closed check it against the
 740  * allow string to see if we should allow it.
 741
 742  * swm: Added ability to strip <?xml tags without assuming it PHP
 743  * code.
 744  */
 745 String string_strip_tags(const char *s, const int len,
 746                          const char *allow, const int allow_len,
 747                          bool allow_tag_spaces) {
 748   const char *abuf, *p;
 749   char *rbuf, *tbuf, *tp, *rp, c, lc;
 750
 751   int br, i=0, depth=0, in_q = 0;
 752   int state = 0, pos;
 753
 754   assert(s);
 755   assert(allow);
 756
 757   String retString(s, len, CopyString);
 758   rbuf = retString.mutableData();
 759   String allowString;
 760
 761   c = *s;
 762   lc = '\0';
 763   p = s;
 764   rp = rbuf;
 765   br = 0;
 766   if (allow_len) {
 767     assert(allow);
 768
 769     allowString = String(allow_len, ReserveString);
 770     char *atmp = allowString.mutableData();
 771     for (const char *tmp = allow; *tmp; tmp++, atmp++) {
 772       *atmp = tolower((int)*(const unsigned char *)tmp);
 773     }
 774     allowString.setSize(allow_len);
 775     abuf = allowString.data();
 776
 777     tbuf = (char *)req::malloc(PHP_TAG_BUF_SIZE+1);
 778     tp = tbuf;
 779   } else {
 780     abuf = nullptr;
 781     tbuf = tp = nullptr;
 782   }
 783
 784   auto move = [&pos, &tbuf, &tp]() {
 785     if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
 786       pos = tp - tbuf;
 787       tbuf = (char*)req::realloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
 788       tp = tbuf + pos;
 789     }
 790   };
 791
 792   while (i < len) {
 793     switch (c) {
 794     case '\0':
 795       break;
 796     case '<':
 797       if (isspace(*(p + 1)) && !allow_tag_spaces) {
 798         goto reg_char;
 799       }
 800       if (state == 0) {
 801         lc = '<';
 802         state = 1;
 803         if (allow_len) {
 804           move();
 805           *(tp++) = '<';
 806         }
 807       } else if (state == 1) {
 808         depth++;
 809       }
 810       break;
 811
 812     case '(':
 813       if (state == 2) {
 814         if (lc != '"' && lc != '\'') {
 815           lc = '(';
 816           br++;
 817         }
 818       } else if (allow_len && state == 1) {
 819         move();
 820         *(tp++) = c;
 821       } else if (state == 0) {
 822         *(rp++) = c;
 823       }
 824       break;
 825
 826     case ')':
 827       if (state == 2) {
 828         if (lc != '"' && lc != '\'') {
 829           lc = ')';
 830           br--;
 831         }
 832       } else if (allow_len && state == 1) {
 833         move();
 834         *(tp++) = c;
 835       } else if (state == 0) {
 836         *(rp++) = c;
 837       }
 838       break;
 839
 840     case '>':
 841       if (depth) {
 842         depth--;
 843         break;
 844       }
 845
 846       if (in_q) {
 847         break;
 848       }
 849
 850       switch (state) {
 851       case 1: /* HTML/XML */
 852         lc = '>';
 853         in_q = state = 0;
 854         if (allow_len) {
 855           move();
 856           *(tp++) = '>';
 857           *tp='\0';
 858           if (string_tag_find(tbuf, tp-tbuf, abuf)) {
 859             memcpy(rp, tbuf, tp-tbuf);
 860             rp += tp-tbuf;
 861           }
 862           tp = tbuf;
 863         }
 864         break;
 865
 866       case 2: /* PHP */
 867         if (!br && lc != '\"' && *(p-1) == '?') {
 868           in_q = state = 0;
 869           tp = tbuf;
 870         }
 871         break;
 872
 873       case 3:
 874         in_q = state = 0;
 875         tp = tbuf;
 876         break;
 877
 878       case 4: /* JavaScript/CSS/etc... */
 879         if (p >= s + 2 && *(p-1) == '-' && *(p-2) == '-') {
 880           in_q = state = 0;
 881           tp = tbuf;
 882         }
 883         break;
 884
 885       default:
 886         *(rp++) = c;
 887         break;
 888       }
 889       break;
 890
 891     case '"':
 892     case '\'':
 893       if (state == 4) {
 894         /* Inside <!-- comment --> */
 895         break;
 896       } else if (state == 2 && *(p-1) != '\\') {
 897         if (lc == c) {
 898           lc = '\0';
 899         } else if (lc != '\\') {
 900           lc = c;
 901         }
 902       } else if (state == 0) {
 903         *(rp++) = c;
 904       } else if (allow_len && state == 1) {
 905         move();
 906         *(tp++) = c;
 907       }
 908       if (state && p != s && *(p-1) != '\\' && (!in_q || *p == in_q)) {
 909         if (in_q) {
 910           in_q = 0;
 911         } else {
 912           in_q = *p;
 913         }
 914       }
 915       break;
 916
 917     case '!':
 918       /* JavaScript & Other HTML scripting languages */
 919       if (state == 1 && *(p-1) == '<') {
 920         state = 3;
 921         lc = c;
 922       } else {
 923         if (state == 0) {
 924           *(rp++) = c;
 925         } else if (allow_len && state == 1) {
 926           move();
 927           *(tp++) = c;
 928         }
 929       }
 930       break;
 931
 932     case '-':
 933       if (state == 3 && p >= s + 2 && *(p-1) == '-' && *(p-2) == '!') {
 934         state = 4;
 935       } else {
 936         goto reg_char;
 937       }
 938       break;
 939
 940     case '?':
 941
 942       if (state == 1 && *(p-1) == '<') {
 943         br=0;
 944         state=2;
 945         break;
 946       }
 947
 948     case 'E':
 949     case 'e':
 950       /* !DOCTYPE exception */
 951       if (state==3 && p > s+6
 952           && tolower(*(p-1)) == 'p'
 953           && tolower(*(p-2)) == 'y'
 954           && tolower(*(p-3)) == 't'
 955           && tolower(*(p-4)) == 'c'
 956           && tolower(*(p-5)) == 'o'
 957           && tolower(*(p-6)) == 'd') {
 958         state = 1;
 959         break;
 960       }
 961       /* fall-through */
 962
 963     case 'l':
 964
 965       /* swm: If we encounter '<?xml' then we shouldn't be in
 966        * state == 2 (PHP). Switch back to HTML.
 967        */
 968
 969       if (state == 2 && p > s+2 && *(p-1) == 'm' && *(p-2) == 'x') {
 970         state = 1;
 971         break;
 972       }
 973
 974       /* fall-through */
 975     default:
 976     reg_char:
 977       if (state == 0) {
 978         *(rp++) = c;
 979       } else if (allow_len && state == 1) {
 980         move();
 981         *(tp++) = c;
 982       }
 983       break;
 984     }
 985     c = *(++p);
 986     i++;
 987   }
 988   if (rp < rbuf + len) {
 989     *rp = '\0';
 990   }
 991   if (allow_len) {
 992     req::free(tbuf);
 993   }
 994
 995   retString.setSize(rp - rbuf);
 996   return retString;
 997 }
 998
 999 ///////////////////////////////////////////////////////////////////////////////
1000
1001 String string_addslashes(const char *str, int length) {
1002   assert(str);
1003   if (length == 0) {
1004     return String();
1005   }
1006
1007   String retString((length << 1) + 1, ReserveString);
1008   char *new_str = retString.mutableData();
1009   const char *source = str;
1010   const char *end = source + length;
1011   char *target = new_str;
1012
1013   while (source < end) {
1014     switch (*source) {
1015     case '\0':
1016       *target++ = '\\';
1017       *target++ = '0';
1018       break;
1019     case '\'':
1020     case '\"':
1021     case '\\':
1022       *target++ = '\\';
1023       /* break is missing *intentionally* */
1024     default:
1025       *target++ = *source;
1026       break;
1027     }
1028
1029     source++;
1030   }
1031
1032   retString.setSize(target - new_str);
1033   return retString;
1034 }
1035
1036 ///////////////////////////////////////////////////////////////////////////////
1037
/**
 * Converts one hexadecimal digit character to its value.
 *
 * Plain range comparisons are used instead of isdigit() so that the result
 * is well defined for any int, including negative values obtained from a
 * signed char.
 *
 * @param c  character code
 * @return 0..15 for '0'-'9', 'A'-'F' and 'a'-'f'; -1 for anything else
 */
static char string_hex2int(int c) {
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'A' && c <= 'F') {
    return c - 'A' + 10;
  }
  if (c >= 'a' && c <= 'f') {
    return c - 'a' + 10;
  }
  return -1;
}
1050
1051 String string_quoted_printable_encode(const char *input, int len) {
1052   size_t length = len;
1053   const unsigned char *str = (unsigned char*)input;
1054
1055   unsigned long lp = 0;
1056   unsigned char c;
1057   char *d, *buffer;
1058   char *hex = "0123456789ABCDEF";
1059
1060   String ret(
1061     safe_address(
1062       3,
1063       length + ((safe_address(3, length, 0)/(PHP_QPRINT_MAXL-9)) + 1),
1064       1),
1065     ReserveString
1066   );
1067   d = buffer = ret.mutableData();
1068
1069   while (length--) {
1070     if (((c = *str++) == '\015') && (*str == '\012') && length > 0) {
1071       *d++ = '\015';
1072       *d++ = *str++;
1073       length--;
1074       lp = 0;
1075     } else {
1076       if (iscntrl (c) || (c == 0x7f) || (c & 0x80) ||
1077           (c == '=') || ((c == ' ') && (*str == '\015'))) {
1078         if ((((lp+= 3) > PHP_QPRINT_MAXL) && (c <= 0x7f))
1079             || ((c > 0x7f) && (c <= 0xdf) && ((lp + 3) > PHP_QPRINT_MAXL))
1080             || ((c > 0xdf) && (c <= 0xef) && ((lp + 6) > PHP_QPRINT_MAXL))
1081             || ((c > 0xef) && (c <= 0xf4) && ((lp + 9) > PHP_QPRINT_MAXL))) {
1082           *d++ = '=';
1083           *d++ = '\015';
1084           *d++ = '\012';
1085           lp = 3;
1086         }
1087         *d++ = '=';
1088         *d++ = hex[c >> 4];
1089         *d++ = hex[c & 0xf];
1090       } else {
1091         if ((++lp) > PHP_QPRINT_MAXL) {
1092           *d++ = '=';
1093           *d++ = '\015';
1094           *d++ = '\012';
1095           lp = 1;
1096         }
1097         *d++ = c;
1098       }
1099     }
1100   }
1101   len = d - buffer;
1102
1103   ret.setSize(len);
1104   return ret;
1105 }
1106
1107 String string_quoted_printable_decode(const char *input, int len, bool is_q) {
1108   assert(input);
1109   if (len == 0) {
1110     return String();
1111   }
1112
1113   int i = 0, j = 0, k;
1114   const char *str_in = input;
1115   String ret(len, ReserveString);
1116   char *str_out = ret.mutableData();
1117   while (i < len && str_in[i]) {
1118     switch (str_in[i]) {
1119     case '=':
1120       if (i + 2 < len && str_in[i + 1] && str_in[i + 2] &&
1121           isxdigit((int) str_in[i + 1]) && isxdigit((int) str_in[i + 2]))
1122         {
1123           str_out[j++] = (string_hex2int((int) str_in[i + 1]) << 4)
1124             + string_hex2int((int) str_in[i + 2]);
1125           i += 3;
1126         } else  /* check for soft line break according to RFC 2045*/ {
1127         k = 1;
1128         while (str_in[i + k] &&
1129                ((str_in[i + k] == 32) || (str_in[i + k] == 9))) {
1130           /* Possibly, skip spaces/tabs at the end of line */
1131           k++;
1132         }
1133         if (!str_in[i + k]) {
1134           /* End of line reached */
1135           i += k;
1136         }
1137         else if ((str_in[i + k] == 13) && (str_in[i + k + 1] == 10)) {
1138           /* CRLF */
1139           i += k + 2;
1140         }
1141         else if ((str_in[i + k] == 13) || (str_in[i + k] == 10)) {
1142           /* CR or LF */
1143           i += k + 1;
1144         }
1145         else {
1146           str_out[j++] = str_in[i++];
1147         }
1148       }
1149       break;
1150     case '_':
1151       if (is_q) {
1152         str_out[j++] = ' ';
1153         i++;
1154       } else {
1155         str_out[j++] = str_in[i++];
1156       }
1157       break;
1158     default:
1159       str_out[j++] = str_in[i++];
1160     }
1161   }
1162   ret.setSize(j);
1163   return ret;
1164 }
1165
1166 Variant string_base_to_numeric(const char *s, int len, int base) {
1167   int64_t num = 0;
1168   double fnum = 0;
1169   int mode = 0;
1170   int64_t cutoff;
1171   int cutlim;
1172
1173   assert(string_validate_base(base));
1174
1175   cutoff = LONG_MAX / base;
1176   cutlim = LONG_MAX % base;
1177
1178   for (int i = len; i > 0; i--) {
1179     char c = *s++;
1180
1181     /* might not work for EBCDIC */
1182     if (c >= '0' && c <= '9')
1183       c -= '0';
1184     else if (c >= 'A' && c <= 'Z')
1185       c -= 'A' - 10;
1186     else if (c >= 'a' && c <= 'z')
1187       c -= 'a' - 10;
1188     else
1189       continue;
1190
1191     if (c >= base)
1192       continue;
1193
1194     switch (mode) {
1195     case 0: /* Integer */
1196       if (num < cutoff || (num == cutoff && c <= cutlim)) {
1197         num = num * base + c;
1198         break;
1199       } else {
1200         fnum = num;
1201         mode = 1;
1202       }
1203       /* fall-through */
1204     case 1: /* Float */
1205       fnum = fnum * base + c;
1206     }
1207   }
1208
1209   if (mode == 1) {
1210     return fnum;
1211   }
1212   return num;
1213 }
1214
1215 String string_long_to_base(unsigned long value, int base) {
1216   static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1217   char buf[(sizeof(unsigned long) << 3) + 1];
1218   char *ptr, *end;
1219
1220   assert(string_validate_base(base));
1221
1222   end = ptr = buf + sizeof(buf) - 1;
1223
1224   do {
1225     *--ptr = digits[value % base];
1226     value /= base;
1227   } while (ptr > buf && value);
1228
1229   return String(ptr, end - ptr, CopyString);
1230 }
1231
1232 String string_numeric_to_base(const Variant& value, int base) {
1233   static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
1234
1235   assert(string_validate_base(base));
1236   if ((!value.isInteger() && !value.isDouble())) {
1237     return empty_string();
1238   }
1239
1240   if (value.isDouble()) {
1241     double fvalue = floor(value.toDouble()); /* floor it just in case */
1242     char *ptr, *end;
1243     char buf[(sizeof(double) << 3) + 1];
1244
1245     /* Don't try to convert +/- infinity */
1246     if (fvalue == HUGE_VAL || fvalue == -HUGE_VAL) {
1247       raise_warning("Number too large");
1248       return empty_string();
1249     }
1250
1251     end = ptr = buf + sizeof(buf) - 1;
1252
1253     do {
1254       *--ptr = digits[(int) fmod(fvalue, base)];
1255       fvalue /= base;
1256     } while (ptr > buf && fabs(fvalue) >= 1);
1257
1258     return String(ptr, end - ptr, CopyString);
1259   }
1260
1261   return string_long_to_base(value.toInt64(), base);
1262 }
1263
1264 ///////////////////////////////////////////////////////////////////////////////
1265 // uuencode
1266
/* Map a 6-bit value to its uuencode character; zero is written as '`'. */
#define PHP_UU_ENC(c) \
  ((c) ? ((c) & 077) + ' ' : '`')
/* Second character of the group at c: low 2 bits of c[0], high 4 of c[1]. */
#define PHP_UU_ENC_C2(c) \
  PHP_UU_ENC(((*(c) << 4) & 060) | ((*((c) + 1) >> 4) & 017))
/* Third character of the group at c: low 4 bits of c[1], high 2 of c[2].
 * The argument is fully parenthesized so pointer expressions are safe. */
#define PHP_UU_ENC_C3(c) \
  PHP_UU_ENC(((*((c) + 1) << 2) & 074) | ((*((c) + 2) >> 6) & 03))
/* Map a uuencode character back to its 6-bit value. */
#define PHP_UU_DEC(c) \
  (((c) - ' ') & 077)
1275
1276 String string_uuencode(const char *src, int src_len) {
1277   assert(src);
1278   assert(src_len);
1279
1280   int len = 45;
1281   char *p;
1282   const char *s, *e, *ee;
1283   char *dest;
1284
1285   /* encoded length is ~ 38% greater than the original */
1286   String ret((int)ceil(src_len * 1.38) + 45, ReserveString);
1287   p = dest = ret.mutableData();
1288   s = src;
1289   e = src + src_len;
1290
1291   while ((s + 3) < e) {
1292     ee = s + len;
1293     if (ee > e) {
1294       ee = e;
1295       len = ee - s;
1296       if (len % 3) {
1297         ee = s + (int) (floor(len / 3) * 3);
1298       }
1299     }
1300     *p++ = PHP_UU_ENC(len);
1301
1302     while (s < ee) {
1303       *p++ = PHP_UU_ENC(*s >> 2);
1304       *p++ = PHP_UU_ENC_C2(s);
1305       *p++ = PHP_UU_ENC_C3(s);
1306       *p++ = PHP_UU_ENC(*(s + 2) & 077);
1307
1308       s += 3;
1309     }
1310
1311     if (len == 45) {
1312       *p++ = '\n';
1313     }
1314   }
1315
1316   if (s < e) {
1317     if (len == 45) {
1318       *p++ = PHP_UU_ENC(e - s);
1319       len = 0;
1320     }
1321
1322     *p++ = PHP_UU_ENC(*s >> 2);
1323     *p++ = PHP_UU_ENC_C2(s);
1324     *p++ = ((e - s) > 1) ? PHP_UU_ENC_C3(s) : PHP_UU_ENC('\0');
1325     *p++ = ((e - s) > 2) ? PHP_UU_ENC(*(s + 2) & 077) : PHP_UU_ENC('\0');
1326   }
1327
1328   if (len < 45) {
1329     *p++ = '\n';
1330   }
1331
1332   *p++ = PHP_UU_ENC('\0');
1333   *p++ = '\n';
1334   *p = '\0';
1335
1336   ret.setSize(p - dest);
1337   return ret;
1338 }
1339
1340 String string_uudecode(const char *src, int src_len) {
1341   int total_len = 0;
1342   int len;
1343   const char *s, *e, *ee;
1344   char *p, *dest;
1345
1346   String ret(ceil(src_len * 0.75), ReserveString);
1347   p = dest = ret.mutableData();
1348   s = src;
1349   e = src + src_len;
1350
1351   while (s < e) {
1352     if ((len = PHP_UU_DEC(*s++)) <= 0) {
1353       break;
1354     }
1355     /* sanity check */
1356     if (len > src_len) {
1357       goto err;
1358     }
1359
1360     total_len += len;
1361
1362     ee = s + (len == 45 ? 60 : (int) floor(len * 1.33));
1363     /* sanity check */
1364     if (ee > e) {
1365       goto err;
1366     }
1367
1368     while (s < ee) {
1369       if (s + 4 > e) goto err;
1370
1371       *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
1372       *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
1373       *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
1374       s += 4;
1375     }
1376
1377     if (len < 45) {
1378       break;
1379     }
1380
1381     /* skip \n */
1382     s++;
1383   }
1384
1385   if ((len = total_len > (p - dest))) {
1386     *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4;
1387     if (len > 1) {
1388       *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2;
1389       if (len > 2) {
1390         *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3));
1391       }
1392     }
1393   }
1394
1395   ret.setSize(total_len);
1396   return ret;
1397
1398  err:
1399   return String();
1400 }
1401
1402 ///////////////////////////////////////////////////////////////////////////////
1403 // base64
1404
/* The 64 base64 digits in value order, followed by a terminating NUL. */
static const char base64_table[] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  "abcdefghijklmnopqrstuvwxyz"
  "0123456789+/";

/* Character used to pad the final group of encoded output. */
static const char base64_pad = '=';
1414
/* Byte value -> base64 digit value (0..63).
 * -1 marks the whitespace bytes tab, LF, CR and space; -2 marks every other
 * byte that is not a base64 digit. */
static const short base64_reverse_table[256] = {
  /* 0x00 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2,
  /* 0x10 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0x20 */ -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 63,
  /* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2,
  /* 0x40 */ -2,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
  /* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2,
  /* 0x60 */ -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
  /* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2,
  /* 0x80 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0x90 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xa0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xb0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xc0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xd0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xe0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
  /* 0xf0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2
};
1433
1434 static String php_base64_encode(const unsigned char *str, int length) {
1435   const unsigned char *current = str;
1436   unsigned char *p;
1437   unsigned char *result;
1438
1439   if ((length + 2) < 0 || ((length + 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) {
1440     return String();
1441   }
1442
1443   String ret(((length + 2) / 3) * 4, ReserveString);
1444   p = result = (unsigned char *)ret.mutableData();
1445
1446   while (length > 2) { /* keep going until we have less than 24 bits */
1447     *p++ = base64_table[current[0] >> 2];
1448     *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
1449     *p++ = base64_table[((current[1] & 0x0f) << 2) + (current[2] >> 6)];
1450     *p++ = base64_table[current[2] & 0x3f];
1451
1452     current += 3;
1453     length -= 3; /* we just handle 3 octets of data */
1454   }
1455
1456   /* now deal with the tail end of things */
1457   if (length != 0) {
1458     *p++ = base64_table[current[0] >> 2];
1459     if (length > 1) {
1460       *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)];
1461       *p++ = base64_table[(current[1] & 0x0f) << 2];
1462       *p++ = base64_pad;
1463     } else {
1464       *p++ = base64_table[(current[0] & 0x03) << 4];
1465       *p++ = base64_pad;
1466       *p++ = base64_pad;
1467     }
1468   }
1469   ret.setSize(p - result);
1470   return ret;
1471 }
1472
1473 static String php_base64_decode(const char *str, int length, bool strict) {
1474   const unsigned char *current = (unsigned char*)str;
1475   int ch, i = 0, j = 0, k;
1476   /* this sucks for threaded environments */
1477
1478   String retString(length, ReserveString);
1479   unsigned char* result = (unsigned char*)retString.mutableData();
1480
1481   /* run through the whole string, converting as we go */
1482   while ((ch = *current++) != '\0' && length-- > 0) {
1483     if (ch == base64_pad) {
1484       if (*current != '=' && ((i % 4) == 1 || (strict && length > 0))) {
1485         if ((i % 4) != 1) {
1486           while (isspace(*(++current))) {
1487             continue;
1488           }
1489           if (*current == '\0') {
1490             continue;
1491           }
1492         }
1493         return String();
1494       }
1495       continue;
1496     }
1497
1498     ch = base64_reverse_table[ch];
1499     if ((!strict && ch < 0) || ch == -1) {
1500       /* a space or some other separator character, we simply skip over */
1501       continue;
1502     } else if (ch == -2) {
1503       return String();
1504     }
1505
1506     switch(i % 4) {
1507     case 0:
1508       result[j] = ch << 2;
1509       break;
1510     case 1:
1511       result[j++] |= ch >> 4;
1512       result[j] = (ch & 0x0f) << 4;
1513       break;
1514     case 2:
1515       result[j++] |= ch >>2;
1516       result[j] = (ch & 0x03) << 6;
1517       break;
1518     case 3:
1519       result[j++] |= ch;
1520       break;
1521     }
1522     i++;
1523   }
1524
1525   k = j;
1526   /* mop things up if we ended on a boundary */
1527   if (ch == base64_pad) {
1528     switch(i % 4) {
1529     case 1:
1530       return String();
1531     case 2:
1532       k++;
1533     case 3:
1534       result[k] = 0;
1535     }
1536   }
1537   retString.setSize(j);
1538   return retString;
1539 }
1540
1541 String string_base64_encode(const char *input, int len) {
1542   return php_base64_encode((unsigned char *)input, len);
1543 }
1544
1545 String string_base64_decode(const char *input, int len, bool strict) {
1546   return php_base64_decode(input, len, strict);
1547 }
1548
1549 ///////////////////////////////////////////////////////////////////////////////
1550
1551 String string_escape_shell_arg(const char *str) {
1552   int x, y, l;
1553   char *cmd;
1554
1555   y = 0;
1556   l = strlen(str);
1557
1558   String ret(safe_address(l, 4, 3), ReserveString); /* worst case */
1559   cmd = ret.mutableData();
1560
1561   cmd[y++] = '\'';
1562
1563   for (x = 0; x < l; x++) {
1564     switch (str[x]) {
1565     case '\'':
1566       cmd[y++] = '\'';
1567       cmd[y++] = '\\';
1568       cmd[y++] = '\'';
1569       /* fall-through */
1570     default:
1571       cmd[y++] = str[x];
1572     }
1573   }
1574   cmd[y++] = '\'';
1575   ret.setSize(y);
1576   return ret;
1577 }
1578
1579 String string_escape_shell_cmd(const char *str) {
1580   register int x, y, l;
1581   char *cmd;
1582   char *p = nullptr;
1583
1584   l = strlen(str);
1585   String ret(safe_address(l, 2, 1), ReserveString);
1586   cmd = ret.mutableData();
1587
1588   for (x = 0, y = 0; x < l; x++) {
1589     switch (str[x]) {
1590     case '"':
1591     case '\'':
1592       if (!p && (p = (char *)memchr(str + x + 1, str[x], l - x - 1))) {
1593         /* noop */
1594       } else if (p && *p == str[x]) {
1595         p = nullptr;
1596       } else {
1597         cmd[y++] = '\\';
1598       }
1599       cmd[y++] = str[x];
1600       break;
1601     case '#': /* This is character-set independent */
1602     case '&':
1603     case ';':
1604     case '`':
1605     case '|':
1606     case '*':
1607     case '?':
1608     case '~':
1609     case '<':
1610     case '>':
1611     case '^':
1612     case '(':
1613     case ')':
1614     case '[':
1615     case ']':
1616     case '{':
1617     case '}':
1618     case '$':
1619     case '\\':
1620     case '\x0A': /* excluding these two */
1621     case '\xFF':
1622       cmd[y++] = '\\';
1623       /* fall-through */
1624     default:
1625       cmd[y++] = str[x];
1626     }
1627   }
1628   ret.setSize(y);
1629   return ret;
1630 }
1631
1632 ///////////////////////////////////////////////////////////////////////////////
1633
/**
 * Find the longest run of bytes that occurs in both buffers.
 *
 * @param txt1,len1  first buffer and its length
 * @param txt2,len2  second buffer and its length
 * @param pos1       receives the offset of the run in txt1
 * @param pos2       receives the offset of the run in txt2
 * @param max        receives the length of the run (0 if nothing matches)
 *
 * pos1/pos2 are only written when a non-empty run is found. On ties the
 * first run encountered (lowest offset in txt1, then in txt2) is reported.
 */
static void string_similar_str(const char *txt1, int len1,
                               const char *txt2, int len2,
                               int *pos1, int *pos2, int *max) {
  *max = 0;
  for (int i = 0; i < len1; ++i) {
    for (int j = 0; j < len2; ++j) {
      // Length of the common run starting at txt1[i] / txt2[j].
      int run = 0;
      while (i + run < len1 && j + run < len2 &&
             txt1[i + run] == txt2[j + run]) {
        ++run;
      }
      if (run > *max) {
        *max = run;
        *pos1 = i;
        *pos2 = j;
      }
    }
  }
}
1654
1655 static int string_similar_char(const char *txt1, int len1,
1656                                const char *txt2, int len2) {
1657   int sum;
1658   int pos1 = 0, pos2 = 0, max;
1659
1660   string_similar_str(txt1, len1, txt2, len2, &pos1, &pos2, &max);
1661   if ((sum = max)) {
1662     if (pos1 && pos2) {
1663       sum += string_similar_char(txt1, pos1, txt2, pos2);
1664     }
1665     if ((pos1 + max < len1) && (pos2 + max < len2)) {
1666       sum += string_similar_char(txt1 + pos1 + max, len1 - pos1 - max,
1667                                  txt2 + pos2 + max, len2 - pos2 - max);
1668     }
1669   }
1670
1671   return sum;
1672 }
1673
1674 int string_similar_text(const char *t1, int len1,
1675                         const char *t2, int len2, float *percent) {
1676   if (len1 == 0 && len2 == 0) {
1677     if (percent) *percent = 0.0;
1678     return 0;
1679   }
1680
1681   int sim = string_similar_char(t1, len1, t2, len2);
1682   if (percent) *percent = sim * 200.0 / (len1 + len2);
1683   return sim;
1684 }
1685
1686 ///////////////////////////////////////////////////////////////////////////////
1687
1688 #define LEVENSHTEIN_MAX_LENTH 255
1689
1690 // reference implementation, only optimized for memory usage, not speed
1691 int string_levenshtein(const char *s1, int l1, const char *s2, int l2,
1692                        int cost_ins, int cost_rep, int cost_del ) {
1693   int *p1, *p2, *tmp;
1694   int i1, i2, c0, c1, c2;
1695
1696   if (l1==0) return l2*cost_ins;
1697   if (l2==0) return l1*cost_del;
1698
1699   if ((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) {
1700     raise_warning("levenshtein(): Argument string(s) too long");
1701     return -1;
1702   }
1703
1704   p1 = (int*)req::malloc((l2+1) * sizeof(int));
1705   p2 = (int*)req::malloc((l2+1) * sizeof(int));
1706
1707   for(i2=0;i2<=l2;i2++) {
1708     p1[i2] = i2*cost_ins;
1709   }
1710
1711   for(i1=0;i1<l1;i1++) {
1712     p2[0]=p1[0]+cost_del;
1713     for(i2=0;i2<l2;i2++) {
1714       c0=p1[i2]+((s1[i1]==s2[i2])?0:cost_rep);
1715       c1=p1[i2+1]+cost_del; if (c1<c0) c0=c1;
1716       c2=p2[i2]+cost_ins; if (c2<c0) c0=c2;
1717       p2[i2+1]=c0;
1718     }
1719     tmp=p1; p1=p2; p2=tmp;
1720   }
1721
1722   c0=p1[l2];
1723   req::free(p1);
1724   req::free(p2);
1725   return c0;
1726 }
1727
1728 ///////////////////////////////////////////////////////////////////////////////
1729
1730 String string_money_format(const char *format, double value) {
1731   bool check = false;
1732   const char *p = format;
1733   while ((p = strchr(p, '%'))) {
1734     if (*(p + 1) == '%') {
1735       p += 2;
1736     } else if (!check) {
1737       check = true;
1738       p++;
1739     } else {
1740       throw_invalid_argument
1741         ("format: Only a single %%i or %%n token can be used");
1742       return String();
1743     }
1744   }
1745
1746   int format_len = strlen(format);
1747   int str_len = safe_address(format_len, 1, 1024);
1748   String ret(str_len, ReserveString);
1749   char *str = ret.mutableData();
1750   if ((str_len = strfmon(str, str_len, format, value)) < 0) {
1751     return String();
1752   }
1753   ret.setSize(str_len);
1754   return ret;
1755 }
1756
1757 ///////////////////////////////////////////////////////////////////////////////
1758
1759 String string_number_format(double d, int dec,
1760                             const String& dec_point,
1761                             const String& thousand_sep) {
1762   char *tmpbuf = nullptr, *resbuf;
1763   char *s, *t;  /* source, target */
1764   char *dp;
1765   int integral;
1766   int tmplen, reslen=0;
1767   int count=0;
1768   int is_negative=0;
1769
1770   if (d < 0) {
1771     is_negative = 1;
1772     d = -d;
1773   }
1774
1775   if (dec < 0) dec = 0;
1776   d = php_math_round(d, dec);
1777
1778   // departure from PHP: we got rid of dependencies on spprintf() here.
1779   String tmpstr(63, ReserveString);
1780   tmpbuf = tmpstr.mutableData();
1781   tmplen = snprintf(tmpbuf, 64, "%.*F", dec, d);
1782   if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
1783     tmpstr.setSize(tmplen);
1784     return tmpstr;
1785   }
1786   if (tmplen >= 64) {
1787     // Uncommon, asked for more than 64 chars worth of precision
1788     tmpstr = String(tmplen, ReserveString);
1789     tmpbuf = tmpstr.mutableData();
1790     tmplen = snprintf(tmpbuf, tmplen + 1, "%.*F", dec, d);
1791     if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) {
1792       tmpstr.setSize(tmplen);
1793       return tmpstr;
1794     }
1795   }
1796
1797   /* find decimal point, if expected */
1798   if (dec) {
1799     dp = strpbrk(tmpbuf, ".,");
1800   } else {
1801     dp = nullptr;
1802   }
1803
1804   /* calculate the length of the return buffer */
1805   if (dp) {
1806     integral = dp - tmpbuf;
1807   } else {
1808     /* no decimal point was found */
1809     integral = tmplen;
1810   }
1811
1812   /* allow for thousand separators */
1813   if (!thousand_sep.empty()) {
1814     integral += ((integral-1) / 3) * thousand_sep.size();
1815   }
1816
1817   reslen = integral;
1818
1819   if (dec) {
1820     reslen += dec;
1821
1822     if (!dec_point.empty()) {
1823       reslen += dec_point.size();
1824     }
1825   }
1826
1827   /* add a byte for minus sign */
1828   if (is_negative) {
1829     reslen++;
1830   }
1831   String resstr(reslen, ReserveString);
1832   resbuf = resstr.mutableData();
1833
1834   s = tmpbuf+tmplen-1;
1835   t = resbuf+reslen-1;
1836
1837   /* copy the decimal places.
1838    * Take care, as the sprintf implementation may return less places than
1839    * we requested due to internal buffer limitations */
1840   if (dec) {
1841     int declen = dp ? s - dp : 0;
1842     int topad = dec > declen ? dec - declen : 0;
1843
1844     /* pad with '0's */
1845     while (topad--) {
1846       *t-- = '0';
1847     }
1848
1849     if (dp) {
1850       s -= declen + 1; /* +1 to skip the point */
1851       t -= declen;
1852
1853       /* now copy the chars after the point */
1854       memcpy(t + 1, dp + 1, declen);
1855     }
1856
1857     /* add decimal point */
1858     if (!dec_point.empty()) {
1859       memcpy(t + (1 - dec_point.size()), dec_point.data(), dec_point.size());
1860       t -= dec_point.size();
1861     }
1862   }
1863
1864   /* copy the numbers before the decimal point, adding thousand
1865    * separator every three digits */
1866   while(s >= tmpbuf) {
1867     *t-- = *s--;
1868     if (thousand_sep && (++count%3)==0 && s>=tmpbuf) {
1869       memcpy(t + (1 - thousand_sep.size()),
1870              thousand_sep.data(),
1871              thousand_sep.size());
1872       t -= thousand_sep.size();
1873     }
1874   }
1875
1876   /* and a minus sign, if needed */
1877   if (is_negative) {
1878     *t-- = '-';
1879   }
1880
1881   resstr.setSize(reslen);
1882   return resstr;
1883 }
1884
1885 ///////////////////////////////////////////////////////////////////////////////
1886 // soundex
1887
1888 /* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */
1889 String string_soundex(const String& str) {
1890   assert(!str.empty());
1891   int _small, code, last;
1892   String retString(4, ReserveString);
1893   char* soundex = retString.mutableData();
1894
1895   static char soundex_table[26] = {
1896     0,              /* A */
1897     '1',            /* B */
1898     '2',            /* C */
1899     '3',            /* D */
1900     0,              /* E */
1901     '1',            /* F */
1902     '2',            /* G */
1903     0,              /* H */
1904     0,              /* I */
1905     '2',            /* J */
1906     '2',            /* K */
1907     '4',            /* L */
1908     '5',            /* M */
1909     '5',            /* N */
1910     0,              /* O */
1911     '1',            /* P */
1912     '2',            /* Q */
1913     '6',            /* R */
1914     '2',            /* S */
1915     '3',            /* T */
1916     0,              /* U */
1917     '1',            /* V */
1918     0,              /* W */
1919     '2',            /* X */
1920     0,              /* Y */
1921     '2'             /* Z */
1922   };
1923
1924   /* build soundex string */
1925   last = -1;
1926   const char *p = str.slice().ptr;
1927   for (_small = 0; *p && _small < 4; p++) {
1928     /* convert chars to upper case and strip non-letter chars */
1929     /* BUG: should also map here accented letters used in non */
1930     /* English words or names (also found in English text!): */
1931     /* esstsett, thorn, n-tilde, c-cedilla, s-caron, ... */
1932     code = toupper((int)(unsigned char)(*p));
1933     if (code >= 'A' && code <= 'Z') {
1934       if (_small == 0) {
1935         /* remember first valid char */
1936         soundex[_small++] = code;
1937         last = soundex_table[code - 'A'];
1938       } else {
1939         /* ignore sequences of consonants with same soundex */
1940         /* code in trail, and vowels unless they separate */
1941         /* consonant letters */
1942         code = soundex_table[code - 'A'];
1943         if (code != last) {
1944           if (code != 0) {
1945             soundex[_small++] = code;
1946           }
1947           last = code;
1948         }
1949       }
1950     }
1951   }
1952   /* pad with '0' and terminate with 0 ;-) */
1953   while (_small < 4) {
1954     soundex[_small++] = '0';
1955   }
1956   retString.setSize(4);
1957   return retString;
1958 }
1959
1960 ///////////////////////////////////////////////////////////////////////////////
1961 // metaphone
1962
/**
 * This is the original code by Michael G Schwern, changed only slightly
 * (use emalloc, get rid of includes, etc.)
 * - thies - 13.09.1999
 */
1969
1970 /*-----------------------------  */
1971 /* this used to be "metaphone.h" */
1972 /*-----------------------------  */
1973
/* Special encodings */
#define SH 'X'  /* code emitted for the "sh" sound */
#define TH '0'  /* code for "th": the digit zero, not the letter O */
1977
1978 /*-----------------------------  */
1979 /* end of "metaphone.h"          */
1980 /*-----------------------------  */
1981
1982 /*----------------------------- */
1983 /* this used to be "metachar.h" */
1984 /*----------------------------- */
1985
/* Metachar.h ... little bits about characters for metaphone */
/*-- Character encoding array & accessing macros --*/
/* Stolen directly out of the book... */
/* One flag byte per letter 'A'..'Z':
 *    1  vowel (A E I O U)
 *    2  passed through unchanged (F J L M N R)
 *    4  forms a diphthong when preceding H (C G P S T)
 *    8  makes C and G soft (E I Y)
 *   16  prevents GH from becoming F (B D H)
 */
char _codes[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0};

/* Flag byte for a character code in the unsigned char range; 0 for anything
 * that does not upper-case to an ASCII letter. The explicit range check
 * keeps the index into _codes within bounds whatever the locale considers
 * alphabetic. */
static inline int metaphone_char_flags(int c) {
  c = toupper(c);
  return (c >= 'A' && c <= 'Z') ? _codes[c - 'A'] : 0;
}

/* The cast keeps negative char values away from the <ctype.h> functions. */
#define ENCODE(c) (metaphone_char_flags((unsigned char)(c)))

#define isvowel(c)  (ENCODE(c) & 1)    /* AEIOU */

/* These letters are passed through unchanged */
#define NOCHANGE(c) (ENCODE(c) & 2)    /* FJLMNR */

/* These form diphthongs when preceding H */
#define AFFECTH(c)  (ENCODE(c) & 4)    /* CGPST */

/* These make C and G soft */
#define MAKESOFT(c) (ENCODE(c) & 8)    /* EIY */

/* These prevent GH from becoming F */
#define NOGHTOF(c)  (ENCODE(c) & 16)  /* BDH */
2006
2007 /*----------------------------- */
2008 /* end of "metachar.h"          */
2009 /*----------------------------- */
2010
/* I suppose I could have been using a character pointer instead of
 * accessing the array directly... */

/* All of these expect `word` (the input) and `w_idx` (the current position)
 * to be in scope at the point of use. */

/* Look at the next letter in the word */
#define Next_Letter ((char)toupper(word[w_idx+1]))
/* Look at the current letter in the word */
#define Curr_Letter ((char)toupper(word[w_idx]))
/* Go N letters back; yields '\0' when that would be before the start. */
#define Look_Back_Letter(n) \
  (w_idx >= (n) ? (char)toupper(word[w_idx-(n)]) : '\0')
/* Previous letter.  I dunno, should this return null on failure? */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down.  It makes sure you don't walk off the string. */
#define After_Next_Letter  (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) \
                           : '\0')
/* Look n letters ahead, stopping at the end of the string. */
#define Look_Ahead_Letter(n) ((char)toupper(Lookahead(word+w_idx, n)))
2026
/* Safely look ahead an arbitrary number of letters: returns the character
 * how_far positions into word, or the terminating NUL if the string ends
 * before that. */
static char Lookahead(unsigned char *word, int how_far) {
  int idx = 0;

  /* Edge forward in the string, but never past its end */
  while (word[idx] != '\0' && idx < how_far) {
    ++idx;
  }

  /* idx is now either == how_far or the index of the terminator */
  return (char)word[idx];
}
2040
/* Phonize one letter: append it to `buffer`, which must be in scope at the
 * point of use. The output size is not known in advance, so storage is left
 * to buffer.append(). */
#define Phonize(c)  { buffer.append(c); }
/* How long is the phoned word so far? */
#define Phone_Len  (buffer.size())

/* Tells whether a character is a 'break' in the word (any non-letter) */
#define Isbreak(c)  (!isalpha(c))
2051
2052 String string_metaphone(const char *input, int word_len, long max_phonemes,
2053                         int traditional) {
2054   unsigned char *word = (unsigned char *)input;
2055
2056   int w_idx = 0;        /* point in the phonization we're at. */
2057   int max_buffer_len = 0;    /* maximum length of the destination buffer */
2058
2059   /*-- Parameter checks --*/
2060   /* Negative phoneme length is meaningless */
2061
2062   if (max_phonemes < 0)
2063     return String();
2064
2065   /* Empty/null string is meaningless */
2066   /* Overly paranoid */
2067   /* always_assert(word != NULL && word[0] != '\0'); */
2068
2069   if (word == nullptr)
2070     return String();
2071
2072   /*-- Allocate memory for our phoned_phrase --*/
2073   if (max_phonemes == 0) {  /* Assume largest possible */
2074     max_buffer_len = word_len;
2075   } else {
2076     max_buffer_len = max_phonemes;
2077   }
2078   StringBuffer buffer(max_buffer_len);
2079
2080   /*-- The first phoneme has to be processed specially. --*/
2081   /* Find our first letter */
2082   for (; !isalpha(Curr_Letter); w_idx++) {
2083     /* On the off chance we were given nothing but crap... */
2084     if (Curr_Letter == '\0') {
2085       return buffer.detach();  /* For testing */
2086     }
2087   }
2088
2089   switch (Curr_Letter) {
2090     /* AE becomes E */
2091   case 'A':
2092     if (Next_Letter == 'E') {
2093       Phonize('E');
2094       w_idx += 2;
2095     }
2096     /* Remember, preserve vowels at the beginning */
2097     else {
2098       Phonize('A');
2099       w_idx++;
2100     }
2101     break;
2102     /* [GKP]N becomes N */
2103   case 'G':
2104   case 'K':
2105   case 'P':
2106     if (Next_Letter == 'N') {
2107       Phonize('N');
2108       w_idx += 2;
2109     }
2110     break;
2111     /* WH becomes H,
2112        WR becomes R
2113        W if followed by a vowel */
2114   case 'W':
2115     if (Next_Letter == 'H' ||
2116       Next_Letter == 'R') {
2117       Phonize(Next_Letter);
2118       w_idx += 2;
2119     } else if (isvowel(Next_Letter)) {
2120       Phonize('W');
2121       w_idx += 2;
2122     }
2123     /* else ignore */
2124     break;
2125     /* X becomes S */
2126   case 'X':
2127     Phonize('S');
2128     w_idx++;
2129     break;
2130     /* Vowels are kept */
2131     /* We did A already
2132        case 'A':
2133        case 'a':
2134      */
2135   case 'E':
2136   case 'I':
2137   case 'O':
2138   case 'U':
2139     Phonize(Curr_Letter);
2140     w_idx++;
2141     break;
2142   default:
2143     /* do nothing */
2144     break;
2145   }
2146
2147   /* On to the metaphoning */
2148   for (; Curr_Letter != '\0' &&
2149          (max_phonemes == 0 || Phone_Len < max_phonemes);
2150        w_idx++) {
2151     /* How many letters to skip because an earlier encoding handled
2152      * multiple letters */
2153     unsigned short int skip_letter = 0;
2154
2155
2156     /* THOUGHT:  It would be nice if, rather than having things like...
2157      * well, SCI.  For SCI you encode the S, then have to remember
2158      * to skip the C.  So the phoneme SCI invades both S and C.  It would
2159      * be better, IMHO, to skip the C from the S part of the encoding.
2160      * Hell, I'm trying it.
2161      */
2162
2163     /* Ignore non-alphas */
2164     if (!isalpha(Curr_Letter))
2165       continue;
2166
2167     /* Drop duplicates, except CC */
2168     if (Curr_Letter == Prev_Letter &&
2169       Curr_Letter != 'C')
2170       continue;
2171
2172     switch (Curr_Letter) {
2173       /* B -> B unless in MB */
2174     case 'B':
2175       if (Prev_Letter != 'M')
2176         Phonize('B');
2177       break;
2178       /* 'sh' if -CIA- or -CH, but not SCH, except SCHW.
2179        * (SCHW is handled in S)
2180        *  S if -CI-, -CE- or -CY-
2181        *  dropped if -SCI-, SCE-, -SCY- (handled in S)
2182        *  else K
2183        */
2184     case 'C':
2185       if (MAKESOFT(Next_Letter)) {  /* C[IEY] */
2186         if (After_Next_Letter == 'A' &&
2187           Next_Letter == 'I') {  /* CIA */
2188           Phonize(SH);
2189         }
2190         /* SC[IEY] */
2191         else if (Prev_Letter == 'S') {
2192           /* Dropped */
2193         } else {
2194           Phonize('S');
2195         }
2196       } else if (Next_Letter == 'H') {
2197         if ((!traditional) && (After_Next_Letter == 'R' ||
2198                                Prev_Letter == 'S')) {  /* Christ, School */
2199           Phonize('K');
2200         } else {
2201           Phonize(SH);
2202         }
2203         skip_letter++;
2204       } else {
2205         Phonize('K');
2206       }
2207       break;
2208       /* J if in -DGE-, -DGI- or -DGY-
2209        * else T
2210        */
2211     case 'D':
2212       if (Next_Letter == 'G' && MAKESOFT(After_Next_Letter)) {
2213         Phonize('J');
2214         skip_letter++;
2215       } else
2216         Phonize('T');
2217       break;
2218       /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH
2219        * else dropped if -GNED, -GN,
2220        * else dropped if -DGE-, -DGI- or -DGY- (handled in D)
2221        * else J if in -GE-, -GI, -GY and not GG
2222        * else K
2223        */
2224     case 'G':
2225       if (Next_Letter == 'H') {
2226         if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) {
2227           Phonize('F');
2228           skip_letter++;
2229         } else {
2230           /* silent */
2231         }
2232       } else if (Next_Letter == 'N') {
2233         if (Isbreak(After_Next_Letter) ||
2234             (After_Next_Letter == 'E' && Look_Ahead_Letter(3) == 'D')) {
2235           /* dropped */
2236         } else
2237           Phonize('K');
2238       } else if (MAKESOFT(Next_Letter) && Prev_Letter != 'G') {
2239         Phonize('J');
2240       } else {
2241         Phonize('K');
2242       }
2243       break;
2244       /* H if before a vowel and not after C,G,P,S,T */
2245     case 'H':
2246       if (isvowel(Next_Letter) && !AFFECTH(Prev_Letter))
2247         Phonize('H');
2248       break;
2249       /* dropped if after C
2250        * else K
2251        */
2252     case 'K':
2253       if (Prev_Letter != 'C')
2254         Phonize('K');
2255       break;
2256       /* F if before H
2257        * else P
2258        */
2259     case 'P':
2260       if (Next_Letter == 'H') {
2261         Phonize('F');
2262       } else {
2263         Phonize('P');
2264       }
2265       break;
2266       /* K
2267        */
2268     case 'Q':
2269       Phonize('K');
2270       break;
2271       /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW-
2272        * else S
2273        */
2274     case 'S':
2275       if (Next_Letter == 'I' &&
2276           (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
2277         Phonize(SH);
2278       } else if (Next_Letter == 'H') {
2279         Phonize(SH);
2280         skip_letter++;
2281       } else if ((!traditional) &&
2282                  (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' &&
2283                   Look_Ahead_Letter(3) == 'W')) {
2284         Phonize(SH);
2285         skip_letter += 2;
2286       } else {
2287         Phonize('S');
2288       }
2289       break;
2290       /* 'sh' in -TIA- or -TIO-
2291        * else 'th' before H
2292        * else T
2293        */
2294     case 'T':
2295       if (Next_Letter == 'I' &&
2296         (After_Next_Letter == 'O' || After_Next_Letter == 'A')) {
2297         Phonize(SH);
2298       } else if (Next_Letter == 'H') {
2299         Phonize(TH);
2300         skip_letter++;
2301       } else {
2302         Phonize('T');
2303       }
2304       break;
2305       /* F */
2306     case 'V':
2307       Phonize('F');
2308       break;
2309       /* W before a vowel, else dropped */
2310     case 'W':
2311       if (isvowel(Next_Letter))
2312         Phonize('W');
2313       break;
2314       /* KS */
2315     case 'X':
2316       Phonize('K');
2317       Phonize('S');
2318       break;
2319       /* Y if followed by a vowel */
2320     case 'Y':
2321       if (isvowel(Next_Letter))
2322         Phonize('Y');
2323       break;
2324       /* S */
2325     case 'Z':
2326       Phonize('S');
2327       break;
2328       /* No transformation */
2329     case 'F':
2330     case 'J':
2331     case 'L':
2332     case 'M':
2333     case 'N':
2334     case 'R':
2335       Phonize(Curr_Letter);
2336       break;
2337     default:
2338       /* nothing */
2339       break;
2340     } /* END SWITCH */
2341
2342     w_idx += skip_letter;
2343   } /* END FOR */
2344
2345   return buffer.detach();
2346 }
2347
2348 ///////////////////////////////////////////////////////////////////////////////
2349 // Cyrillic
2350
2351 /**
2352  * Code tables for the supported Cyrillic charsets, expressed relative to
2353  * koi8-r. Each table has 512 entries, and the first 128 entries of each
2354  * half are the identity mapping. As used by string_convert_cyrillic_string(),
2355  * the first 256 entries convert from the table's charset to koi8-r, and the
2356  * second 256 entries convert back from koi8-r to the table's charset.
2357  * Here we have the following tables:
2358  * _cyr_win1251   - for windows-1251 charset
2359  * _cyr_iso88595  - for iso8859-5 charset
2360  * _cyr_cp866     - for x-cp866 charset
2361  * _cyr_mac       - for x-mac-cyrillic charset
2362  */
2363 typedef unsigned char _cyr_charset_table[512];
2364
/*
 * windows-1251 code table (label 'W' in string_convert_cyrillic_string()).
 * Entries 0-255 are looked up with a windows-1251 byte and give the koi8-r
 * byte; entries 256-511 are looked up with (koi8-r byte + 256) and give the
 * windows-1251 byte. Bytes 0-127 map to themselves in both halves.
 */
2365 static const _cyr_charset_table _cyr_win1251 = {
2366   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,  /* 0-255: to koi8-r */
2367   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2368   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2369   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2370   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2371   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2372   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2373   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2374   46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2375   46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
2376   154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
2377   46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
2378   225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2379   242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2380   193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2381   210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2382   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,  /* 256-511: from koi8-r */
2383   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2384   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2385   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2386   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2387   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2388   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2389   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2390   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2391   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2392   32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
2393   32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
2394   254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2395   239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2396   222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
2397   207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
2398 };
2399
/*
 * x-cp866 code table (labels 'A' and 'D' in
 * string_convert_cyrillic_string()).
 * Entries 0-255 are looked up with a cp866 byte and give the koi8-r byte;
 * entries 256-511 are looked up with (koi8-r byte + 256) and give the cp866
 * byte. Bytes 0-127 map to themselves in both halves.
 */
2400 static const _cyr_charset_table _cyr_cp866 = {
2401   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,  /* 0-255: to koi8-r */
2402   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2403   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2404   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2405   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2406   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2407   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2408   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2409   225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2410   242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2411   193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2412   35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
2413   43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
2414   45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
2415   210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2416   179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
2417   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,  /* 256-511: from koi8-r */
2418   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2419   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2420   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2421   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2422   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2423   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2424   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2425   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2426   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2427   205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
2428   199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
2429   238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
2430   175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
2431   158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2432   143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2433 };
2434
/*
 * iso8859-5 code table (label 'I' in string_convert_cyrillic_string()).
 * Entries 0-255 are looked up with an iso8859-5 byte and give the koi8-r
 * byte; entries 256-511 are looked up with (koi8-r byte + 256) and give the
 * iso8859-5 byte. Bytes 0-127 map to themselves in both halves.
 */
2435 static const _cyr_charset_table _cyr_iso88595 = {
2436   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,  /* 0-255: to koi8-r */
2437   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2438   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2439   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2440   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2441   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2442   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2443   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2444   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2445   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2446   32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2447   225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2448   242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2449   193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2450   210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
2451   32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2452   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,  /* 256-511: from koi8-r */
2453   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2454   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2455   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2456   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2457   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2458   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2459   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2460   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2461   32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
2462   32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
2463   32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
2464   238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
2465   223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
2466   206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
2467   191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
2468 };
2469
/*
 * x-mac-cyrillic code table (label 'M' in
 * string_convert_cyrillic_string()).
 * Entries 0-255 are looked up with an x-mac-cyrillic byte and give the koi8-r
 * byte; entries 256-511 are looked up with (koi8-r byte + 256) and give the
 * x-mac-cyrillic byte. Bytes 0-127 map to themselves in both halves.
 */
2470 static const _cyr_charset_table _cyr_mac = {
2471   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,  /* 0-255: to koi8-r */
2472   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2473   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2474   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2475   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2476   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2477   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2478   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2479   225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
2480   242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
2481   160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2482   176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2483   128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2484   144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
2485   193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
2486   210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
2487   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,  /* 256-511: from koi8-r */
2488   16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
2489   32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
2490   48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
2491   64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
2492   80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
2493   96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
2494   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2495   192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2496   208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2497   160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
2498   176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
2499   254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
2500   239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
2501   158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
2502   143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
2503 };
2504
2505 /**
2506  * This is the function that performs real in-place conversion of the string
2507  * between charsets.
2508  * Parameters:
2509  *    str - string to be converted
2510  *    from,to - one-symbol label of source and destination charset
2511  * The following symbols are used as labels:
2512  *    k - koi8-r
2513  *    w - windows-1251
2514  *    i - iso8859-5
2515  *    a - x-cp866
2516  *    d - x-cp866
2517  *    m - x-mac-cyrillic
2518  */
2519 String string_convert_cyrillic_string(const String& input, char from, char to) {
2520   const unsigned char *from_table, *to_table;
2521   unsigned char tmp;
2522   const unsigned char *uinput = (unsigned char *)input.slice().ptr;
2523   String retString(input.size(), ReserveString);
2524   unsigned char *str = (unsigned char *)retString.mutableData();
2525
2526   from_table = nullptr;
2527   to_table   = nullptr;
2528
2529   switch (toupper((int)(unsigned char)from)) {
2530   case 'W': from_table = _cyr_win1251;  break;
2531   case 'A':
2532   case 'D': from_table = _cyr_cp866;    break;
2533   case 'I': from_table = _cyr_iso88595; break;
2534   case 'M': from_table = _cyr_mac;      break;
2535   case 'K':
2536     break;
2537   default:
2538     throw_invalid_argument("Unknown source charset: %c", from);
2539     break;
2540   }
2541
2542   switch (toupper((int)(unsigned char)to)) {
2543   case 'W': to_table = _cyr_win1251;    break;
2544   case 'A':
2545   case 'D': to_table = _cyr_cp866;      break;
2546   case 'I': to_table = _cyr_iso88595;   break;
2547   case 'M': to_table = _cyr_mac;        break;
2548   case 'K':
2549     break;
2550   default:
2551     throw_invalid_argument("Unknown destination charset: %c", to);
2552     break;
2553   }
2554
2555   for (int i = 0; i < input.size(); i++) {
2556     tmp = from_table == nullptr ? uinput[i] : from_table[uinput[i]];
2557     str[i] = to_table == nullptr ? tmp : to_table[tmp + 256];
2558   }
2559   retString.setSize(input.size());
2560   return retString;
2561 }
2562
2563 ///////////////////////////////////////////////////////////////////////////////
2564 // Hebrew
2565
2566 #define HEB_BLOCK_TYPE_ENG 1
2567 #define HEB_BLOCK_TYPE_HEB 2
2568
2569 #define isheb(c)                                                        \
2570   (((((unsigned char) c) >= 224) && (((unsigned char) c) <= 250)) ? 1 : 0)
2571 #define _isblank(c)                                                     \
2572   (((((unsigned char) c) == ' '  || ((unsigned char) c) == '\t')) ? 1 : 0)
2573 #define _isnewline(c)                                                   \
2574   (((((unsigned char) c) == '\n' || ((unsigned char) c) == '\r')) ? 1 : 0)
2575
2576 /**
2577  * Converts Logical Hebrew text (Hebrew Windows style) to Visual text
2578  * Cheers/complaints/flames - Zeev Suraski <zeev@php.net>
2579  */
2580 String string_convert_hebrew_string(const String& inStr,
2581                                     int max_chars_per_line,
2582                                     int convert_newlines) {
2583   assert(!inStr.empty());
2584   auto str = inStr.data();
2585   auto str_len = inStr.size();
2586   const char *tmp;
2587   char *heb_str, *broken_str;
2588   char *target;
2589   int block_start, block_end, block_type, block_length, i;
2590   long max_chars=0;
2591   int begin, end, char_count, orig_begin;
2592
2593   tmp = str;
2594   block_start=block_end=0;
2595
2596   heb_str = (char *) req::malloc(str_len + 1);
2597   SCOPE_EXIT { req::free(heb_str); };
2598   target = heb_str+str_len;
2599   *target = 0;
2600   target--;
2601
2602   block_length=0;
2603
2604   if (isheb(*tmp)) {
2605     block_type = HEB_BLOCK_TYPE_HEB;
2606   } else {
2607     block_type = HEB_BLOCK_TYPE_ENG;
2608   }
2609
2610   do {
2611     if (block_type == HEB_BLOCK_TYPE_HEB) {
2612       while ((isheb((int)*(tmp+1)) ||
2613               _isblank((int)*(tmp+1)) ||
2614               ispunct((int)*(tmp+1)) ||
2615               (int)*(tmp+1)=='\n' ) && block_end<str_len-1) {
2616         tmp++;
2617         block_end++;
2618         block_length++;
2619       }
2620       for (i = block_start; i<= block_end; i++) {
2621         *target = str[i];
2622         switch (*target) {
2623         case '(':  *target = ')';  break;
2624         case ')':  *target = '(';  break;
2625         case '[':  *target = ']';  break;
2626         case ']':  *target = '[';  break;
2627         case '{':  *target = '}';  break;
2628         case '}':  *target = '{';  break;
2629         case '<':  *target = '>';  break;
2630         case '>':  *target = '<';  break;
2631         case '\\': *target = '/';  break;
2632         case '/':  *target = '\\'; break;
2633         default:
2634           break;
2635         }
2636         target--;
2637       }
2638       block_type = HEB_BLOCK_TYPE_ENG;
2639     } else {
2640       while (!isheb(*(tmp+1)) &&
2641              (int)*(tmp+1)!='\n' && block_end < str_len-1) {
2642         tmp++;
2643         block_end++;
2644         block_length++;
2645       }
2646       while ((_isblank((int)*tmp) ||
2647               ispunct((int)*tmp)) && *tmp!='/' &&
2648              *tmp!='-' && block_end > block_start) {
2649         tmp--;
2650         block_end--;
2651       }
2652       for (i = block_end; i >= block_start; i--) {
2653         *target = str[i];
2654         target--;
2655       }
2656       block_type = HEB_BLOCK_TYPE_HEB;
2657     }
2658     block_start=block_end+1;
2659   } while (block_end < str_len-1);
2660
2661   String brokenStr(str_len, ReserveString);
2662   broken_str = brokenStr.mutableData();
2663   begin=end=str_len-1;
2664   target = broken_str;
2665
2666   while (1) {
2667     char_count=0;
2668     while ((!max_chars || char_count < max_chars) && begin > 0) {
2669       char_count++;
2670       begin--;
2671       if (begin <= 0 || _isnewline(heb_str[begin])) {
2672         while (begin > 0 && _isnewline(heb_str[begin-1])) {
2673           begin--;
2674           char_count++;
2675         }
2676         break;
2677       }
2678     }
2679     if (char_count == max_chars) { /* try to avoid breaking words */
2680       int new_char_count=char_count, new_begin=begin;
2681
2682       while (new_char_count > 0) {
2683         if (_isblank(heb_str[new_begin]) || _isnewline(heb_str[new_begin])) {
2684           break;
2685         }
2686         new_begin++;
2687         new_char_count--;
2688       }
2689       if (new_char_count > 0) {
2690         char_count=new_char_count;
2691         begin=new_begin;
2692       }
2693     }
2694     orig_begin=begin;
2695
2696     if (_isblank(heb_str[begin])) {
2697       heb_str[begin]='\n';
2698     }
2699     while (begin <= end && _isnewline(heb_str[begin])) {
2700       /* skip leading newlines */
2701       begin++;
2702     }
2703     for (i = begin; i <= end; i++) { /* copy content */
2704       *target = heb_str[i];
2705       target++;
2706     }
2707     for (i = orig_begin; i <= end && _isnewline(heb_str[i]); i++) {
2708       *target = heb_str[i];
2709       target++;
2710     }
2711     begin=orig_begin;
2712
2713     if (begin <= 0) {
2714       *target = 0;
2715       break;
2716     }
2717     begin--;
2718     end=begin;
2719   }
2720
2721   if (convert_newlines) {
2722     int count;
2723     auto ret = string_replace(broken_str, str_len, "\n", strlen("\n"),
2724                               "<br />\n", strlen("<br />\n"), count, true);
2725     if (!ret.isNull()) {
2726       return ret;
2727     }
2728   }
2729   brokenStr.setSize(str_len);
2730   return brokenStr;
2731 }
2732
2733 #if defined(__APPLE__)
2734
/**
 * Fallback implementation of memrchr(): scans the first n bytes of s from
 * the end towards the start.
 *
 * @param s  buffer to search
 * @param c  byte to look for; converted to unsigned char before comparing
 * @param n  number of bytes of s to examine
 * @return   pointer to the last byte equal to c, or nullptr if there is none
 *           (always nullptr when n is 0)
 */
void *memrchr(const void *s, int c, size_t n) {
  const unsigned char *bytes = (const unsigned char *)s;
  const unsigned char wanted = (unsigned char)c;
  // Count down with an index instead of a pointer so that no pointer before
  // the start of the buffer is ever formed, and compare as unsigned char so
  // that bytes >= 0x80 match regardless of the signedness of plain char.
  for (size_t remaining = n; remaining > 0; --remaining) {
    if (bytes[remaining - 1] == wanted) {
      return (void *)(bytes + (remaining - 1));
    }
  }
  return nullptr;
}
2741
2742 #endif
2743
2744 ///////////////////////////////////////////////////////////////////////////////
2745 }