string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/internal/config.h"
  15
  16 #include <ctype.h>
  17 #include <errno.h>
  18 #include <math.h>
  19
  20 #ifdef HAVE_UNISTD_H
  21 # include <unistd.h>
  22 #endif
  23
  24 #include "debug_counter.h"
  25 #include "encindex.h"
  26 #include "gc.h"
  27 #include "id.h"
  28 #include "internal.h"
  29 #include "internal/array.h"
  30 #include "internal/compar.h"
  31 #include "internal/compilers.h"
  32 #include "internal/encoding.h"
  33 #include "internal/error.h"
  34 #include "internal/gc.h"
  35 #include "internal/numeric.h"
  36 #include "internal/object.h"
  37 #include "internal/proc.h"
  38 #include "internal/re.h"
  39 #include "internal/sanitizers.h"
  40 #include "internal/string.h"
  41 #include "internal/transcode.h"
  42 #include "probes.h"
  43 #include "ruby/encoding.h"
  44 #include "ruby/re.h"
  45 #include "ruby/util.h"
  46 #include "ruby_assert.h"
  47 #include "vm_sync.h"
  48
  49 #if defined HAVE_CRYPT_R
  50 # if defined HAVE_CRYPT_H
  51 #  include <crypt.h>
  52 # endif
  53 #elif !defined HAVE_CRYPT
  54 # include "missing/crypt.h"
  55 # define HAVE_CRYPT_R 1
  56 #endif
  57
/* Shorthand for the `beg` / `end` array entries at index `no` of a match
 * register object.  Both macros expand against a variable literally named
 * `regs`, which must be in scope wherever they are used. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
  60
  61 #undef rb_str_new
  62 #undef rb_usascii_str_new
  63 #undef rb_utf8_str_new
  64 #undef rb_enc_str_new
  65 #undef rb_str_new_cstr
  66 #undef rb_tainted_str_new_cstr
  67 #undef rb_usascii_str_new_cstr
  68 #undef rb_utf8_str_new_cstr
  69 #undef rb_enc_str_new_cstr
  70 #undef rb_external_str_new_cstr
  71 #undef rb_locale_str_new_cstr
  72 #undef rb_str_dup_frozen
  73 #undef rb_str_buf_new_cstr
  74 #undef rb_str_buf_cat
  75 #undef rb_str_buf_cat2
  76 #undef rb_str_cat2
  77 #undef rb_str_cat_cstr
  78 #undef rb_fstring_cstr
  79
/* Class handle passed as `klass` to the string allocators in this file. */
VALUE rb_cString;
/* NOTE(review): presumably the Symbol class handle; it is not assigned or
 * used in this part of the file -- confirm against the initialisation code. */
VALUE rb_cSymbol;
  82
  83 /* FLAGS of RString
  84  *
  85  * 1:     RSTRING_NOEMBED
  86  * 2:     STR_SHARED (== ELTS_SHARED)
  87  * 2-6:   RSTRING_EMBED_LEN (5 bits == 32)
  88  * 5:     STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
  89  *                         other strings that rely on this string's buffer)
  90  * 6:     STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
  91  *                      early, specific to rb_str_tmp_frozen_{acquire,release})
  92  * 7:     STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
  93  *                     such as read(2). Any modification and realloc is prohibited)
  94  *
  95  * 8-9:   ENC_CODERANGE (2 bits)
  96  * 10-16: ENCODING (7 bits == 128)
  97  * 17:    RSTRING_FSTR
  98  * 18:    STR_NOFREE (do not free this string's buffer when a String is freed.
  99  *                    used for a string object based on C string literal)
 100  * 19:    STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
 101  *                     object header is temporarily allocated on C stack)
 102  */
 103
/* NOTE(review): RUBY_MAX_CHAR_LEN is not used in this part of the file;
 * presumably an upper bound on the byte length of one character -- confirm. */
#define RUBY_MAX_CHAR_LEN 16
/* Private RString flag bits.  The bit positions and meanings are listed in
 * the "FLAGS of RString" table above. */
#define STR_SHARED_ROOT FL_USER5
#define STR_BORROWED FL_USER6
#define STR_TMPLOCK FL_USER7
#define STR_NOFREE FL_USER18
#define STR_FAKESTR FL_USER19
 110
/*
 * STR_SET_NOEMBED(str): marks `str` as a heap (non-embedded) string.
 * With USE_RVARGC the shared / shared-root / borrowed flags are cleared;
 * otherwise the embed-length bits are zeroed instead (per the flag table
 * above, those bits overlap the shared / root / borrowed flags).
 */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    if (USE_RVARGC) {\
        FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), 0);\
    }\
} while (0)
/* STR_SET_EMBED(str): clears the NOEMBED and NOFREE flags of `str`. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/*
 * STR_SET_EMBED_LEN(str, n): records `n` as the length of an embedded string.
 * With USE_RVARGC the length lives in as.embed.len and `n` is asserted to be
 * strictly smaller than the embed capacity; otherwise it is packed into the
 * flag word.  Note that `str` is evaluated more than once.
 */
#if USE_RVARGC
# define STR_SET_EMBED_LEN(str, n) do { \
    assert(str_embed_capa(str) > (n));\
    RSTRING(str)->as.embed.len = (n);\
} while (0)
#else
# define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
#endif
 133
/*
 * STR_SET_LEN(str, n): stores `n` as the length of `str`, using the embedded
 * representation when STR_EMBED_P(str) holds and as.heap.len otherwise.
 * `str` is evaluated more than once.
 */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/*
 * STR_DEC_LEN(str): decrements the length of `str` by one.  No check is made
 * here that the current length is positive.
 */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
 153
/* TERM_LEN(str): terminator width in bytes, taken as the minimum character
 * length of the encoding of `str`. */
#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/*
 * TERM_FILL(ptr, termlen): writes `termlen` zero bytes at `ptr`.  The first
 * byte is always stored directly; memset is only invoked for the (unlikely)
 * case of a terminator wider than one byte.  Both arguments are evaluated
 * exactly once.
 */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
 162
 163 #define RESIZE_CAPA(str,capacity) do {\
 164     const int termlen = TERM_LEN(str);\
 165     RESIZE_CAPA_TERM(str,capacity,termlen);\
 166 } while (0)
 167 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
 168     if (STR_EMBED_P(str)) {\
 169         if (str_embed_capa(str) < capacity + termlen) {\
 170             char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
 171             const long tlen = RSTRING_LEN(str);\
 172             memcpy(tmp, RSTRING_PTR(str), tlen);\
 173             RSTRING(str)->as.heap.ptr = tmp;\
 174             RSTRING(str)->as.heap.len = tlen;\
 175             STR_SET_NOEMBED(str);\
 176             RSTRING(str)->as.heap.aux.capa = (capacity);\
 177         }\
 178     }\
 179     else {\
 180         assert(!FL_TEST((str), STR_SHARED)); \
 181         SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
 182                         (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
 183         RSTRING(str)->as.heap.aux.capa = (capacity);\
 184     }\
 185 } while (0)
 186
/*
 * STR_SET_SHARED(str, shared_str): makes `str` reference `shared_str` as the
 * owner of its buffer.  Nothing is done for fake strings (STR_FAKESTR).
 *
 * The assertions require the data pointer of `str` to lie within
 * [start of shared_str, end of shared_str].  The reference is stored through
 * RB_OBJ_WRITE, `str` gets STR_SHARED and `shared_str` gets STR_SHARED_ROOT.
 * When `shared_str` has no class (class == 0) it is additionally flagged
 * STR_BORROWED.
 */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
 198
/* Heap buffer pointer of `str` (only meaningful for non-embedded strings). */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: recorded capacity plus the terminator width. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

/* Encoding of `str` as resolved by get_encoding(). */
#define STR_ENC_GET(str) get_encoding(str)

/*
 * SHARABLE_SUBSTRING_P(beg, len, end): whether a substring may share its
 * parent's buffer.  Unless SHARABLE_MIDDLE_SUBSTRING is enabled (default 0),
 * only substrings that run up to `end` qualify.
 */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if !SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#endif
 213
 214
 215 static inline long
 216 str_embed_capa(VALUE str)
 217 {
 218 #if USE_RVARGC
 219     return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
 220 #else
 221     return RSTRING_EMBED_LEN_MAX + 1;
 222 #endif
 223 }
 224
 225 static inline size_t
 226 str_embed_size(long capa)
 227 {
 228     return offsetof(struct RString, as.embed.ary) + capa;
 229 }
 230
 231 static inline bool
 232 STR_EMBEDDABLE_P(long len, long termlen)
 233 {
 234 #if USE_RVARGC
 235     return rb_gc_size_allocatable_p(str_embed_size(len + termlen));
 236 #else
 237     return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
 238 #endif
 239 }
 240
 241 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
 242 static VALUE str_new_frozen(VALUE klass, VALUE orig);
 243 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
 244 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
 245 static VALUE str_new(VALUE klass, const char *ptr, long len);
 246 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
 247 static inline void str_modifiable(VALUE str);
 248 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
 249
 250 static inline void
 251 str_make_independent(VALUE str)
 252 {
 253     long len = RSTRING_LEN(str);
 254     int termlen = TERM_LEN(str);
 255     str_make_independent_expand((str), len, 0L, termlen);
 256 }
 257
 258 static inline int str_dependent_p(VALUE str);
 259
 260 void
 261 rb_str_make_independent(VALUE str)
 262 {
 263     if (str_dependent_p(str)) {
 264         str_make_independent(str);
 265     }
 266 }
 267
 268 void
 269 rb_debug_rstring_null_ptr(const char *func)
 270 {
 271     fprintf(stderr, "%s is returning NULL!! "
 272             "SIGSEGV is highly expected to follow immediately. "
 273             "If you could reproduce, attach your debugger here, "
 274             "and look at the passed string.",
 275             func);
 276 }
 277
/* symbols for [up|down|swap]case/capitalize options */
/* NOTE(review): these are not assigned in this part of the file; presumably
 * they are initialised once at start-up -- confirm. */
static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
 280
 281 static rb_encoding *
 282 get_actual_encoding(const int encidx, VALUE str)
 283 {
 284     const unsigned char *q;
 285
 286     switch (encidx) {
 287       case ENCINDEX_UTF_16:
 288         if (RSTRING_LEN(str) < 2) break;
 289         q = (const unsigned char *)RSTRING_PTR(str);
 290         if (q[0] == 0xFE && q[1] == 0xFF) {
 291             return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
 292         }
 293         if (q[0] == 0xFF && q[1] == 0xFE) {
 294             return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
 295         }
 296         return rb_ascii8bit_encoding();
 297       case ENCINDEX_UTF_32:
 298         if (RSTRING_LEN(str) < 4) break;
 299         q = (const unsigned char *)RSTRING_PTR(str);
 300         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
 301             return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
 302         }
 303         if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
 304             return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
 305         }
 306         return rb_ascii8bit_encoding();
 307     }
 308     return rb_enc_from_index(encidx);
 309 }
 310
 311 static rb_encoding *
 312 get_encoding(VALUE str)
 313 {
 314     return get_actual_encoding(ENCODING_GET(str), str);
 315 }
 316
 317 static void
 318 mustnot_broken(VALUE str)
 319 {
 320     if (is_broken_string(str)) {
 321         rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
 322     }
 323 }
 324
 325 static void
 326 mustnot_wchar(VALUE str)
 327 {
 328     rb_encoding *enc = STR_ENC_GET(str);
 329     if (rb_enc_mbminlen(enc) > 1) {
 330         rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
 331     }
 332 }
 333
 334 static int fstring_cmp(VALUE a, VALUE b);
 335
 336 static VALUE register_fstring(VALUE str, bool copy);
 337
/* st hash type pairing fstring_cmp (equality on length, encoding index and
 * bytes) with rb_str_hash.  NOTE(review): presumably the type of the table
 * returned by rb_vm_fstring_table() -- confirm where the table is created. */
const struct st_hash_type rb_fstring_hash_type = {
    fstring_cmp,
    rb_str_hash,
};
 342
/* True when `str` has no FL_EXIVAR flag and its class is exactly rb_cString.
 * `str` is evaluated twice. */
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
 344
/* In/out argument handed to fstr_update_callback through st_update(). */
struct fstr_update_arg {
    /* out: the table's string for the key, or Qundef when a garbage entry
     * was deleted and the caller has to retry */
    VALUE fstr;
    /* in: for fake strings, copy the bytes into a new string instead of
     * referencing them as a static buffer */
    bool copy;
};
 349
 350 static int
 351 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
 352 {
 353
 354     struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
 355     VALUE str = (VALUE)*key;
 356
 357     if (existing) {
 358         /* because of lazy sweep, str may be unmarked already and swept
 359          * at next time */
 360
 361         if (rb_objspace_garbage_object_p(str)) {
 362             arg->fstr = Qundef;
 363             return ST_DELETE;
 364         }
 365
 366         arg->fstr = str;
 367         return ST_STOP;
 368     }
 369     else {
 370         if (FL_TEST_RAW(str, STR_FAKESTR)) {
 371             if (arg->copy) {
 372                 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
 373                 rb_enc_copy(new_str, str);
 374                 str = new_str;
 375             }
 376             else {
 377                 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
 378                                      RSTRING(str)->as.heap.len,
 379                                      ENCODING_GET(str));
 380             }
 381             OBJ_FREEZE_RAW(str);
 382         }
 383         else {
 384             if (!OBJ_FROZEN(str))
 385                 str = str_new_frozen(rb_cString, str);
 386             if (STR_SHARED_P(str)) { /* str should not be shared */
 387                 /* shared substring  */
 388                 str_make_independent(str);
 389                 assert(OBJ_FROZEN(str));
 390             }
 391             if (!BARE_STRING_P(str)) {
 392                 str = str_new_frozen(rb_cString, str);
 393             }
 394         }
 395         RBASIC(str)->flags |= RSTRING_FSTR;
 396
 397         *key = *value = arg->fstr = str;
 398         return ST_CONTINUE;
 399     }
 400 }
 401
 402 RUBY_FUNC_EXPORTED
 403 VALUE
 404 rb_fstring(VALUE str)
 405 {
 406     VALUE fstr;
 407     int bare;
 408
 409     Check_Type(str, T_STRING);
 410
 411     if (FL_TEST(str, RSTRING_FSTR))
 412         return str;
 413
 414     bare = BARE_STRING_P(str);
 415     if (!bare) {
 416         if (STR_EMBED_P(str)) {
 417             OBJ_FREEZE_RAW(str);
 418             return str;
 419         }
 420         if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
 421             assert(OBJ_FROZEN(str));
 422             return str;
 423         }
 424     }
 425
 426     if (!OBJ_FROZEN(str))
 427         rb_str_resize(str, RSTRING_LEN(str));
 428
 429     fstr = register_fstring(str, FALSE);
 430
 431     if (!bare) {
 432         str_replace_shared_without_enc(str, fstr);
 433         OBJ_FREEZE_RAW(str);
 434         return str;
 435     }
 436     return fstr;
 437 }
 438
 439 static VALUE
 440 register_fstring(VALUE str, bool copy)
 441 {
 442     struct fstr_update_arg args;
 443     args.copy = copy;
 444
 445     RB_VM_LOCK_ENTER();
 446     {
 447         st_table *frozen_strings = rb_vm_fstring_table();
 448         do {
 449             args.fstr = str;
 450             st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
 451         } while (args.fstr == Qundef);
 452     }
 453     RB_VM_LOCK_LEAVE();
 454
 455     assert(OBJ_FROZEN(args.fstr));
 456     assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
 457     assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
 458     assert(RBASIC_CLASS(args.fstr) == rb_cString);
 459     return args.fstr;
 460 }
 461
 462 static VALUE
 463 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
 464 {
 465     fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
 466     /* SHARED to be allocated by the callback */
 467
 468     if (!name) {
 469         RUBY_ASSERT_ALWAYS(len == 0);
 470         name = "";
 471     }
 472
 473     ENCODING_SET_INLINED((VALUE)fake_str, encidx);
 474
 475     RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
 476     fake_str->as.heap.len = len;
 477     fake_str->as.heap.ptr = (char *)name;
 478     fake_str->as.heap.aux.capa = len;
 479     return (VALUE)fake_str;
 480 }
 481
 482 /*
 483  * set up a fake string which refers a static string literal.
 484  */
 485 VALUE
 486 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
 487 {
 488     return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
 489 }
 490
 491 /*
 492  * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
 493  * shared string which refers a static string literal.  `ptr` must
 494  * point a constant string.
 495  */
 496 MJIT_FUNC_EXPORTED VALUE
 497 rb_fstring_new(const char *ptr, long len)
 498 {
 499     struct RString fake_str;
 500     return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
 501 }
 502
 503 VALUE
 504 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
 505 {
 506     struct RString fake_str;
 507     return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
 508 }
 509
 510 VALUE
 511 rb_fstring_cstr(const char *ptr)
 512 {
 513     return rb_fstring_new(ptr, strlen(ptr));
 514 }
 515
 516 static int
 517 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
 518 {
 519     RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
 520     return ST_CONTINUE;
 521 }
 522
 523 static int
 524 fstring_cmp(VALUE a, VALUE b)
 525 {
 526     long alen, blen;
 527     const char *aptr, *bptr;
 528     RSTRING_GETMEM(a, aptr, alen);
 529     RSTRING_GETMEM(b, bptr, blen);
 530     return (alen != blen ||
 531             ENCODING_GET(a) != ENCODING_GET(b) ||
 532             memcmp(aptr, bptr, alen) != 0);
 533 }
 534
 535 static inline int
 536 single_byte_optimizable(VALUE str)
 537 {
 538     rb_encoding *enc;
 539
 540     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
 541     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
 542         return 1;
 543
 544     enc = STR_ENC_GET(str);
 545     if (rb_enc_mbmaxlen(enc) == 1)
 546         return 1;
 547
 548     /* Conservative.  Possibly single byte.
 549      * "\xa1" in Shift_JIS for example. */
 550     return 0;
 551 }
 552
/* NOTE(review): not used in this part of the file; presumably the default
 * field separator used when splitting strings -- confirm. */
VALUE rb_fs;
 554
/*
 * Returns a pointer to the first byte in [p, e) that has its high bit (0x80)
 * set, or NULL when every byte in the range is below 0x80.
 *
 * The bulk of the range is scanned one machine word at a time by masking
 * each word with NONASCII_MASK (0x80 repeated in every byte).  Bytes before
 * the first aligned word and after the last full word are checked
 * individually by the two fall-through switch statements.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
    const uintptr_t *s, *t;

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
# if SIZEOF_UINTPTR_T == 8
#  define NONASCII_MASK UINT64_C(0x8080808080808080)
# elif SIZEOF_UINTPTR_T == 4
#  define NONASCII_MASK UINT32_C(0x80808080)
# else
#  error "don't know what to do."
# endif
#else
# if SIZEOF_UINTPTR_T == 8
#  define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
# elif SIZEOF_UINTPTR_T == 4
#  define NONASCII_MASK 0x80808080UL /* or...? */
# else
#  error "don't know what to do."
# endif
#endif

    /* Word-wise scan is only attempted when at least one full word is
     * available, or when unaligned word loads are allowed. */
    if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
#if !UNALIGNED_WORD_ACCESS
        if ((uintptr_t)p % SIZEOF_VOIDP) {
            /* l = number of bytes up to the next word boundary.  p is
             * advanced first, then the skipped bytes are tested through
             * negative offsets; the cases fall through on purpose. */
            int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
            p += l;
            switch (l) {
              default: UNREACHABLE;
#if SIZEOF_VOIDP > 4
              case 7: if (p[-7]&0x80) return p-7;
              case 6: if (p[-6]&0x80) return p-6;
              case 5: if (p[-5]&0x80) return p-5;
              case 4: if (p[-4]&0x80) return p-4;
#endif
              case 3: if (p[-3]&0x80) return p-3;
              case 2: if (p[-2]&0x80) return p-2;
              case 1: if (p[-1]&0x80) return p-1;
              case 0: break;
            }
        }
#endif
#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
#define aligned_ptr(value) \
        __builtin_assume_aligned((value), sizeof(uintptr_t))
#else
#define aligned_ptr(value) (uintptr_t *)(value)
#endif
        s = aligned_ptr(p);
        /* t bounds the loop so that every word read ends at or before e */
        t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
#undef aligned_ptr
        for (;s < t; s++) {
            if (*s & NONASCII_MASK) {
                /* Convert the bit position of the first set mask bit into a
                 * byte offset (>> 3).  NOTE(review): nlz_intptr / ntz_intptr
                 * are presumably count-leading / count-trailing zero bits;
                 * they are defined outside this file. */
#ifdef WORDS_BIGENDIAN
                return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
#else
                return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
#endif
            }
        }
        p = (const char *)s;
    }

    /* Remaining tail of fewer than SIZEOF_VOIDP bytes, tested relative to e;
     * the cases fall through on purpose. */
    switch (e - p) {
      default: UNREACHABLE;
#if SIZEOF_VOIDP > 4
      case 7: if (e[-7]&0x80) return e-7;
      case 6: if (e[-6]&0x80) return e-6;
      case 5: if (e[-5]&0x80) return e-5;
      case 4: if (e[-4]&0x80) return e-4;
#endif
      case 3: if (e[-3]&0x80) return e-3;
      case 2: if (e[-2]&0x80) return e-2;
      case 1: if (e[-1]&0x80) return e-1;
      case 0: return NULL;
    }
}
 633
 634 static int
 635 coderange_scan(const char *p, long len, rb_encoding *enc)
 636 {
 637     const char *e = p + len;
 638
 639     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
 640         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 641         p = search_nonascii(p, e);
 642         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 643     }
 644
 645     if (rb_enc_asciicompat(enc)) {
 646         p = search_nonascii(p, e);
 647         if (!p) return ENC_CODERANGE_7BIT;
 648         for (;;) {
 649             int ret = rb_enc_precise_mbclen(p, e, enc);
 650             if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
 651             p += MBCLEN_CHARFOUND_LEN(ret);
 652             if (p == e) break;
 653             p = search_nonascii(p, e);
 654             if (!p) break;
 655         }
 656     }
 657     else {
 658         while (p < e) {
 659             int ret = rb_enc_precise_mbclen(p, e, enc);
 660             if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
 661             p += MBCLEN_CHARFOUND_LEN(ret);
 662         }
 663     }
 664     return ENC_CODERANGE_VALID;
 665 }
 666
 667 long
 668 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
 669 {
 670     const char *p = s;
 671
 672     if (*cr == ENC_CODERANGE_BROKEN)
 673         return e - s;
 674
 675     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
 676         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 677         if (*cr == ENC_CODERANGE_VALID) return e - s;
 678         p = search_nonascii(p, e);
 679         *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 680         return e - s;
 681     }
 682     else if (rb_enc_asciicompat(enc)) {
 683         p = search_nonascii(p, e);
 684         if (!p) {
 685             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
 686             return e - s;
 687         }
 688         for (;;) {
 689             int ret = rb_enc_precise_mbclen(p, e, enc);
 690             if (!MBCLEN_CHARFOUND_P(ret)) {
 691                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 692                 return p - s;
 693             }
 694             p += MBCLEN_CHARFOUND_LEN(ret);
 695             if (p == e) break;
 696             p = search_nonascii(p, e);
 697             if (!p) break;
 698         }
 699     }
 700     else {
 701         while (p < e) {
 702             int ret = rb_enc_precise_mbclen(p, e, enc);
 703             if (!MBCLEN_CHARFOUND_P(ret)) {
 704                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 705                 return p - s;
 706             }
 707             p += MBCLEN_CHARFOUND_LEN(ret);
 708         }
 709     }
 710     *cr = ENC_CODERANGE_VALID;
 711     return e - s;
 712 }
 713
 714 static inline void
 715 str_enc_copy(VALUE str1, VALUE str2)
 716 {
 717     rb_enc_set_index(str1, ENCODING_GET(str2));
 718 }
 719
 720 static void
 721 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 722 {
 723     /* this function is designed for copying encoding and coderange
 724      * from src to new string "dest" which is made from the part of src.
 725      */
 726     str_enc_copy(dest, src);
 727     if (RSTRING_LEN(dest) == 0) {
 728         if (!rb_enc_asciicompat(STR_ENC_GET(src)))
 729             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 730         else
 731             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 732         return;
 733     }
 734     switch (ENC_CODERANGE(src)) {
 735       case ENC_CODERANGE_7BIT:
 736         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 737         break;
 738       case ENC_CODERANGE_VALID:
 739         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
 740             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
 741             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 742         else
 743             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 744         break;
 745       default:
 746         break;
 747     }
 748 }
 749
 750 static void
 751 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 752 {
 753     str_enc_copy(dest, src);
 754     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 755 }
 756
 757 static int
 758 enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
 759 {
 760     if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
 761         rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
 762         return ENC_CODERANGE_BROKEN;
 763     }
 764     else {
 765         return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
 766     }
 767 }
 768
 769 int
 770 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
 771 {
 772     return enc_coderange_scan(str, enc, rb_enc_to_index(enc));
 773 }
 774
 775 int
 776 rb_enc_str_coderange(VALUE str)
 777 {
 778     int cr = ENC_CODERANGE(str);
 779
 780     if (cr == ENC_CODERANGE_UNKNOWN) {
 781         int encidx = ENCODING_GET(str);
 782         rb_encoding *enc = rb_enc_from_index(encidx);
 783         cr = enc_coderange_scan(str, enc, encidx);
 784         ENC_CODERANGE_SET(str, cr);
 785     }
 786     return cr;
 787 }
 788
 789 int
 790 rb_enc_str_asciionly_p(VALUE str)
 791 {
 792     rb_encoding *enc = STR_ENC_GET(str);
 793
 794     if (!rb_enc_asciicompat(enc))
 795         return FALSE;
 796     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 797         return TRUE;
 798     return FALSE;
 799 }
 800
 801 static inline void
 802 str_mod_check(VALUE s, const char *p, long len)
 803 {
 804     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
 805         rb_raise(rb_eRuntimeError, "string modified");
 806     }
 807 }
 808
 809 static size_t
 810 str_capacity(VALUE str, const int termlen)
 811 {
 812     if (STR_EMBED_P(str)) {
 813 #if USE_RVARGC
 814         return str_embed_capa(str) - termlen;
 815 #else
 816         return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
 817 #endif
 818     }
 819     else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
 820         return RSTRING(str)->as.heap.len;
 821     }
 822     else {
 823         return RSTRING(str)->as.heap.aux.capa;
 824     }
 825 }
 826
 827 size_t
 828 rb_str_capacity(VALUE str)
 829 {
 830     return str_capacity(str, TERM_LEN(str));
 831 }
 832
 833 static inline void
 834 must_not_null(const char *ptr)
 835 {
 836     if (!ptr) {
 837         rb_raise(rb_eArgError, "NULL pointer given");
 838     }
 839 }
 840
 841 static inline VALUE
 842 str_alloc(VALUE klass, size_t size)
 843 {
 844     assert(size > 0);
 845     RVARGC_NEWOBJ_OF(str, struct RString, klass,
 846                      T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
 847     return (VALUE)str;
 848 }
 849
 850 static inline VALUE
 851 str_alloc_embed(VALUE klass, size_t capa)
 852 {
 853     size_t size = str_embed_size(capa);
 854     assert(rb_gc_size_allocatable_p(size));
 855 #if !USE_RVARGC
 856     assert(size <= sizeof(struct RString));
 857 #endif
 858     return str_alloc(klass, size);
 859 }
 860
 861 static inline VALUE
 862 str_alloc_heap(VALUE klass)
 863 {
 864     return str_alloc(klass, sizeof(struct RString));
 865 }
 866
 867 static inline VALUE
 868 empty_str_alloc(VALUE klass)
 869 {
 870     RUBY_DTRACE_CREATE_HOOK(STRING, 0);
 871     VALUE str = str_alloc_embed(klass, 0);
 872     memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
 873     return str;
 874 }
 875
 876 static VALUE
 877 str_new0(VALUE klass, const char *ptr, long len, int termlen)
 878 {
 879     VALUE str;
 880
 881     if (len < 0) {
 882         rb_raise(rb_eArgError, "negative string size (or size too big)");
 883     }
 884
 885     RUBY_DTRACE_CREATE_HOOK(STRING, len);
 886
 887     if (STR_EMBEDDABLE_P(len, termlen)) {
 888         str = str_alloc_embed(klass, len + termlen);
 889         if (len == 0) {
 890             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
 891         }
 892     }
 893     else {
 894         str = str_alloc_heap(klass);
 895         RSTRING(str)->as.heap.aux.capa = len;
 896         /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
 897          * integer overflow.  If we can STATIC_ASSERT that, the following
 898          * mul_add_mul can be reverted to a simple ALLOC_N. */
 899         RSTRING(str)->as.heap.ptr =
 900             rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
 901         STR_SET_NOEMBED(str);
 902     }
 903     if (ptr) {
 904         memcpy(RSTRING_PTR(str), ptr, len);
 905     }
 906     STR_SET_LEN(str, len);
 907     TERM_FILL(RSTRING_PTR(str) + len, termlen);
 908     return str;
 909 }
 910
 911 static VALUE
 912 str_new(VALUE klass, const char *ptr, long len)
 913 {
 914     return str_new0(klass, ptr, len, 1);
 915 }
 916
 917 VALUE
 918 rb_str_new(const char *ptr, long len)
 919 {
 920     return str_new(rb_cString, ptr, len);
 921 }
 922
 923 VALUE
 924 rb_usascii_str_new(const char *ptr, long len)
 925 {
 926     VALUE str = rb_str_new(ptr, len);
 927     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 928     return str;
 929 }
 930
 931 VALUE
 932 rb_utf8_str_new(const char *ptr, long len)
 933 {
 934     VALUE str = str_new(rb_cString, ptr, len);
 935     rb_enc_associate_index(str, rb_utf8_encindex());
 936     return str;
 937 }
 938
 939 VALUE
 940 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 941 {
 942     VALUE str;
 943
 944     if (!enc) return rb_str_new(ptr, len);
 945
 946     str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
 947     rb_enc_associate(str, enc);
 948     return str;
 949 }
 950
 951 VALUE
 952 rb_str_new_cstr(const char *ptr)
 953 {
 954     must_not_null(ptr);
 955     /* rb_str_new_cstr() can take pointer from non-malloc-generated
 956      * memory regions, and that cannot be detected by the MSAN.  Just
 957      * trust the programmer that the argument passed here is a sane C
 958      * string. */
 959     __msan_unpoison_string(ptr);
 960     return rb_str_new(ptr, strlen(ptr));
 961 }
 962
 963 VALUE
 964 rb_usascii_str_new_cstr(const char *ptr)
 965 {
 966     VALUE str = rb_str_new_cstr(ptr);
 967     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 968     return str;
 969 }
 970
 971 VALUE
 972 rb_utf8_str_new_cstr(const char *ptr)
 973 {
 974     VALUE str = rb_str_new_cstr(ptr);
 975     rb_enc_associate_index(str, rb_utf8_encindex());
 976     return str;
 977 }
 978
 979 VALUE
 980 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
 981 {
 982     must_not_null(ptr);
 983     if (rb_enc_mbminlen(enc) != 1) {
 984         rb_raise(rb_eArgError, "wchar encoding given");
 985     }
 986     return rb_enc_str_new(ptr, strlen(ptr), enc);
 987 }
 988
 989 static VALUE
 990 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
 991 {
 992     VALUE str;
 993
 994     if (len < 0) {
 995         rb_raise(rb_eArgError, "negative string size (or size too big)");
 996     }
 997
 998     if (!ptr) {
 999         rb_encoding *enc = rb_enc_get_from_index(encindex);
1000         str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1001     }
1002     else {
1003         RUBY_DTRACE_CREATE_HOOK(STRING, len);
1004         str = str_alloc_heap(klass);
1005         RSTRING(str)->as.heap.len = len;
1006         RSTRING(str)->as.heap.ptr = (char *)ptr;
1007         RSTRING(str)->as.heap.aux.capa = len;
1008         STR_SET_NOEMBED(str);
1009         RBASIC(str)->flags |= STR_NOFREE;
1010     }
1011     rb_enc_associate_index(str, encindex);
1012     return str;
1013 }
1014
1015 VALUE
1016 rb_str_new_static(const char *ptr, long len)
1017 {
1018     return str_new_static(rb_cString, ptr, len, 0);
1019 }
1020
1021 VALUE
1022 rb_usascii_str_new_static(const char *ptr, long len)
1023 {
1024     return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1025 }
1026
1027 VALUE
1028 rb_utf8_str_new_static(const char *ptr, long len)
1029 {
1030     return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1031 }
1032
1033 VALUE
1034 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1035 {
1036     return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1037 }
1038
1039 VALUE
1040 rb_tainted_str_new(const char *ptr, long len)
1041 {
1042     rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new", NULL);
1043     return rb_str_new(ptr, len);
1044 }
1045
1046 VALUE
1047 rb_tainted_str_new_cstr(const char *ptr)
1048 {
1049     rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new_cstr", NULL);
1050     return rb_str_new_cstr(ptr);
1051 }
1052
1053 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1054                                    rb_encoding *from, rb_encoding *to,
1055                                    int ecflags, VALUE ecopts);
1056
1057 static inline bool
1058 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1059 {
1060     int encidx = rb_enc_to_index(enc);
1061     if (rb_enc_get_index(str) == encidx)
1062         return is_ascii_string(str);
1063     return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
1064 }
1065
1066 VALUE
1067 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1068 {
1069     long len;
1070     const char *ptr;
1071     VALUE newstr;
1072
1073     if (!to) return str;
1074     if (!from) from = rb_enc_get(str);
1075     if (from == to) return str;
1076     if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1077         to == rb_ascii8bit_encoding()) {
1078         if (STR_ENC_GET(str) != to) {
1079             str = rb_str_dup(str);
1080             rb_enc_associate(str, to);
1081         }
1082         return str;
1083     }
1084
1085     RSTRING_GETMEM(str, ptr, len);
1086     newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1087                                    from, to, ecflags, ecopts);
1088     if (NIL_P(newstr)) {
1089         /* some error, return original */
1090         return str;
1091     }
1092     return newstr;
1093 }
1094
1095 VALUE
1096 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1097                          rb_encoding *from, int ecflags, VALUE ecopts)
1098 {
1099     long olen;
1100
1101     olen = RSTRING_LEN(newstr);
1102     if (ofs < -olen || olen < ofs)
1103         rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1104     if (ofs < 0) ofs += olen;
1105     if (!from) {
1106         STR_SET_LEN(newstr, ofs);
1107         return rb_str_cat(newstr, ptr, len);
1108     }
1109
1110     rb_str_modify(newstr);
1111     return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1112                                  rb_enc_get(newstr),
1113                                  ecflags, ecopts);
1114 }
1115
1116 VALUE
1117 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1118 {
1119     STR_SET_LEN(str, 0);
1120     rb_enc_associate(str, enc);
1121     rb_str_cat(str, ptr, len);
1122     return str;
1123 }
1124
1125 static VALUE
1126 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1127                       rb_encoding *from, rb_encoding *to,
1128                       int ecflags, VALUE ecopts)
1129 {
1130     rb_econv_t *ec;
1131     rb_econv_result_t ret;
1132     long olen;
1133     VALUE econv_wrapper;
1134     const unsigned char *start, *sp;
1135     unsigned char *dest, *dp;
1136     size_t converted_output = (size_t)ofs;
1137
1138     olen = rb_str_capacity(newstr);
1139
1140     econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1141     RBASIC_CLEAR_CLASS(econv_wrapper);
1142     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1143     if (!ec) return Qnil;
1144     DATA_PTR(econv_wrapper) = ec;
1145
1146     sp = (unsigned char*)ptr;
1147     start = sp;
1148     while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1149            (dp = dest + converted_output),
1150            (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1151            ret == econv_destination_buffer_full) {
1152         /* destination buffer short */
1153         size_t converted_input = sp - start;
1154         size_t rest = len - converted_input;
1155         converted_output = dp - dest;
1156         rb_str_set_len(newstr, converted_output);
1157         if (converted_input && converted_output &&
1158             rest < (LONG_MAX / converted_output)) {
1159             rest = (rest * converted_output) / converted_input;
1160         }
1161         else {
1162             rest = olen;
1163         }
1164         olen += rest < 2 ? 2 : rest;
1165         rb_str_resize(newstr, olen);
1166     }
1167     DATA_PTR(econv_wrapper) = 0;
1168     rb_econv_close(ec);
1169     switch (ret) {
1170       case econv_finished:
1171         len = dp - (unsigned char*)RSTRING_PTR(newstr);
1172         rb_str_set_len(newstr, len);
1173         rb_enc_associate(newstr, to);
1174         return newstr;
1175
1176       default:
1177         return Qnil;
1178     }
1179 }
1180
1181 VALUE
1182 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1183 {
1184     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1185 }
1186
1187 VALUE
1188 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1189 {
1190     rb_encoding *ienc;
1191     VALUE str;
1192     const int eidx = rb_enc_to_index(eenc);
1193
1194     if (!ptr) {
1195         return rb_enc_str_new(ptr, len, eenc);
1196     }
1197
1198     /* ASCII-8BIT case, no conversion */
1199     if ((eidx == rb_ascii8bit_encindex()) ||
1200         (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1201         return rb_str_new(ptr, len);
1202     }
1203     /* no default_internal or same encoding, no conversion */
1204     ienc = rb_default_internal_encoding();
1205     if (!ienc || eenc == ienc) {
1206         return rb_enc_str_new(ptr, len, eenc);
1207     }
1208     /* ASCII compatible, and ASCII only string, no conversion in
1209      * default_internal */
1210     if ((eidx == rb_ascii8bit_encindex()) ||
1211         (eidx == rb_usascii_encindex()) ||
1212         (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1213         return rb_enc_str_new(ptr, len, ienc);
1214     }
1215     /* convert from the given encoding to default_internal */
1216     str = rb_enc_str_new(NULL, 0, ienc);
1217     /* when the conversion failed for some reason, just ignore the
1218      * default_internal and result in the given encoding as-is. */
1219     if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1220         rb_str_initialize(str, ptr, len, eenc);
1221     }
1222     return str;
1223 }
1224
1225 VALUE
1226 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1227 {
1228     int eidx = rb_enc_to_index(eenc);
1229     if (eidx == rb_usascii_encindex() &&
1230         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1231         rb_enc_associate_index(str, rb_ascii8bit_encindex());
1232         return str;
1233     }
1234     rb_enc_associate_index(str, eidx);
1235     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1236 }
1237
1238 VALUE
1239 rb_external_str_new(const char *ptr, long len)
1240 {
1241     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1242 }
1243
1244 VALUE
1245 rb_external_str_new_cstr(const char *ptr)
1246 {
1247     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1248 }
1249
1250 VALUE
1251 rb_locale_str_new(const char *ptr, long len)
1252 {
1253     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1254 }
1255
1256 VALUE
1257 rb_locale_str_new_cstr(const char *ptr)
1258 {
1259     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1260 }
1261
1262 VALUE
1263 rb_filesystem_str_new(const char *ptr, long len)
1264 {
1265     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1266 }
1267
1268 VALUE
1269 rb_filesystem_str_new_cstr(const char *ptr)
1270 {
1271     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1272 }
1273
1274 VALUE
1275 rb_str_export(VALUE str)
1276 {
1277     return rb_str_export_to_enc(str, rb_default_external_encoding());
1278 }
1279
1280 VALUE
1281 rb_str_export_locale(VALUE str)
1282 {
1283     return rb_str_export_to_enc(str, rb_locale_encoding());
1284 }
1285
1286 VALUE
1287 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1288 {
1289     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1290 }
1291
1292 static VALUE
1293 str_replace_shared_without_enc(VALUE str2, VALUE str)
1294 {
1295     const int termlen = TERM_LEN(str);
1296     char *ptr;
1297     long len;
1298
1299     RSTRING_GETMEM(str, ptr, len);
1300     if (str_embed_capa(str2) >= len + termlen) {
1301         char *ptr2 = RSTRING(str2)->as.embed.ary;
1302         STR_SET_EMBED(str2);
1303         memcpy(ptr2, RSTRING_PTR(str), len);
1304         STR_SET_EMBED_LEN(str2, len);
1305         TERM_FILL(ptr2+len, termlen);
1306     }
1307     else {
1308         VALUE root;
1309         if (STR_SHARED_P(str)) {
1310             root = RSTRING(str)->as.heap.aux.shared;
1311             RSTRING_GETMEM(str, ptr, len);
1312         }
1313         else {
1314             root = rb_str_new_frozen(str);
1315             RSTRING_GETMEM(root, ptr, len);
1316         }
1317         assert(OBJ_FROZEN(root));
1318         if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1319             if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1320                 rb_fatal("about to free a possible shared root");
1321             }
1322             char *ptr2 = STR_HEAP_PTR(str2);
1323             if (ptr2 != ptr) {
1324                 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1325             }
1326         }
1327         FL_SET(str2, STR_NOEMBED);
1328         RSTRING(str2)->as.heap.len = len;
1329         RSTRING(str2)->as.heap.ptr = ptr;
1330         STR_SET_SHARED(str2, root);
1331     }
1332     return str2;
1333 }
1334
1335 static VALUE
1336 str_replace_shared(VALUE str2, VALUE str)
1337 {
1338     str_replace_shared_without_enc(str2, str);
1339     rb_enc_cr_str_exact_copy(str2, str);
1340     return str2;
1341 }
1342
1343 static VALUE
1344 str_new_shared(VALUE klass, VALUE str)
1345 {
1346     return str_replace_shared(str_alloc_heap(klass), str);
1347 }
1348
1349 VALUE
1350 rb_str_new_shared(VALUE str)
1351 {
1352     return str_new_shared(rb_obj_class(str), str);
1353 }
1354
1355 VALUE
1356 rb_str_new_frozen(VALUE orig)
1357 {
1358     if (OBJ_FROZEN(orig)) return orig;
1359     return str_new_frozen(rb_obj_class(orig), orig);
1360 }
1361
1362 static VALUE
1363 rb_str_new_frozen_String(VALUE orig)
1364 {
1365     if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1366     return str_new_frozen(rb_cString, orig);
1367 }
1368
1369 VALUE
1370 rb_str_tmp_frozen_acquire(VALUE orig)
1371 {
1372     if (OBJ_FROZEN_RAW(orig)) return orig;
1373     return str_new_frozen_buffer(0, orig, FALSE);
1374 }
1375
1376 void
1377 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1378 {
1379     if (RBASIC_CLASS(tmp) != 0)
1380         return;
1381
1382     if (STR_EMBED_P(tmp)) {
1383         assert(OBJ_FROZEN_RAW(tmp));
1384     }
1385     else if (FL_TEST_RAW(orig, STR_SHARED) &&
1386             !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1387         VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1388
1389         if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1390             assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1391             assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1392
1393             /* Unshare orig since the root (tmp) only has this one child. */
1394             FL_UNSET_RAW(orig, STR_SHARED);
1395             RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1396             RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1397             assert(OBJ_FROZEN_RAW(tmp));
1398
1399             /* Make tmp embedded and empty so it is safe for sweeping. */
1400             STR_SET_EMBED(tmp);
1401             STR_SET_EMBED_LEN(tmp, 0);
1402         }
1403     }
1404 }
1405
1406 static VALUE
1407 str_new_frozen(VALUE klass, VALUE orig)
1408 {
1409     return str_new_frozen_buffer(klass, orig, TRUE);
1410 }
1411
1412 static VALUE
1413 heap_str_make_shared(VALUE klass, VALUE orig)
1414 {
1415     assert(!STR_EMBED_P(orig));
1416     assert(!STR_SHARED_P(orig));
1417
1418     VALUE str = str_alloc_heap(klass);
1419     STR_SET_NOEMBED(str);
1420     RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1421     RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1422     RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1423     RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1424     RBASIC(orig)->flags &= ~STR_NOFREE;
1425     STR_SET_SHARED(orig, str);
1426     if (klass == 0)
1427         FL_UNSET_RAW(str, STR_BORROWED);
1428     return str;
1429 }
1430
1431 static VALUE
1432 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1433 {
1434     VALUE str;
1435
1436     long len = RSTRING_LEN(orig);
1437
1438     if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, 1)) {
1439         str = str_new(klass, RSTRING_PTR(orig), len);
1440         assert(STR_EMBED_P(str));
1441     }
1442     else {
1443         if (FL_TEST_RAW(orig, STR_SHARED)) {
1444             VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1445             long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1446             long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1447             assert(ofs >= 0);
1448             assert(rest >= 0);
1449             assert(ofs + rest <= RSTRING_LEN(shared));
1450 #if !USE_RVARGC
1451             assert(!STR_EMBED_P(shared));
1452 #endif
1453             assert(OBJ_FROZEN(shared));
1454
1455             if ((ofs > 0) || (rest > 0) ||
1456                 (klass != RBASIC(shared)->klass) ||
1457                 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1458                 str = str_new_shared(klass, shared);
1459                 assert(!STR_EMBED_P(str));
1460                 RSTRING(str)->as.heap.ptr += ofs;
1461                 RSTRING(str)->as.heap.len -= ofs + rest;
1462             }
1463             else {
1464                 if (RBASIC_CLASS(shared) == 0)
1465                     FL_SET_RAW(shared, STR_BORROWED);
1466                 return shared;
1467             }
1468         }
1469         else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1470             str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1471             STR_SET_EMBED(str);
1472             memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1473             STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1474             TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1475         }
1476         else {
1477             str = heap_str_make_shared(klass, orig);
1478         }
1479     }
1480
1481     if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1482     OBJ_FREEZE(str);
1483     return str;
1484 }
1485
1486 VALUE
1487 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1488 {
1489     return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1490 }
1491
1492 static VALUE
1493 str_new_empty_String(VALUE str)
1494 {
1495     VALUE v = rb_str_new(0, 0);
1496     rb_enc_copy(v, str);
1497     return v;
1498 }
1499
1500 #define STR_BUF_MIN_SIZE 63
1501 #if !USE_RVARGC
1502 STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1503 #endif
1504
1505 VALUE
1506 rb_str_buf_new(long capa)
1507 {
1508     if (STR_EMBEDDABLE_P(capa, 1)) {
1509         return str_alloc_embed(rb_cString, capa + 1);
1510     }
1511
1512     VALUE str = str_alloc_heap(rb_cString);
1513
1514 #if !USE_RVARGC
1515     if (capa < STR_BUF_MIN_SIZE) {
1516         capa = STR_BUF_MIN_SIZE;
1517     }
1518 #endif
1519     FL_SET(str, STR_NOEMBED);
1520     RSTRING(str)->as.heap.aux.capa = capa;
1521     RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1522     RSTRING(str)->as.heap.ptr[0] = '\0';
1523
1524     return str;
1525 }
1526
1527 VALUE
1528 rb_str_buf_new_cstr(const char *ptr)
1529 {
1530     VALUE str;
1531     long len = strlen(ptr);
1532
1533     str = rb_str_buf_new(len);
1534     rb_str_buf_cat(str, ptr, len);
1535
1536     return str;
1537 }
1538
1539 VALUE
1540 rb_str_tmp_new(long len)
1541 {
1542     return str_new(0, 0, len);
1543 }
1544
1545 void
1546 rb_str_free(VALUE str)
1547 {
1548     if (FL_TEST(str, RSTRING_FSTR)) {
1549         st_data_t fstr = (st_data_t)str;
1550
1551         RB_VM_LOCK_ENTER();
1552         {
1553             st_delete(rb_vm_fstring_table(), &fstr, NULL);
1554             RB_DEBUG_COUNTER_INC(obj_str_fstr);
1555         }
1556         RB_VM_LOCK_LEAVE();
1557     }
1558
1559     if (STR_EMBED_P(str)) {
1560         RB_DEBUG_COUNTER_INC(obj_str_embed);
1561     }
1562     else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1563         (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1564         (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1565     }
1566     else {
1567         RB_DEBUG_COUNTER_INC(obj_str_ptr);
1568         ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1569     }
1570 }
1571
1572 RUBY_FUNC_EXPORTED size_t
1573 rb_str_memsize(VALUE str)
1574 {
1575     if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1576         return STR_HEAP_SIZE(str);
1577     }
1578     else {
1579         return 0;
1580     }
1581 }
1582
1583 VALUE
1584 rb_str_to_str(VALUE str)
1585 {
1586     return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1587 }
1588
1589 static inline void str_discard(VALUE str);
1590 static void str_shared_replace(VALUE str, VALUE str2);
1591
1592 void
1593 rb_str_shared_replace(VALUE str, VALUE str2)
1594 {
1595     if (str != str2) str_shared_replace(str, str2);
1596 }
1597
1598 static void
1599 str_shared_replace(VALUE str, VALUE str2)
1600 {
1601     rb_encoding *enc;
1602     int cr;
1603     int termlen;
1604
1605     RUBY_ASSERT(str2 != str);
1606     enc = STR_ENC_GET(str2);
1607     cr = ENC_CODERANGE(str2);
1608     str_discard(str);
1609     termlen = rb_enc_mbminlen(enc);
1610
1611     if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1612         STR_SET_EMBED(str);
1613         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1614         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1615         rb_enc_associate(str, enc);
1616         ENC_CODERANGE_SET(str, cr);
1617     }
1618     else {
1619 #if USE_RVARGC
1620         if (STR_EMBED_P(str2)) {
1621             assert(!FL_TEST(str2, STR_SHARED));
1622             long len = RSTRING(str2)->as.embed.len;
1623             assert(len + termlen <= str_embed_capa(str2));
1624
1625             char *new_ptr = ALLOC_N(char, len + termlen);
1626             memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1627             RSTRING(str2)->as.heap.ptr = new_ptr;
1628             RSTRING(str2)->as.heap.len = len;
1629             RSTRING(str2)->as.heap.aux.capa = len;
1630             STR_SET_NOEMBED(str2);
1631         }
1632 #endif
1633
1634         STR_SET_NOEMBED(str);
1635         FL_UNSET(str, STR_SHARED);
1636         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1637         RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1638
1639         if (FL_TEST(str2, STR_SHARED)) {
1640             VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1641             STR_SET_SHARED(str, shared);
1642         }
1643         else {
1644             RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1645         }
1646
1647         /* abandon str2 */
1648         STR_SET_EMBED(str2);
1649         RSTRING_PTR(str2)[0] = 0;
1650         STR_SET_EMBED_LEN(str2, 0);
1651         rb_enc_associate(str, enc);
1652         ENC_CODERANGE_SET(str, cr);
1653     }
1654 }
1655
1656 VALUE
1657 rb_obj_as_string(VALUE obj)
1658 {
1659     VALUE str;
1660
1661     if (RB_TYPE_P(obj, T_STRING)) {
1662         return obj;
1663     }
1664     str = rb_funcall(obj, idTo_s, 0);
1665     return rb_obj_as_string_result(str, obj);
1666 }
1667
1668 MJIT_FUNC_EXPORTED VALUE
1669 rb_obj_as_string_result(VALUE str, VALUE obj)
1670 {
1671     if (!RB_TYPE_P(str, T_STRING))
1672         return rb_any_to_s(obj);
1673     return str;
1674 }
1675
1676 static VALUE
1677 str_replace(VALUE str, VALUE str2)
1678 {
1679     long len;
1680
1681     len = RSTRING_LEN(str2);
1682     if (STR_SHARED_P(str2)) {
1683         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1684         assert(OBJ_FROZEN(shared));
1685         STR_SET_NOEMBED(str);
1686         RSTRING(str)->as.heap.len = len;
1687         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1688         STR_SET_SHARED(str, shared);
1689         rb_enc_cr_str_exact_copy(str, str2);
1690     }
1691     else {
1692         str_replace_shared(str, str2);
1693     }
1694
1695     return str;
1696 }
1697
1698 static inline VALUE
1699 ec_str_alloc(struct rb_execution_context_struct *ec, VALUE klass, size_t size)
1700 {
1701     assert(size > 0);
1702     RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1703                            T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
1704     return (VALUE)str;
1705 }
1706
1707 static inline VALUE
1708 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1709 {
1710     size_t size = str_embed_size(capa);
1711     assert(rb_gc_size_allocatable_p(size));
1712 #if !USE_RVARGC
1713     assert(size <= sizeof(struct RString));
1714 #endif
1715     return ec_str_alloc(ec, klass, size);
1716 }
1717
1718 static inline VALUE
1719 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1720 {
1721     return ec_str_alloc(ec, klass, sizeof(struct RString));
1722 }
1723
1724 static inline VALUE
1725 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1726 {
1727     const VALUE flag_mask =
1728 #if !USE_RVARGC
1729         RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1730 #endif
1731         ENC_CODERANGE_MASK | ENCODING_MASK |
1732         FL_FREEZE
1733         ;
1734     VALUE flags = FL_TEST_RAW(str, flag_mask);
1735     int encidx = 0;
1736     if (STR_EMBED_P(str)) {
1737         long len = RSTRING_EMBED_LEN(str);
1738
1739         assert(str_embed_capa(dup) >= len + 1);
1740         STR_SET_EMBED_LEN(dup, len);
1741         MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1742     }
1743     else {
1744         VALUE root = str;
1745         if (FL_TEST_RAW(str, STR_SHARED)) {
1746             root = RSTRING(str)->as.heap.aux.shared;
1747         }
1748         else if (UNLIKELY(!(flags & FL_FREEZE))) {
1749             root = str = str_new_frozen(klass, str);
1750             flags = FL_TEST_RAW(str, flag_mask);
1751         }
1752         assert(!STR_SHARED_P(root));
1753         assert(RB_OBJ_FROZEN_RAW(root));
1754 #if USE_RVARGC
1755         if (1) {
1756 #else
1757         if (STR_EMBED_P(root)) {
1758             MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1759                    char, RSTRING_EMBED_LEN_MAX + 1);
1760         }
1761         else {
1762 #endif
1763             RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1764             RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1765             RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1766             flags |= RSTRING_NOEMBED | STR_SHARED;
1767         }
1768     }
1769
1770     if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1771         encidx = rb_enc_get_index(str);
1772         flags &= ~ENCODING_MASK;
1773     }
1774     FL_SET_RAW(dup, flags & ~FL_FREEZE);
1775     if (encidx) rb_enc_associate_index(dup, encidx);
1776     return dup;
1777 }
1778
1779 static inline VALUE
1780 ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1781 {
1782     VALUE dup;
1783     if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1784         dup = ec_str_alloc_heap(ec, klass);
1785     }
1786     else {
1787         dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1788     }
1789
1790     return str_duplicate_setup(klass, str, dup);
1791 }
1792
1793 static inline VALUE
1794 str_duplicate(VALUE klass, VALUE str)
1795 {
1796     VALUE dup;
1797     if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1798         dup = str_alloc_heap(klass);
1799     }
1800     else {
1801        dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1802     }
1803
1804     return str_duplicate_setup(klass, str, dup);
1805 }
1806
1807 VALUE
1808 rb_str_dup(VALUE str)
1809 {
1810     return str_duplicate(rb_obj_class(str), str);
1811 }
1812
1813 VALUE
1814 rb_str_resurrect(VALUE str)
1815 {
1816     RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1817     return str_duplicate(rb_cString, str);
1818 }
1819
1820 VALUE
1821 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1822 {
1823     RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1824     return ec_str_duplicate(ec, rb_cString, str);
1825 }
1826
1827 /*
1828  *  call-seq:
1829  *    String.new(string = '') -> new_string
1830  *    String.new(string = '', encoding: encoding) -> new_string
1831  *    String.new(string = '', capacity: size) -> new_string
1832  *
1833  *  Returns a new \String that is a copy of +string+.
1834  *
1835  *  With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1836  *    s = String.new
1837  *    s # => ""
1838  *    s.encoding # => #<Encoding:ASCII-8BIT>
1839  *
1840  *  With the single \String argument +string+, returns a copy of +string+
1841  *  with the same encoding as +string+:
1842  *    s = String.new("Que veut dire \u{e7}a?")
1843  *    s # => "Que veut dire \u{e7}a?"
1844  *    s.encoding # => #<Encoding:UTF-8>
1845  *
1846  *  Literal strings like <tt>""</tt> or here-documents always use
1847  *  {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1848  *
1849  *  With keyword +encoding+, returns a copy of +str+
1850  *  with the specified encoding:
1851  *    s = String.new(encoding: 'ASCII')
1852  *    s.encoding # => #<Encoding:US-ASCII>
1853  *    s = String.new('foo', encoding: 'ASCII')
1854  *    s.encoding # => #<Encoding:US-ASCII>
1855  *
1856  *  Note that these are equivalent:
1857  *    s0 = String.new('foo', encoding: 'ASCII')
1858  *    s1 = 'foo'.force_encoding('ASCII')
1859  *    s0.encoding == s1.encoding # => true
1860  *
1861  *  With keyword +capacity+, returns a copy of +str+;
1862  *  the given +capacity+ may set the size of the internal buffer,
1863  *  which may affect performance:
1864  *    String.new(capacity: 1) # => ""
1865  *    String.new(capacity: 4096) # => ""
1866  *
1867  *  The +string+, +encoding+, and +capacity+ arguments may all be used together:
1868  *
1869  *    String.new('hello', encoding: 'UTF-8', capacity: 25)
1870  *
1871  */
1872
1873 static VALUE
1874 rb_str_init(int argc, VALUE *argv, VALUE str)
1875 {
1876     static ID keyword_ids[2];
1877     VALUE orig, opt, venc, vcapa;
1878     VALUE kwargs[2];
1879     rb_encoding *enc = 0;
1880     int n;
1881
1882     if (!keyword_ids[0]) {
1883         keyword_ids[0] = rb_id_encoding();
1884         CONST_ID(keyword_ids[1], "capacity");
1885     }
1886
1887     n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1888     if (!NIL_P(opt)) {
1889         rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1890         venc = kwargs[0];
1891         vcapa = kwargs[1];
1892         if (venc != Qundef && !NIL_P(venc)) {
1893             enc = rb_to_encoding(venc);
1894         }
1895         if (vcapa != Qundef && !NIL_P(vcapa)) {
1896             long capa = NUM2LONG(vcapa);
1897             long len = 0;
1898             int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1899
1900             if (capa < STR_BUF_MIN_SIZE) {
1901                 capa = STR_BUF_MIN_SIZE;
1902             }
1903             if (n == 1) {
1904                 StringValue(orig);
1905                 len = RSTRING_LEN(orig);
1906                 if (capa < len) {
1907                     capa = len;
1908                 }
1909                 if (orig == str) n = 0;
1910             }
1911             str_modifiable(str);
1912             if (STR_EMBED_P(str)) { /* make noembed always */
1913                 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1914 #if USE_RVARGC
1915                 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1916                 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1917 #else
1918                 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1919 #endif
1920                 RSTRING(str)->as.heap.ptr = new_ptr;
1921             }
1922             else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1923                 const size_t size = (size_t)capa + termlen;
1924                 const char *const old_ptr = RSTRING_PTR(str);
1925                 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1926                 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1927                 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1928                 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1929                 RSTRING(str)->as.heap.ptr = new_ptr;
1930             }
1931             else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1932                 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1933                         (size_t)capa + termlen, STR_HEAP_SIZE(str));
1934             }
1935             RSTRING(str)->as.heap.len = len;
1936             TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1937             if (n == 1) {
1938                 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1939                 rb_enc_cr_str_exact_copy(str, orig);
1940             }
1941             FL_SET(str, STR_NOEMBED);
1942             RSTRING(str)->as.heap.aux.capa = capa;
1943         }
1944         else if (n == 1) {
1945             rb_str_replace(str, orig);
1946         }
1947         if (enc) {
1948             rb_enc_associate(str, enc);
1949             ENC_CODERANGE_CLEAR(str);
1950         }
1951     }
1952     else if (n == 1) {
1953         rb_str_replace(str, orig);
1954     }
1955     return str;
1956 }
1957
1958 #ifdef NONASCII_MASK
1959 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1960
1961 /*
1962  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1963  * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1964  * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1965  *
1966  * if (!(byte & 0x80))
1967  *   byte |= 0x40;          // turn on bit6
1968  * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
1969  *
1970  * This function calculates whether a byte is leading or not for all bytes
1971  * in the argument word by concurrently using the above logic, and then
1972  * adds up the number of leading bytes in the word.
1973  */
1974 static inline uintptr_t
1975 count_utf8_lead_bytes_with_word(const uintptr_t *s)
1976 {
1977     uintptr_t d = *s;
1978
1979     /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1980     d = (d>>6) | (~d>>7);
1981     d &= NONASCII_MASK >> 7;
1982
1983     /* Gather all bytes. */
1984 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1985     /* use only if it can use POPCNT */
1986     return rb_popcount_intptr(d);
1987 #else
1988     d += (d>>8);
1989     d += (d>>16);
1990 # if SIZEOF_VOIDP == 8
1991     d += (d>>32);
1992 # endif
1993     return (d&0xF);
1994 #endif
1995 }
1996 #endif
1997
1998 static inline long
1999 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2000 {
2001     long c;
2002     const char *q;
2003
2004     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2005         long diff = (long)(e - p);
2006         return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2007     }
2008 #ifdef NONASCII_MASK
2009     else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2010         uintptr_t len = 0;
2011         if ((int)sizeof(uintptr_t) * 2 < e - p) {
2012             const uintptr_t *s, *t;
2013             const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2014             s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2015             t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2016             while (p < (const char *)s) {
2017                 if (is_utf8_lead_byte(*p)) len++;
2018                 p++;
2019             }
2020             while (s < t) {
2021                 len += count_utf8_lead_bytes_with_word(s);
2022                 s++;
2023             }
2024             p = (const char *)s;
2025         }
2026         while (p < e) {
2027             if (is_utf8_lead_byte(*p)) len++;
2028             p++;
2029         }
2030         return (long)len;
2031     }
2032 #endif
2033     else if (rb_enc_asciicompat(enc)) {
2034         c = 0;
2035         if (ENC_CODERANGE_CLEAN_P(cr)) {
2036             while (p < e) {
2037                 if (ISASCII(*p)) {
2038                     q = search_nonascii(p, e);
2039                     if (!q)
2040                         return c + (e - p);
2041                     c += q - p;
2042                     p = q;
2043                 }
2044                 p += rb_enc_fast_mbclen(p, e, enc);
2045                 c++;
2046             }
2047         }
2048         else {
2049             while (p < e) {
2050                 if (ISASCII(*p)) {
2051                     q = search_nonascii(p, e);
2052                     if (!q)
2053                         return c + (e - p);
2054                     c += q - p;
2055                     p = q;
2056                 }
2057                 p += rb_enc_mbclen(p, e, enc);
2058                 c++;
2059             }
2060         }
2061         return c;
2062     }
2063
2064     for (c=0; p<e; c++) {
2065         p += rb_enc_mbclen(p, e, enc);
2066     }
2067     return c;
2068 }
2069
2070 long
2071 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2072 {
2073     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2074 }
2075
2076 /* To get strlen with cr
2077  * Note that given cr is not used.
2078  */
2079 long
2080 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2081 {
2082     long c;
2083     const char *q;
2084     int ret;
2085
2086     *cr = 0;
2087     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2088         long diff = (long)(e - p);
2089         return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2090     }
2091     else if (rb_enc_asciicompat(enc)) {
2092         c = 0;
2093         while (p < e) {
2094             if (ISASCII(*p)) {
2095                 q = search_nonascii(p, e);
2096                 if (!q) {
2097                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
2098                     return c + (e - p);
2099                 }
2100                 c += q - p;
2101                 p = q;
2102             }
2103             ret = rb_enc_precise_mbclen(p, e, enc);
2104             if (MBCLEN_CHARFOUND_P(ret)) {
2105                 *cr |= ENC_CODERANGE_VALID;
2106                 p += MBCLEN_CHARFOUND_LEN(ret);
2107             }
2108             else {
2109                 *cr = ENC_CODERANGE_BROKEN;
2110                 p++;
2111             }
2112             c++;
2113         }
2114         if (!*cr) *cr = ENC_CODERANGE_7BIT;
2115         return c;
2116     }
2117
2118     for (c=0; p<e; c++) {
2119         ret = rb_enc_precise_mbclen(p, e, enc);
2120         if (MBCLEN_CHARFOUND_P(ret)) {
2121             *cr |= ENC_CODERANGE_VALID;
2122             p += MBCLEN_CHARFOUND_LEN(ret);
2123         }
2124         else {
2125             *cr = ENC_CODERANGE_BROKEN;
2126             if (p + rb_enc_mbminlen(enc) <= e)
2127                 p += rb_enc_mbminlen(enc);
2128             else
2129                 p = e;
2130         }
2131     }
2132     if (!*cr) *cr = ENC_CODERANGE_7BIT;
2133     return c;
2134 }
2135
2136 /* enc must be str's enc or rb_enc_check(str, str2) */
2137 static long
2138 str_strlen(VALUE str, rb_encoding *enc)
2139 {
2140     const char *p, *e;
2141     int cr;
2142
2143     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2144     if (!enc) enc = STR_ENC_GET(str);
2145     p = RSTRING_PTR(str);
2146     e = RSTRING_END(str);
2147     cr = ENC_CODERANGE(str);
2148
2149     if (cr == ENC_CODERANGE_UNKNOWN) {
2150         long n = rb_enc_strlen_cr(p, e, enc, &cr);
2151         if (cr) ENC_CODERANGE_SET(str, cr);
2152         return n;
2153     }
2154     else {
2155         return enc_strlen(p, e, enc, cr);
2156     }
2157 }
2158
2159 long
2160 rb_str_strlen(VALUE str)
2161 {
2162     return str_strlen(str, NULL);
2163 }
2164
2165 /*
2166  *  call-seq:
2167  *    length -> integer
2168  *
2169  *  Returns the count of characters (not bytes) in +self+:
2170  *
2171  *    "\x80\u3042".length # => 2
2172  *    "hello".length # => 5
2173  *
2174  *  String#size is an alias for String#length.
2175  *
2176  *  Related: String#bytesize.
2177  */
2178
2179 VALUE
2180 rb_str_length(VALUE str)
2181 {
2182     return LONG2NUM(str_strlen(str, NULL));
2183 }
2184
2185 /*
2186  *  call-seq:
2187  *    bytesize -> integer
2188  *
 *  Returns the count of bytes in +self+:
2190  *
2191  *    "\x80\u3042".bytesize # => 4
2192  *    "hello".bytesize # => 5
2193  *
2194  *  Related: String#length.
2195  */
2196
2197 static VALUE
2198 rb_str_bytesize(VALUE str)
2199 {
2200     return LONG2NUM(RSTRING_LEN(str));
2201 }
2202
2203 /*
2204  *  call-seq:
2205  *    empty? -> true or false
2206  *
2207  *  Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2208  *
2209  *    "hello".empty? # => false
2210  *    " ".empty? # => false
2211  *    "".empty? # => true
2212  *
2213  */
2214
2215 static VALUE
2216 rb_str_empty(VALUE str)
2217 {
2218     return RBOOL(RSTRING_LEN(str) == 0);
2219 }
2220
2221 /*
2222  *  call-seq:
2223  *    string + other_string -> new_string
2224  *
2225  *  Returns a new \String containing +other_string+ concatenated to +self+:
2226  *
2227  *    "Hello from " + self.to_s # => "Hello from main"
2228  *
2229  */
2230
2231 VALUE
2232 rb_str_plus(VALUE str1, VALUE str2)
2233 {
2234     VALUE str3;
2235     rb_encoding *enc;
2236     char *ptr1, *ptr2, *ptr3;
2237     long len1, len2;
2238     int termlen;
2239
2240     StringValue(str2);
2241     enc = rb_enc_check_str(str1, str2);
2242     RSTRING_GETMEM(str1, ptr1, len1);
2243     RSTRING_GETMEM(str2, ptr2, len2);
2244     termlen = rb_enc_mbminlen(enc);
2245     if (len1 > LONG_MAX - len2) {
2246         rb_raise(rb_eArgError, "string size too big");
2247     }
2248     str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2249     ptr3 = RSTRING_PTR(str3);
2250     memcpy(ptr3, ptr1, len1);
2251     memcpy(ptr3+len1, ptr2, len2);
2252     TERM_FILL(&ptr3[len1+len2], termlen);
2253
2254     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2255                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
2256     RB_GC_GUARD(str1);
2257     RB_GC_GUARD(str2);
2258     return str3;
2259 }
2260
2261 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
2262 MJIT_FUNC_EXPORTED VALUE
2263 rb_str_opt_plus(VALUE str1, VALUE str2)
2264 {
2265     assert(RBASIC_CLASS(str1) == rb_cString);
2266     assert(RBASIC_CLASS(str2) == rb_cString);
2267     long len1, len2;
2268     MAYBE_UNUSED(char) *ptr1, *ptr2;
2269     RSTRING_GETMEM(str1, ptr1, len1);
2270     RSTRING_GETMEM(str2, ptr2, len2);
2271     int enc1 = rb_enc_get_index(str1);
2272     int enc2 = rb_enc_get_index(str2);
2273
2274     if (enc1 < 0) {
2275         return Qundef;
2276     }
2277     else if (enc2 < 0) {
2278         return Qundef;
2279     }
2280     else if (enc1 != enc2) {
2281         return Qundef;
2282     }
2283     else if (len1 > LONG_MAX - len2) {
2284         return Qundef;
2285     }
2286     else {
2287         return rb_str_plus(str1, str2);
2288     }
2289
2290 }
2291
2292 /*
2293  *  call-seq:
2294  *    string * integer -> new_string
2295  *
2296  *  Returns a new \String containing +integer+ copies of +self+:
2297  *
2298  *    "Ho! " * 3 # => "Ho! Ho! Ho! "
2299  *    "Ho! " * 0 # => ""
2300  *
2301  */
2302
2303 VALUE
2304 rb_str_times(VALUE str, VALUE times)
2305 {
2306     VALUE str2;
2307     long n, len;
2308     char *ptr2;
2309     int termlen;
2310
2311     if (times == INT2FIX(1)) {
2312         return str_duplicate(rb_cString, str);
2313     }
2314     if (times == INT2FIX(0)) {
2315         str2 = str_alloc_embed(rb_cString, 0);
2316         rb_enc_copy(str2, str);
2317         return str2;
2318     }
2319     len = NUM2LONG(times);
2320     if (len < 0) {
2321         rb_raise(rb_eArgError, "negative argument");
2322     }
2323     if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2324         if (STR_EMBEDDABLE_P(len, 1)) {
2325             str2 = str_alloc_embed(rb_cString, len + 1);
2326             memset(RSTRING_PTR(str2), 0, len + 1);
2327         }
2328         else {
2329             str2 = str_alloc_heap(rb_cString);
2330             RSTRING(str2)->as.heap.aux.capa = len;
2331             RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2332             STR_SET_NOEMBED(str2);
2333         }
2334         STR_SET_LEN(str2, len);
2335         rb_enc_copy(str2, str);
2336         return str2;
2337     }
2338     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
2339         rb_raise(rb_eArgError, "argument too big");
2340     }
2341
2342     len *= RSTRING_LEN(str);
2343     termlen = TERM_LEN(str);
2344     str2 = str_new0(rb_cString, 0, len, termlen);
2345     ptr2 = RSTRING_PTR(str2);
2346     if (len) {
2347         n = RSTRING_LEN(str);
2348         memcpy(ptr2, RSTRING_PTR(str), n);
2349         while (n <= len/2) {
2350             memcpy(ptr2 + n, ptr2, n);
2351             n *= 2;
2352         }
2353         memcpy(ptr2 + n, ptr2, len-n);
2354     }
2355     STR_SET_LEN(str2, len);
2356     TERM_FILL(&ptr2[len], termlen);
2357     rb_enc_cr_str_copy_for_substr(str2, str);
2358
2359     return str2;
2360 }
2361
2362 /*
2363  *  call-seq:
2364  *    string % object -> new_string
2365  *
2366  *  Returns the result of formatting +object+ into the format specification +self+
2367  *  (see Kernel#sprintf for formatting details):
2368  *
2369  *    "%05d" % 123 # => "00123"
2370  *
2371  *  If +self+ contains multiple substitutions, +object+ must be
2372  *  an \Array or \Hash containing the values to be substituted:
2373  *
2374  *    "%-5s: %016x" % [ "ID", self.object_id ] # => "ID   : 00002b054ec93168"
2375  *    "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2376  *    "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2377  *
2378  */
2379
2380 static VALUE
2381 rb_str_format_m(VALUE str, VALUE arg)
2382 {
2383     VALUE tmp = rb_check_array_type(arg);
2384
2385     if (!NIL_P(tmp)) {
2386         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2387     }
2388     return rb_str_format(1, &arg, str);
2389 }
2390
2391 static inline void
2392 rb_check_lockedtmp(VALUE str)
2393 {
2394     if (FL_TEST(str, STR_TMPLOCK)) {
2395         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2396     }
2397 }
2398
2399 static inline void
2400 str_modifiable(VALUE str)
2401 {
2402     rb_check_lockedtmp(str);
2403     rb_check_frozen(str);
2404 }
2405
2406 static inline int
2407 str_dependent_p(VALUE str)
2408 {
2409     if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2410         return 0;
2411     }
2412     else {
2413         return 1;
2414     }
2415 }
2416
2417 static inline int
2418 str_independent(VALUE str)
2419 {
2420     str_modifiable(str);
2421     return !str_dependent_p(str);
2422 }
2423
2424 static void
2425 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2426 {
2427     char *ptr;
2428     char *oldptr;
2429     long capa = len + expand;
2430
2431     if (len > capa) len = capa;
2432
2433     if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2434         ptr = RSTRING(str)->as.heap.ptr;
2435         STR_SET_EMBED(str);
2436         memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2437         TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2438         STR_SET_EMBED_LEN(str, len);
2439         return;
2440     }
2441
2442     ptr = ALLOC_N(char, (size_t)capa + termlen);
2443     oldptr = RSTRING_PTR(str);
2444     if (oldptr) {
2445         memcpy(ptr, oldptr, len);
2446     }
2447     if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2448         xfree(oldptr);
2449     }
2450     STR_SET_NOEMBED(str);
2451     FL_UNSET(str, STR_SHARED|STR_NOFREE);
2452     TERM_FILL(ptr + len, termlen);
2453     RSTRING(str)->as.heap.ptr = ptr;
2454     RSTRING(str)->as.heap.len = len;
2455     RSTRING(str)->as.heap.aux.capa = capa;
2456 }
2457
2458 void
2459 rb_str_modify(VALUE str)
2460 {
2461     if (!str_independent(str))
2462         str_make_independent(str);
2463     ENC_CODERANGE_CLEAR(str);
2464 }
2465
2466 void
2467 rb_str_modify_expand(VALUE str, long expand)
2468 {
2469     int termlen = TERM_LEN(str);
2470     long len = RSTRING_LEN(str);
2471
2472     if (expand < 0) {
2473         rb_raise(rb_eArgError, "negative expanding string size");
2474     }
2475     if (expand >= LONG_MAX - len) {
2476         rb_raise(rb_eArgError, "string size too big");
2477     }
2478
2479     if (!str_independent(str)) {
2480         str_make_independent_expand(str, len, expand, termlen);
2481     }
2482     else if (expand > 0) {
2483         RESIZE_CAPA_TERM(str, len + expand, termlen);
2484     }
2485     ENC_CODERANGE_CLEAR(str);
2486 }
2487
2488 /* As rb_str_modify(), but don't clear coderange */
2489 static void
2490 str_modify_keep_cr(VALUE str)
2491 {
2492     if (!str_independent(str))
2493         str_make_independent(str);
2494     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2495         /* Force re-scan later */
2496         ENC_CODERANGE_CLEAR(str);
2497 }
2498
2499 static inline void
2500 str_discard(VALUE str)
2501 {
2502     str_modifiable(str);
2503     if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2504         ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2505         RSTRING(str)->as.heap.ptr = 0;
2506         RSTRING(str)->as.heap.len = 0;
2507     }
2508 }
2509
2510 void
2511 rb_must_asciicompat(VALUE str)
2512 {
2513     rb_encoding *enc = rb_enc_get(str);
2514     if (!rb_enc_asciicompat(enc)) {
2515         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2516     }
2517 }
2518
2519 VALUE
2520 rb_string_value(volatile VALUE *ptr)
2521 {
2522     VALUE s = *ptr;
2523     if (!RB_TYPE_P(s, T_STRING)) {
2524         s = rb_str_to_str(s);
2525         *ptr = s;
2526     }
2527     return s;
2528 }
2529
2530 char *
2531 rb_string_value_ptr(volatile VALUE *ptr)
2532 {
2533     VALUE str = rb_string_value(ptr);
2534     return RSTRING_PTR(str);
2535 }
2536
/* Return 1 when the first n bytes at s are all zero (also when n <= 0), else 0. */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') {
            return 0;
        }
    }
    return 1;
}
2545
2546 static const char *
2547 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2548 {
2549     const char *e = s + len;
2550
2551     for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2552         if (zero_filled(s, minlen)) return s;
2553     }
2554     return 0;
2555 }
2556
2557 static char *
2558 str_fill_term(VALUE str, char *s, long len, int termlen)
2559 {
2560     /* This function assumes that (capa + termlen) bytes of memory
2561      * is allocated, like many other functions in this file.
2562      */
2563     if (str_dependent_p(str)) {
2564         if (!zero_filled(s + len, termlen))
2565             str_make_independent_expand(str, len, 0L, termlen);
2566     }
2567     else {
2568         TERM_FILL(s + len, termlen);
2569         return s;
2570     }
2571     return RSTRING_PTR(str);
2572 }
2573
2574 void
2575 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2576 {
2577     long capa = str_capacity(str, oldtermlen) + oldtermlen;
2578     long len = RSTRING_LEN(str);
2579
2580     assert(capa >= len);
2581     if (capa - len < termlen) {
2582         rb_check_lockedtmp(str);
2583         str_make_independent_expand(str, len, 0L, termlen);
2584     }
2585     else if (str_dependent_p(str)) {
2586         if (termlen > oldtermlen)
2587             str_make_independent_expand(str, len, 0L, termlen);
2588     }
2589     else {
2590         if (!STR_EMBED_P(str)) {
2591             /* modify capa instead of realloc */
2592             assert(!FL_TEST((str), STR_SHARED));
2593             RSTRING(str)->as.heap.aux.capa = capa - termlen;
2594         }
2595         if (termlen > oldtermlen) {
2596             TERM_FILL(RSTRING_PTR(str) + len, termlen);
2597         }
2598     }
2599
2600     return;
2601 }
2602
2603 static char *
2604 str_null_check(VALUE str, int *w)
2605 {
2606     char *s = RSTRING_PTR(str);
2607     long len = RSTRING_LEN(str);
2608     rb_encoding *enc = rb_enc_get(str);
2609     const int minlen = rb_enc_mbminlen(enc);
2610
2611     if (minlen > 1) {
2612         *w = 1;
2613         if (str_null_char(s, len, minlen, enc)) {
2614             return NULL;
2615         }
2616         return str_fill_term(str, s, len, minlen);
2617     }
2618     *w = 0;
2619     if (!s || memchr(s, 0, len)) {
2620         return NULL;
2621     }
2622     if (s[len]) {
2623         s = str_fill_term(str, s, len, minlen);
2624     }
2625     return s;
2626 }
2627
2628 char *
2629 rb_str_to_cstr(VALUE str)
2630 {
2631     int w;
2632     return str_null_check(str, &w);
2633 }
2634
2635 char *
2636 rb_string_value_cstr(volatile VALUE *ptr)
2637 {
2638     VALUE str = rb_string_value(ptr);
2639     int w;
2640     char *s = str_null_check(str, &w);
2641     if (!s) {
2642         if (w) {
2643             rb_raise(rb_eArgError, "string contains null char");
2644         }
2645         rb_raise(rb_eArgError, "string contains null byte");
2646     }
2647     return s;
2648 }
2649
2650 char *
2651 rb_str_fill_terminator(VALUE str, const int newminlen)
2652 {
2653     char *s = RSTRING_PTR(str);
2654     long len = RSTRING_LEN(str);
2655     return str_fill_term(str, s, len, newminlen);
2656 }
2657
2658 VALUE
2659 rb_check_string_type(VALUE str)
2660 {
2661     str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2662     return str;
2663 }
2664
2665 /*
2666  *  call-seq:
2667  *    String.try_convert(object) -> object, new_string, or nil
2668  *
2669  *  If +object+ is a \String object, returns +object+.
2670  *
2671  *  Otherwise if +object+ responds to <tt>:to_str</tt>,
2672  *  calls <tt>object.to_str</tt> and returns the result.
2673  *
2674  *  Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2675  *
2676  *  Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2677  */
2678 static VALUE
2679 rb_str_s_try_convert(VALUE dummy, VALUE str)
2680 {
2681     return rb_check_string_type(str);
2682 }
2683
2684 static char*
2685 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2686 {
2687     long nth = *nthp;
2688     if (rb_enc_mbmaxlen(enc) == 1) {
2689         p += nth;
2690     }
2691     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2692         p += nth * rb_enc_mbmaxlen(enc);
2693     }
2694     else if (rb_enc_asciicompat(enc)) {
2695         const char *p2, *e2;
2696         int n;
2697
2698         while (p < e && 0 < nth) {
2699             e2 = p + nth;
2700             if (e < e2) {
2701                 *nthp = nth;
2702                 return (char *)e;
2703             }
2704             if (ISASCII(*p)) {
2705                 p2 = search_nonascii(p, e2);
2706                 if (!p2) {
2707                     nth -= e2 - p;
2708                     *nthp = nth;
2709                     return (char *)e2;
2710                 }
2711                 nth -= p2 - p;
2712                 p = p2;
2713             }
2714             n = rb_enc_mbclen(p, e, enc);
2715             p += n;
2716             nth--;
2717         }
2718         *nthp = nth;
2719         if (nth != 0) {
2720             return (char *)e;
2721         }
2722         return (char *)p;
2723     }
2724     else {
2725         while (p < e && nth--) {
2726             p += rb_enc_mbclen(p, e, enc);
2727         }
2728     }
2729     if (p > e) p = e;
2730     *nthp = nth;
2731     return (char*)p;
2732 }
2733
2734 char*
2735 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2736 {
2737     return str_nth_len(p, e, &nth, enc);
2738 }
2739
2740 static char*
2741 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2742 {
2743     if (singlebyte)
2744         p += nth;
2745     else {
2746         p = str_nth_len(p, e, &nth, enc);
2747     }
2748     if (!p) return 0;
2749     if (p > e) p = e;
2750     return (char *)p;
2751 }
2752
2753 /* char offset to byte offset */
2754 static long
2755 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2756 {
2757     const char *pp = str_nth(p, e, nth, enc, singlebyte);
2758     if (!pp) return e - p;
2759     return pp - p;
2760 }
2761
2762 long
2763 rb_str_offset(VALUE str, long pos)
2764 {
2765     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2766                       STR_ENC_GET(str), single_byte_optimizable(str));
2767 }
2768
2769 #ifdef NONASCII_MASK
2770 static char *
2771 str_utf8_nth(const char *p, const char *e, long *nthp)
2772 {
2773     long nth = *nthp;
2774     if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2775         const uintptr_t *s, *t;
2776         const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2777         s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2778         t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2779         while (p < (const char *)s) {
2780             if (is_utf8_lead_byte(*p)) nth--;
2781             p++;
2782         }
2783         do {
2784             nth -= count_utf8_lead_bytes_with_word(s);
2785             s++;
2786         } while (s < t && (int)SIZEOF_VOIDP <= nth);
2787         p = (char *)s;
2788     }
2789     while (p < e) {
2790         if (is_utf8_lead_byte(*p)) {
2791             if (nth == 0) break;
2792             nth--;
2793         }
2794         p++;
2795     }
2796     *nthp = nth;
2797     return (char *)p;
2798 }
2799
2800 static long
2801 str_utf8_offset(const char *p, const char *e, long nth)
2802 {
2803     const char *pp = str_utf8_nth(p, e, &nth);
2804     return pp - p;
2805 }
2806 #endif
2807
2808 /* byte offset to char offset */
2809 long
2810 rb_str_sublen(VALUE str, long pos)
2811 {
2812     if (single_byte_optimizable(str) || pos < 0)
2813         return pos;
2814     else {
2815         char *p = RSTRING_PTR(str);
2816         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2817     }
2818 }
2819
2820 VALUE
2821 rb_str_subseq(VALUE str, long beg, long len)
2822 {
2823     VALUE str2;
2824
2825     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2826         SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2827         long olen;
2828         str2 = rb_str_new_shared(rb_str_new_frozen_String(str));
2829         RSTRING(str2)->as.heap.ptr += beg;
2830         olen = RSTRING(str2)->as.heap.len;
2831         if (olen > len) RSTRING(str2)->as.heap.len = len;
2832     }
2833     else {
2834         str2 = rb_str_new(RSTRING_PTR(str)+beg, len);
2835         RB_GC_GUARD(str);
2836     }
2837
2838     rb_enc_cr_str_copy_for_substr(str2, str);
2839
2840     return str2;
2841 }
2842
2843 char *
2844 rb_str_subpos(VALUE str, long beg, long *lenp)
2845 {
2846     long len = *lenp;
2847     long slen = -1L;
2848     long blen = RSTRING_LEN(str);
2849     rb_encoding *enc = STR_ENC_GET(str);
2850     char *p, *s = RSTRING_PTR(str), *e = s + blen;
2851
2852     if (len < 0) return 0;
2853     if (!blen) {
2854         len = 0;
2855     }
2856     if (single_byte_optimizable(str)) {
2857         if (beg > blen) return 0;
2858         if (beg < 0) {
2859             beg += blen;
2860             if (beg < 0) return 0;
2861         }
2862         if (len > blen - beg)
2863             len = blen - beg;
2864         if (len < 0) return 0;
2865         p = s + beg;
2866         goto end;
2867     }
2868     if (beg < 0) {
2869         if (len > -beg) len = -beg;
2870         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2871             beg = -beg;
2872             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2873             p = e;
2874             if (!p) return 0;
2875             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2876             if (!p) return 0;
2877             len = e - p;
2878             goto end;
2879         }
2880         else {
2881             slen = str_strlen(str, enc);
2882             beg += slen;
2883             if (beg < 0) return 0;
2884             p = s + beg;
2885             if (len == 0) goto end;
2886         }
2887     }
2888     else if (beg > 0 && beg > RSTRING_LEN(str)) {
2889         return 0;
2890     }
2891     if (len == 0) {
2892         if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2893         p = s + beg;
2894     }
2895 #ifdef NONASCII_MASK
2896     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2897         enc == rb_utf8_encoding()) {
2898         p = str_utf8_nth(s, e, &beg);
2899         if (beg > 0) return 0;
2900         len = str_utf8_offset(p, e, len);
2901     }
2902 #endif
2903     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2904         int char_sz = rb_enc_mbmaxlen(enc);
2905
2906         p = s + beg * char_sz;
2907         if (p > e) {
2908             return 0;
2909         }
2910         else if (len * char_sz > e - p)
2911             len = e - p;
2912         else
2913             len *= char_sz;
2914     }
2915     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2916         if (beg > 0) return 0;
2917         len = 0;
2918     }
2919     else {
2920         len = str_offset(p, e, len, enc, 0);
2921     }
2922   end:
2923     *lenp = len;
2924     RB_GC_GUARD(str);
2925     return p;
2926 }
2927
2928 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2929
2930 VALUE
2931 rb_str_substr(VALUE str, long beg, long len)
2932 {
2933     return str_substr(str, beg, len, TRUE);
2934 }
2935
2936 static VALUE
2937 str_substr(VALUE str, long beg, long len, int empty)
2938 {
2939     VALUE str2;
2940     char *p = rb_str_subpos(str, beg, &len);
2941
2942     if (!p) return Qnil;
2943     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2944         SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2945         long ofs = p - RSTRING_PTR(str);
2946         str2 = rb_str_new_frozen(str);
2947         str2 = str_new_shared(rb_cString, str2);
2948         RSTRING(str2)->as.heap.ptr += ofs;
2949         RSTRING(str2)->as.heap.len = len;
2950         ENC_CODERANGE_CLEAR(str2);
2951     }
2952     else {
2953         if (!len && !empty) return Qnil;
2954         str2 = rb_str_new(p, len);
2955         RB_GC_GUARD(str);
2956     }
2957     rb_enc_cr_str_copy_for_substr(str2, str);
2958
2959     return str2;
2960 }
2961
2962 VALUE
2963 rb_str_freeze(VALUE str)
2964 {
2965     if (OBJ_FROZEN(str)) return str;
2966     rb_str_resize(str, RSTRING_LEN(str));
2967     return rb_obj_freeze(str);
2968 }
2969
2970
2971 /*
2972  * call-seq:
2973  *   +string -> new_string or self
2974  *
2975  * Returns +self+ if +self+ is not frozen.
2976  *
 * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2978  */
2979 static VALUE
2980 str_uplus(VALUE str)
2981 {
2982     if (OBJ_FROZEN(str)) {
2983         return rb_str_dup(str);
2984     }
2985     else {
2986         return str;
2987     }
2988 }
2989
2990 /*
2991  * call-seq:
2992  *   -string -> frozen_string
2993  *
2994  * Returns a frozen, possibly pre-existing copy of the string.
2995  *
2996  * The returned \String will be deduplicated as long as it does not have
2997  * any instance variables set on it.
2998  */
2999 static VALUE
3000 str_uminus(VALUE str)
3001 {
3002     if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3003         str = rb_str_dup(str);
3004     }
3005     return rb_fstring(str);
3006 }
3007
/*
 * rb_str_dup_frozen is another name for rb_str_new_frozen.
 * NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this file; it
 * presumably provides the exported rb_str_dup_frozen symbol -- confirm
 * against its definition.  The #define below makes every later use of
 * rb_str_dup_frozen in this file expand to rb_str_new_frozen.
 */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
3010
3011 VALUE
3012 rb_str_locktmp(VALUE str)
3013 {
3014     if (FL_TEST(str, STR_TMPLOCK)) {
3015         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3016     }
3017     FL_SET(str, STR_TMPLOCK);
3018     return str;
3019 }
3020
3021 VALUE
3022 rb_str_unlocktmp(VALUE str)
3023 {
3024     if (!FL_TEST(str, STR_TMPLOCK)) {
3025         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3026     }
3027     FL_UNSET(str, STR_TMPLOCK);
3028     return str;
3029 }
3030
3031 RUBY_FUNC_EXPORTED VALUE
3032 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3033 {
3034     rb_str_locktmp(str);
3035     return rb_ensure(func, arg, rb_str_unlocktmp, str);
3036 }
3037
3038 void
3039 rb_str_set_len(VALUE str, long len)
3040 {
3041     long capa;
3042     const int termlen = TERM_LEN(str);
3043
3044     str_modifiable(str);
3045     if (STR_SHARED_P(str)) {
3046         rb_raise(rb_eRuntimeError, "can't set length of shared string");
3047     }
3048     if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3049         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3050     }
3051     STR_SET_LEN(str, len);
3052     TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3053 }
3054
3055 VALUE
3056 rb_str_resize(VALUE str, long len)
3057 {
3058     long slen;
3059     int independent;
3060
3061     if (len < 0) {
3062         rb_raise(rb_eArgError, "negative string size (or size too big)");
3063     }
3064
3065     independent = str_independent(str);
3066     ENC_CODERANGE_CLEAR(str);
3067     slen = RSTRING_LEN(str);
3068
3069     {
3070         long capa;
3071         const int termlen = TERM_LEN(str);
3072         if (STR_EMBED_P(str)) {
3073             if (len == slen) return str;
3074             if (str_embed_capa(str) >= len + termlen) {
3075                 STR_SET_EMBED_LEN(str, len);
3076                 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3077                 return str;
3078             }
3079             str_make_independent_expand(str, slen, len - slen, termlen);
3080         }
3081         else if (str_embed_capa(str) >= len + termlen) {
3082             char *ptr = STR_HEAP_PTR(str);
3083             STR_SET_EMBED(str);
3084             if (slen > len) slen = len;
3085             if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3086             TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3087             STR_SET_EMBED_LEN(str, len);
3088             if (independent) ruby_xfree(ptr);
3089             return str;
3090         }
3091         else if (!independent) {
3092             if (len == slen) return str;
3093             str_make_independent_expand(str, slen, len - slen, termlen);
3094         }
3095         else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3096                  (capa - len) > (len < 1024 ? len : 1024)) {
3097             SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3098                             (size_t)len + termlen, STR_HEAP_SIZE(str));
3099             RSTRING(str)->as.heap.aux.capa = len;
3100         }
3101         else if (len == slen) return str;
3102         RSTRING(str)->as.heap.len = len;
3103         TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3104     }
3105     return str;
3106 }
3107
3108 static VALUE
3109 str_buf_cat(VALUE str, const char *ptr, long len)
3110 {
3111     long capa, total, olen, off = -1;
3112     char *sptr;
3113     const int termlen = TERM_LEN(str);
3114 #if !USE_RVARGC
3115     assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3116 #endif
3117
3118     RSTRING_GETMEM(str, sptr, olen);
3119     if (ptr >= sptr && ptr <= sptr + olen) {
3120         off = ptr - sptr;
3121     }
3122     rb_str_modify(str);
3123     if (len == 0) return 0;
3124     if (STR_EMBED_P(str)) {
3125         capa = str_embed_capa(str) - termlen;
3126         sptr = RSTRING(str)->as.embed.ary;
3127         olen = RSTRING_EMBED_LEN(str);
3128     }
3129     else {
3130         capa = RSTRING(str)->as.heap.aux.capa;
3131         sptr = RSTRING(str)->as.heap.ptr;
3132         olen = RSTRING(str)->as.heap.len;
3133     }
3134     if (olen > LONG_MAX - len) {
3135         rb_raise(rb_eArgError, "string sizes too big");
3136     }
3137     total = olen + len;
3138     if (capa < total) {
3139         if (total >= LONG_MAX / 2) {
3140             capa = total;
3141         }
3142         while (total > capa) {
3143             capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3144         }
3145         RESIZE_CAPA_TERM(str, capa, termlen);
3146         sptr = RSTRING_PTR(str);
3147     }
3148     if (off != -1) {
3149         ptr = sptr + off;
3150     }
3151     memcpy(sptr + olen, ptr, len);
3152     STR_SET_LEN(str, total);
3153     TERM_FILL(sptr + total, termlen); /* sentinel */
3154
3155     return str;
3156 }
3157
3158 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3159
3160 VALUE
3161 rb_str_cat(VALUE str, const char *ptr, long len)
3162 {
3163     if (len == 0) return str;
3164     if (len < 0) {
3165         rb_raise(rb_eArgError, "negative string size (or size too big)");
3166     }
3167     return str_buf_cat(str, ptr, len);
3168 }
3169
3170 VALUE
3171 rb_str_cat_cstr(VALUE str, const char *ptr)
3172 {
3173     must_not_null(ptr);
3174     return rb_str_buf_cat(str, ptr, strlen(ptr));
3175 }
3176
3177 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3178 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3179 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3180
3181 static VALUE
3182 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3183     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3184 {
3185     int str_encindex = ENCODING_GET(str);
3186     int res_encindex;
3187     int str_cr, res_cr;
3188     rb_encoding *str_enc, *ptr_enc;
3189
3190     str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3191
3192     if (str_encindex == ptr_encindex) {
3193         if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3194             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3195         }
3196     }
3197     else {
3198         str_enc = rb_enc_from_index(str_encindex);
3199         ptr_enc = rb_enc_from_index(ptr_encindex);
3200         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3201             if (len == 0)
3202                 return str;
3203             if (RSTRING_LEN(str) == 0) {
3204                 rb_str_buf_cat(str, ptr, len);
3205                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3206                 return str;
3207             }
3208             goto incompatible;
3209         }
3210         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3211             ptr_cr = coderange_scan(ptr, len, ptr_enc);
3212         }
3213         if (str_cr == ENC_CODERANGE_UNKNOWN) {
3214             if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3215                 str_cr = rb_enc_str_coderange(str);
3216             }
3217         }
3218     }
3219     if (ptr_cr_ret)
3220         *ptr_cr_ret = ptr_cr;
3221
3222     if (str_encindex != ptr_encindex &&
3223         str_cr != ENC_CODERANGE_7BIT &&
3224         ptr_cr != ENC_CODERANGE_7BIT) {
3225         str_enc = rb_enc_from_index(str_encindex);
3226         ptr_enc = rb_enc_from_index(ptr_encindex);
3227         goto incompatible;
3228     }
3229
3230     if (str_cr == ENC_CODERANGE_UNKNOWN) {
3231         res_encindex = str_encindex;
3232         res_cr = ENC_CODERANGE_UNKNOWN;
3233     }
3234     else if (str_cr == ENC_CODERANGE_7BIT) {
3235         if (ptr_cr == ENC_CODERANGE_7BIT) {
3236             res_encindex = str_encindex;
3237             res_cr = ENC_CODERANGE_7BIT;
3238         }
3239         else {
3240             res_encindex = ptr_encindex;
3241             res_cr = ptr_cr;
3242         }
3243     }
3244     else if (str_cr == ENC_CODERANGE_VALID) {
3245         res_encindex = str_encindex;
3246         if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3247             res_cr = str_cr;
3248         else
3249             res_cr = ptr_cr;
3250     }
3251     else { /* str_cr == ENC_CODERANGE_BROKEN */
3252         res_encindex = str_encindex;
3253         res_cr = str_cr;
3254         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3255     }
3256
3257     if (len < 0) {
3258         rb_raise(rb_eArgError, "negative string size (or size too big)");
3259     }
3260     str_buf_cat(str, ptr, len);
3261     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3262     return str;
3263
3264   incompatible:
3265     rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3266              rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3267     UNREACHABLE_RETURN(Qundef);
3268 }
3269
3270 VALUE
3271 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3272 {
3273     return rb_enc_cr_str_buf_cat(str, ptr, len,
3274         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3275 }
3276
3277 VALUE
3278 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3279 {
3280     /* ptr must reference NUL terminated ASCII string. */
3281     int encindex = ENCODING_GET(str);
3282     rb_encoding *enc = rb_enc_from_index(encindex);
3283     if (rb_enc_asciicompat(enc)) {
3284         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3285             encindex, ENC_CODERANGE_7BIT, 0);
3286     }
3287     else {
3288         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3289         while (*ptr) {
3290             unsigned int c = (unsigned char)*ptr;
3291             int len = rb_enc_codelen(c, enc);
3292             rb_enc_mbcput(c, buf, enc);
3293             rb_enc_cr_str_buf_cat(str, buf, len,
3294                 encindex, ENC_CODERANGE_VALID, 0);
3295             ptr++;
3296         }
3297         return str;
3298     }
3299 }
3300
3301 VALUE
3302 rb_str_buf_append(VALUE str, VALUE str2)
3303 {
3304     int str2_cr;
3305
3306     str2_cr = ENC_CODERANGE(str2);
3307
3308     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3309         ENCODING_GET(str2), str2_cr, &str2_cr);
3310
3311     ENC_CODERANGE_SET(str2, str2_cr);
3312
3313     return str;
3314 }
3315
3316 VALUE
3317 rb_str_append(VALUE str, VALUE str2)
3318 {
3319     StringValue(str2);
3320     return rb_str_buf_append(str, str2);
3321 }
3322
3323 #define MIN_PRE_ALLOC_SIZE 48
3324
3325 MJIT_FUNC_EXPORTED VALUE
3326 rb_str_concat_literals(size_t num, const VALUE *strary)
3327 {
3328     VALUE str;
3329     size_t i, s;
3330     long len = 1;
3331
3332     if (UNLIKELY(!num)) return rb_str_new(0, 0);
3333     if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3334
3335     for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3336     if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3337         str = rb_str_resurrect(strary[0]);
3338         s = 1;
3339     }
3340     else {
3341         str = rb_str_buf_new(len);
3342         rb_enc_copy(str, strary[0]);
3343         s = 0;
3344     }
3345
3346     for (i = s; i < num; ++i) {
3347         const VALUE v = strary[i];
3348         int encidx = ENCODING_GET(v);
3349
3350         rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3351                               encidx, ENC_CODERANGE(v), NULL);
3352         if (encidx != ENCINDEX_US_ASCII) {
3353             if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3354                 rb_enc_set_index(str, encidx);
3355         }
3356     }
3357     return str;
3358 }
3359
3360 /*
3361  *  call-seq:
3362  *     concat(*objects) -> string
3363  *
3364  *  Concatenates each object in +objects+ to +self+ and returns +self+:
3365  *
3366  *    s = 'foo'
3367  *    s.concat('bar', 'baz') # => "foobarbaz"
3368  *    s                      # => "foobarbaz"
3369  *
3370  *  For each given object +object+ that is an \Integer,
3371  *  the value is considered a codepoint and converted to a character before concatenation:
3372  *
3373  *    s = 'foo'
3374  *    s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3375  *
3376  *  Related: String#<<, which takes a single argument.
3377  */
3378 static VALUE
3379 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3380 {
3381     str_modifiable(str);
3382
3383     if (argc == 1) {
3384         return rb_str_concat(str, argv[0]);
3385     }
3386     else if (argc > 1) {
3387         int i;
3388         VALUE arg_str = rb_str_tmp_new(0);
3389         rb_enc_copy(arg_str, str);
3390         for (i = 0; i < argc; i++) {
3391             rb_str_concat(arg_str, argv[i]);
3392         }
3393         rb_str_buf_append(str, arg_str);
3394     }
3395
3396     return str;
3397 }
3398
3399 /*
3400  *  call-seq:
3401  *    string << object -> string
3402  *
3403  *  Concatenates +object+ to +self+ and returns +self+:
3404  *
3405  *    s = 'foo'
3406  *    s << 'bar' # => "foobar"
3407  *    s          # => "foobar"
3408  *
3409  *  If +object+ is an \Integer,
3410  *  the value is considered a codepoint and converted to a character before concatenation:
3411  *
3412  *    s = 'foo'
3413  *    s << 33 # => "foo!"
3414  *
3415  *  Related: String#concat, which takes multiple arguments.
3416  */
3417 VALUE
3418 rb_str_concat(VALUE str1, VALUE str2)
3419 {
3420     unsigned int code;
3421     rb_encoding *enc = STR_ENC_GET(str1);
3422     int encidx;
3423
3424     if (RB_INTEGER_TYPE_P(str2)) {
3425         if (rb_num_to_uint(str2, &code) == 0) {
3426         }
3427         else if (FIXNUM_P(str2)) {
3428             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3429         }
3430         else {
3431             rb_raise(rb_eRangeError, "bignum out of char range");
3432         }
3433     }
3434     else {
3435         return rb_str_append(str1, str2);
3436     }
3437
3438     encidx = rb_enc_to_index(enc);
3439     if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3440         /* US-ASCII automatically extended to ASCII-8BIT */
3441         char buf[1];
3442         buf[0] = (char)code;
3443         if (code > 0xFF) {
3444             rb_raise(rb_eRangeError, "%u out of char range", code);
3445         }
3446         rb_str_cat(str1, buf, 1);
3447         if (encidx == ENCINDEX_US_ASCII && code > 127) {
3448             rb_enc_associate_index(str1, ENCINDEX_ASCII);
3449             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
3450         }
3451     }
3452     else {
3453         long pos = RSTRING_LEN(str1);
3454         int cr = ENC_CODERANGE(str1);
3455         int len;
3456         char *buf;
3457
3458         switch (len = rb_enc_codelen(code, enc)) {
3459           case ONIGERR_INVALID_CODE_POINT_VALUE:
3460             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3461             break;
3462           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3463           case 0:
3464             rb_raise(rb_eRangeError, "%u out of char range", code);
3465             break;
3466         }
3467         buf = ALLOCA_N(char, len + 1);
3468         rb_enc_mbcput(code, buf, enc);
3469         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3470             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3471         }
3472         rb_str_resize(str1, pos+len);
3473         memcpy(RSTRING_PTR(str1) + pos, buf, len);
3474         if (cr == ENC_CODERANGE_7BIT && code > 127)
3475             cr = ENC_CODERANGE_VALID;
3476         ENC_CODERANGE_SET(str1, cr);
3477     }
3478     return str1;
3479 }
3480
3481 /*
3482  *  call-seq:
3483  *    prepend(*other_strings)  -> string
3484  *
3485  *  Prepends each string in +other_strings+ to +self+ and returns +self+:
3486  *
3487  *    s = 'foo'
3488  *    s.prepend('bar', 'baz') # => "barbazfoo"
3489  *    s                       # => "barbazfoo"
3490  *
3491  *  Related: String#concat.
3492  */
3493
3494 static VALUE
3495 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3496 {
3497     str_modifiable(str);
3498
3499     if (argc == 1) {
3500         rb_str_update(str, 0L, 0L, argv[0]);
3501     }
3502     else if (argc > 1) {
3503         int i;
3504         VALUE arg_str = rb_str_tmp_new(0);
3505         rb_enc_copy(arg_str, str);
3506         for (i = 0; i < argc; i++) {
3507             rb_str_append(arg_str, argv[i]);
3508         }
3509         rb_str_update(str, 0L, 0L, arg_str);
3510     }
3511
3512     return str;
3513 }
3514
3515 st_index_t
3516 rb_str_hash(VALUE str)
3517 {
3518     int e = ENCODING_GET(str);
3519     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3520         e = 0;
3521     }
3522     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3523 }
3524
3525 int
3526 rb_str_hash_cmp(VALUE str1, VALUE str2)
3527 {
3528     long len1, len2;
3529     const char *ptr1, *ptr2;
3530     RSTRING_GETMEM(str1, ptr1, len1);
3531     RSTRING_GETMEM(str2, ptr2, len2);
3532     return (len1 != len2 ||
3533             !rb_str_comparable(str1, str2) ||
3534             memcmp(ptr1, ptr2, len1) != 0);
3535 }
3536
3537 /*
3538  * call-seq:
3539  *   hash -> integer
3540  *
3541  * Returns the integer hash value for +self+.
3542  * The value is based on the length, content and encoding of +self+.
3543  *
3544  * Related: Object#hash.
3545  */
3546
3547 static VALUE
3548 rb_str_hash_m(VALUE str)
3549 {
3550     st_index_t hval = rb_str_hash(str);
3551     return ST2FIX(hval);
3552 }
3553
3554 #define lesser(a,b) (((a)>(b))?(b):(a))
3555
3556 int
3557 rb_str_comparable(VALUE str1, VALUE str2)
3558 {
3559     int idx1, idx2;
3560     int rc1, rc2;
3561
3562     if (RSTRING_LEN(str1) == 0) return TRUE;
3563     if (RSTRING_LEN(str2) == 0) return TRUE;
3564     idx1 = ENCODING_GET(str1);
3565     idx2 = ENCODING_GET(str2);
3566     if (idx1 == idx2) return TRUE;
3567     rc1 = rb_enc_str_coderange(str1);
3568     rc2 = rb_enc_str_coderange(str2);
3569     if (rc1 == ENC_CODERANGE_7BIT) {
3570         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3571         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3572             return TRUE;
3573     }
3574     if (rc2 == ENC_CODERANGE_7BIT) {
3575         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3576             return TRUE;
3577     }
3578     return FALSE;
3579 }
3580
3581 int
3582 rb_str_cmp(VALUE str1, VALUE str2)
3583 {
3584     long len1, len2;
3585     const char *ptr1, *ptr2;
3586     int retval;
3587
3588     if (str1 == str2) return 0;
3589     RSTRING_GETMEM(str1, ptr1, len1);
3590     RSTRING_GETMEM(str2, ptr2, len2);
3591     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3592         if (len1 == len2) {
3593             if (!rb_str_comparable(str1, str2)) {
3594                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3595                     return 1;
3596                 return -1;
3597             }
3598             return 0;
3599         }
3600         if (len1 > len2) return 1;
3601         return -1;
3602     }
3603     if (retval > 0) return 1;
3604     return -1;
3605 }
3606
3607 /*
3608  *  call-seq:
3609  *    string == object -> true or false
3610  *    string === object -> true or false
3611  *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3614  *
3615  *    s = 'foo'
3616  *    s == 'foo' # => true
3617  *    s == 'food' # => false
3618  *    s == 'FOO' # => false
3619  *
 *  Returns +false+ if the two strings' encodings are not compatible:
 *
 *    "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3622  *
3623  *  If +object+ is not an instance of \String but responds to +to_str+, then the
3624  *  two strings are compared using <code>object.==</code>.
3625  */
3626
3627 VALUE
3628 rb_str_equal(VALUE str1, VALUE str2)
3629 {
3630     if (str1 == str2) return Qtrue;
3631     if (!RB_TYPE_P(str2, T_STRING)) {
3632         if (!rb_respond_to(str2, idTo_str)) {
3633             return Qfalse;
3634         }
3635         return rb_equal(str2, str1);
3636     }
3637     return rb_str_eql_internal(str1, str2);
3638 }
3639
3640 /*
3641  * call-seq:
3642  *   eql?(object) -> true or false
3643  *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3646  *
3647  *    s = 'foo'
3648  *    s.eql?('foo') # => true
3649  *    s.eql?('food') # => false
3650  *    s.eql?('FOO') # => false
3651  *
3652  *  Returns +false+ if the two strings' encodings are not compatible:
3653  *
3654  *    "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3655  *
3656  */
3657
3658 MJIT_FUNC_EXPORTED VALUE
3659 rb_str_eql(VALUE str1, VALUE str2)
3660 {
3661     if (str1 == str2) return Qtrue;
3662     if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3663     return rb_str_eql_internal(str1, str2);
3664 }
3665
3666 /*
3667  *  call-seq:
3668  *    string <=> other_string -> -1, 0, 1, or nil
3669  *
3670  *  Compares +self+ and +other_string+, returning:
3671  *
3672  *  - -1 if +other_string+ is larger.
3673  *  - 0 if the two are equal.
3674  *  - 1 if +other_string+ is smaller.
3675  *  - +nil+ if the two are incomparable.
3676  *
3677  *  Examples:
3678  *
3679  *    'foo' <=> 'foo' # => 0
3680  *    'foo' <=> 'food' # => -1
3681  *    'food' <=> 'foo' # => 1
3682  *    'FOO' <=> 'foo' # => -1
3683  *    'foo' <=> 'FOO' # => 1
3684  *    'foo' <=> 1 # => nil
3685  *
3686  */
3687
3688 static VALUE
3689 rb_str_cmp_m(VALUE str1, VALUE str2)
3690 {
3691     int result;
3692     VALUE s = rb_check_string_type(str2);
3693     if (NIL_P(s)) {
3694         return rb_invcmp(str1, str2);
3695     }
3696     result = rb_str_cmp(str1, s);
3697     return INT2FIX(result);
3698 }
3699
3700 static VALUE str_casecmp(VALUE str1, VALUE str2);
3701 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3702
3703 /*
3704  *  call-seq:
3705  *    casecmp(other_string) -> -1, 0, 1, or nil
3706  *
3707  *  Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3708  *
3709  *  - -1 if <tt>other_string.downcase</tt> is larger.
3710  *  - 0 if the two are equal.
3711  *  - 1 if <tt>other_string.downcase</tt> is smaller.
3712  *  - +nil+ if the two are incomparable.
3713  *
3714  *  Examples:
3715  *
3716  *    'foo'.casecmp('foo') # => 0
3717  *    'foo'.casecmp('food') # => -1
3718  *    'food'.casecmp('foo') # => 1
3719  *    'FOO'.casecmp('foo') # => 0
3720  *    'foo'.casecmp('FOO') # => 0
3721  *    'foo'.casecmp(1) # => nil
3722  *
3723  *  See {Case Mapping}[doc/case_mapping_rdoc.html].
3724  *
3725  *  Related: String#casecmp?.
3726  *
3727  */
3728
3729 static VALUE
3730 rb_str_casecmp(VALUE str1, VALUE str2)
3731 {
3732     VALUE s = rb_check_string_type(str2);
3733     if (NIL_P(s)) {
3734         return Qnil;
3735     }
3736     return str_casecmp(str1, s);
3737 }
3738
3739 static VALUE
3740 str_casecmp(VALUE str1, VALUE str2)
3741 {
3742     long len;
3743     rb_encoding *enc;
3744     const char *p1, *p1end, *p2, *p2end;
3745
3746     enc = rb_enc_compatible(str1, str2);
3747     if (!enc) {
3748         return Qnil;
3749     }
3750
3751     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3752     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3753     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3754         while (p1 < p1end && p2 < p2end) {
3755             if (*p1 != *p2) {
3756                 unsigned int c1 = TOLOWER(*p1 & 0xff);
3757                 unsigned int c2 = TOLOWER(*p2 & 0xff);
3758                 if (c1 != c2)
3759                     return INT2FIX(c1 < c2 ? -1 : 1);
3760             }
3761             p1++;
3762             p2++;
3763         }
3764     }
3765     else {
3766         while (p1 < p1end && p2 < p2end) {
3767             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3768             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3769
3770             if (0 <= c1 && 0 <= c2) {
3771                 c1 = TOLOWER(c1);
3772                 c2 = TOLOWER(c2);
3773                 if (c1 != c2)
3774                     return INT2FIX(c1 < c2 ? -1 : 1);
3775             }
3776             else {
3777                 int r;
3778                 l1 = rb_enc_mbclen(p1, p1end, enc);
3779                 l2 = rb_enc_mbclen(p2, p2end, enc);
3780                 len = l1 < l2 ? l1 : l2;
3781                 r = memcmp(p1, p2, len);
3782                 if (r != 0)
3783                     return INT2FIX(r < 0 ? -1 : 1);
3784                 if (l1 != l2)
3785                     return INT2FIX(l1 < l2 ? -1 : 1);
3786             }
3787             p1 += l1;
3788             p2 += l2;
3789         }
3790     }
3791     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3792     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3793     return INT2FIX(-1);
3794 }
3795
3796 /*
3797  *  call-seq:
3798  *    casecmp?(other_string) -> true, false, or nil
3799  *
3800  *  Returns +true+ if +self+ and +other_string+ are equal after
3801  *  Unicode case folding, otherwise +false+:
3802  *
3803  *    'foo'.casecmp?('foo') # => true
3804  *    'foo'.casecmp?('food') # => false
3805  *    'food'.casecmp?('foo') # => false
3806  *    'FOO'.casecmp?('foo') # => true
3807  *    'foo'.casecmp?('FOO') # => true
3808  *
3809  *  Returns +nil+ if the two values are incomparable:
3810  *
3811  *    'foo'.casecmp?(1) # => nil
3812  *
3813  *  See {Case Mapping}[doc/case_mapping_rdoc.html].
3814  *
3815  *  Related: String#casecmp.
3816  *
3817  */
3818
3819 static VALUE
3820 rb_str_casecmp_p(VALUE str1, VALUE str2)
3821 {
3822     VALUE s = rb_check_string_type(str2);
3823     if (NIL_P(s)) {
3824         return Qnil;
3825     }
3826     return str_casecmp_p(str1, s);
3827 }
3828
3829 static VALUE
3830 str_casecmp_p(VALUE str1, VALUE str2)
3831 {
3832     rb_encoding *enc;
3833     VALUE folded_str1, folded_str2;
3834     VALUE fold_opt = sym_fold;
3835
3836     enc = rb_enc_compatible(str1, str2);
3837     if (!enc) {
3838         return Qnil;
3839     }
3840
3841     folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3842     folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3843
3844     return rb_str_eql(folded_str1, folded_str2);
3845 }
3846
3847 static long
3848 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3849             const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3850 {
3851     const char *search_start = str_ptr;
3852     long pos, search_len = str_len - offset;
3853
3854     for (;;) {
3855         const char *t;
3856         pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3857         if (pos < 0) return pos;
3858         t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3859         if (t == search_start + pos) break;
3860         search_len -= t - search_start;
3861         if (search_len <= 0) return -1;
3862         offset += t - search_start;
3863         search_start = t;
3864     }
3865     return pos + offset;
3866 }
3867
3868 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3869
3870 static long
3871 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3872 {
3873     const char *str_ptr, *str_ptr_end, *sub_ptr;
3874     long str_len, sub_len;
3875     rb_encoding *enc;
3876
3877     enc = rb_enc_check(str, sub);
3878     if (is_broken_string(sub)) return -1;
3879
3880     str_ptr = RSTRING_PTR(str);
3881     str_ptr_end = RSTRING_END(str);
3882     str_len = RSTRING_LEN(str);
3883     sub_ptr = RSTRING_PTR(sub);
3884     sub_len = RSTRING_LEN(sub);
3885
3886     if (str_len < sub_len) return -1;
3887
3888     if (offset != 0) {
3889         long str_len_char, sub_len_char;
3890         int single_byte = single_byte_optimizable(str);
3891         str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3892         sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3893         if (offset < 0) {
3894             offset += str_len_char;
3895             if (offset < 0) return -1;
3896         }
3897         if (str_len_char - offset < sub_len_char) return -1;
3898         if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3899         str_ptr += offset;
3900     }
3901     if (sub_len == 0) return offset;
3902
3903     /* need proceed one character at a time */
3904     return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3905 }
3906
3907
3908 /*
3909  *  call-seq:
3910  *    index(substring, offset = 0) -> integer or nil
3911  *    index(regexp, offset = 0) -> integer or nil
3912  *
3913  *  Returns the \Integer index of the first occurrence of the given +substring+,
3914  *  or +nil+ if none found:
3915  *
3916  *    'foo'.index('f') # => 0
3917  *    'foo'.index('o') # => 1
3918  *    'foo'.index('oo') # => 1
3919  *    'foo'.index('ooo') # => nil
3920  *
3921  *  Returns the \Integer index of the first match for the given \Regexp +regexp+,
3922  *  or +nil+ if none found:
3923  *
3924  *    'foo'.index(/f/) # => 0
3925  *    'foo'.index(/o/) # => 1
3926  *    'foo'.index(/oo/) # => 1
3927  *    'foo'.index(/ooo/) # => nil
3928  *
3929  *  \Integer argument +offset+, if given, specifies the position in the
3930  *  string to begin the search:
3931  *
3932  *    'foo'.index('o', 1) # => 1
3933  *    'foo'.index('o', 2) # => 2
3934  *    'foo'.index('o', 3) # => nil
3935  *
3936  *  If +offset+ is negative, counts backward from the end of +self+:
3937  *
3938  *    'foo'.index('o', -1) # => 2
3939  *    'foo'.index('o', -2) # => 1
3940  *    'foo'.index('o', -3) # => 1
3941  *    'foo'.index('o', -4) # => nil
3942  *
3943  *  Related: String#rindex.
3944  */
3945
3946 static VALUE
3947 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3948 {
3949     VALUE sub;
3950     VALUE initpos;
3951     long pos;
3952
3953     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3954         pos = NUM2LONG(initpos);
3955     }
3956     else {
3957         pos = 0;
3958     }
3959     if (pos < 0) {
3960         pos += str_strlen(str, NULL);
3961         if (pos < 0) {
3962             if (RB_TYPE_P(sub, T_REGEXP)) {
3963                 rb_backref_set(Qnil);
3964             }
3965             return Qnil;
3966         }
3967     }
3968
3969     if (RB_TYPE_P(sub, T_REGEXP)) {
3970         if (pos > str_strlen(str, NULL))
3971             return Qnil;
3972         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3973                          rb_enc_check(str, sub), single_byte_optimizable(str));
3974
3975         if (rb_reg_search(sub, str, pos, 0) < 0) {
3976             return Qnil;
3977         }
3978         else {
3979             VALUE match = rb_backref_get();
3980             struct re_registers *regs = RMATCH_REGS(match);
3981             pos = rb_str_sublen(str, BEG(0));
3982             return LONG2NUM(pos);
3983         }
3984     }
3985     else {
3986         StringValue(sub);
3987         pos = rb_str_index(str, sub, pos);
3988         pos = rb_str_sublen(str, pos);
3989     }
3990
3991     if (pos == -1) return Qnil;
3992     return LONG2NUM(pos);
3993 }
3994
3995 #ifdef HAVE_MEMRCHR
3996 static long
3997 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3998 {
3999     char *hit, *adjusted;
4000     int c;
4001     long slen, searchlen;
4002     char *sbeg, *e, *t;
4003
4004     slen = RSTRING_LEN(sub);
4005     if (slen == 0) return pos;
4006     sbeg = RSTRING_PTR(str);
4007     e = RSTRING_END(str);
4008     t = RSTRING_PTR(sub);
4009     c = *t & 0xff;
4010     searchlen = s - sbeg + 1;
4011
4012     do {
4013         hit = memrchr(sbeg, c, searchlen);
4014         if (!hit) break;
4015         adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4016         if (hit != adjusted) {
4017             searchlen = adjusted - sbeg;
4018             continue;
4019         }
4020         if (memcmp(hit, t, slen) == 0)
4021             return rb_str_sublen(str, hit - sbeg);
4022         searchlen = adjusted - sbeg;
4023     } while (searchlen > 0);
4024
4025     return -1;
4026 }
4027 #else
4028 static long
4029 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
4030 {
4031     long slen;
4032     char *sbeg, *e, *t;
4033
4034     sbeg = RSTRING_PTR(str);
4035     e = RSTRING_END(str);
4036     t = RSTRING_PTR(sub);
4037     slen = RSTRING_LEN(sub);
4038
4039     while (s) {
4040         if (memcmp(s, t, slen) == 0) {
4041             return pos;
4042         }
4043         if (pos == 0) break;
4044         pos--;
4045         s = rb_enc_prev_char(sbeg, s, e, enc);
4046     }
4047
4048     return -1;
4049 }
4050 #endif
4051
4052 static long
4053 rb_str_rindex(VALUE str, VALUE sub, long pos)
4054 {
4055     long len, slen;
4056     char *sbeg, *s;
4057     rb_encoding *enc;
4058     int singlebyte;
4059
4060     enc = rb_enc_check(str, sub);
4061     if (is_broken_string(sub)) return -1;
4062     singlebyte = single_byte_optimizable(str);
4063     len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4064     slen = str_strlen(sub, enc); /* rb_enc_check */
4065
4066     /* substring longer than string */
4067     if (len < slen) return -1;
4068     if (len - pos < slen) pos = len - slen;
4069     if (len == 0) return pos;
4070
4071     sbeg = RSTRING_PTR(str);
4072
4073     if (pos == 0) {
4074         if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4075             return 0;
4076         else
4077             return -1;
4078     }
4079
4080     s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4081     return str_rindex(str, sub, s, pos, enc);
4082 }
4083
4084 /*
4085  *  call-seq:
4086  *    rindex(substring, offset = self.length) -> integer or nil
4087  *    rindex(regexp, offset = self.length) -> integer or nil
4088  *
4089  *  Returns the \Integer index of the _last_ occurrence of the given +substring+,
4090  *  or +nil+ if none found:
4091  *
4092  *    'foo'.rindex('f') # => 0
4093  *    'foo'.rindex('o') # => 2
4094  *    'foo'.rindex('oo') # => 1
4095  *    'foo'.rindex('ooo') # => nil
4096  *
4097  *  Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4098  *  or +nil+ if none found:
4099  *
4100  *    'foo'.rindex(/f/) # => 0
4101  *    'foo'.rindex(/o/) # => 2
4102  *    'foo'.rindex(/oo/) # => 1
4103  *    'foo'.rindex(/ooo/) # => nil
4104  *
4105  *  The _last_ match means starting at the possible last position, not
4106  *  the last of longest matches.
4107  *
4108  *    'foo'.rindex(/o+/) # => 2
4109  *    $~ #=> #<MatchData "o">
4110  *
4111  *  To get the last longest match, combine the pattern with a negative
4112  *  lookbehind:
4113  *
4114  *    'foo'.rindex(/(?<!o)o+/) # => 1
4115  *    $~ #=> #<MatchData "oo">
4116  *
4117  *  Or use String#index with a negative lookahead:
4118  *
4119  *    'foo'.index(/o+(?!.*o)/) # => 1
4120  *    $~ #=> #<MatchData "oo">
4121  *
4122  *  \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4123  *   string to _end_ the search:
4124  *
4125  *    'foo'.rindex('o', 0) # => nil
4126  *    'foo'.rindex('o', 1) # => 1
4127  *    'foo'.rindex('o', 2) # => 2
4128  *    'foo'.rindex('o', 3) # => 2
4129  *
4130  *  If +offset+ is a negative \Integer, the maximum starting position in the
4131  *  string to _end_ the search is the sum of the string's length and +offset+:
4132  *
4133  *    'foo'.rindex('o', -1) # => 2
4134  *    'foo'.rindex('o', -2) # => 1
4135  *    'foo'.rindex('o', -3) # => nil
4136  *    'foo'.rindex('o', -4) # => nil
4137  *
4138  *  Related: String#index.
4139  */
4140
4141 static VALUE
4142 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4143 {
4144     VALUE sub;
4145     VALUE vpos;
4146     rb_encoding *enc = STR_ENC_GET(str);
4147     long pos, len = str_strlen(str, enc); /* str's enc */
4148
4149     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4150         pos = NUM2LONG(vpos);
4151         if (pos < 0) {
4152             pos += len;
4153             if (pos < 0) {
4154                 if (RB_TYPE_P(sub, T_REGEXP)) {
4155                     rb_backref_set(Qnil);
4156                 }
4157                 return Qnil;
4158             }
4159         }
4160         if (pos > len) pos = len;
4161     }
4162     else {
4163         pos = len;
4164     }
4165
4166     if (RB_TYPE_P(sub, T_REGEXP)) {
4167         /* enc = rb_get_check(str, sub); */
4168         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4169                          enc, single_byte_optimizable(str));
4170
4171         if (rb_reg_search(sub, str, pos, 1) >= 0) {
4172             VALUE match = rb_backref_get();
4173             struct re_registers *regs = RMATCH_REGS(match);
4174             pos = rb_str_sublen(str, BEG(0));
4175             return LONG2NUM(pos);
4176         }
4177     }
4178     else {
4179         StringValue(sub);
4180         pos = rb_str_rindex(str, sub, pos);
4181         if (pos >= 0) return LONG2NUM(pos);
4182     }
4183     return Qnil;
4184 }
4185
4186 /*
4187  *  call-seq:
4188  *    string =~ regexp -> integer or nil
4189  *    string =~ object -> integer or nil
4190  *
4191  *  Returns the \Integer index of the first substring that matches
4192  *  the given +regexp+, or +nil+ if no match found:
4193  *
4194  *    'foo' =~ /f/ # => 0
4195  *    'foo' =~ /o/ # => 1
4196  *    'foo' =~ /x/ # => nil
4197  *
4198  *  Note: also updates
4199  *  {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4200  *
4201  *  If the given +object+ is not a \Regexp, returns the value
4202  *  returned by <tt>object =~ self</tt>.
4203  *
4204  *  Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4205  *  (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4206  *
4207  *    number= nil
4208  *    "no. 9" =~ /(?<number>\d+)/
4209  *    number # => nil (not assigned)
4210  *    /(?<number>\d+)/ =~ "no. 9"
4211  *    number #=> "9"
4212  *
4213  */
4214
4215 static VALUE
4216 rb_str_match(VALUE x, VALUE y)
4217 {
4218     switch (OBJ_BUILTIN_TYPE(y)) {
4219       case T_STRING:
4220         rb_raise(rb_eTypeError, "type mismatch: String given");
4221
4222       case T_REGEXP:
4223         return rb_reg_match(y, x);
4224
4225       default:
4226         return rb_funcall(y, idEqTilde, 1, x);
4227     }
4228 }
4229
4230
4231 static VALUE get_pat(VALUE);
4232
4233
4234 /*
4235  *  call-seq:
4236  *    match(pattern, offset = 0) -> matchdata or nil
4237  *    match(pattern, offset = 0) {|matchdata| ... } -> object
4238  *
4239  *  Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4240  *
4241  *  Note: also updates
4242  *  {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4243  *
4244  *  - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4245  *      regexp = Regexp.new(pattern)
4246  *  - Computes +matchdata+, which will be either a \MatchData object or +nil+
4247  *    (see Regexp#match):
4248  *      matchdata = regexp.match(self)
4249  *
4250  *  With no block given, returns the computed +matchdata+:
4251  *
4252  *    'foo'.match('f') # => #<MatchData "f">
4253  *    'foo'.match('o') # => #<MatchData "o">
4254  *    'foo'.match('x') # => nil
4255  *
4256  *  If \Integer argument +offset+ is given, the search begins at index +offset+:
4257  *
4258  *    'foo'.match('f', 1) # => nil
4259  *    'foo'.match('o', 1) # => #<MatchData "o">
4260  *
4261  *  With a block given, calls the block with the computed +matchdata+
4262  *  and returns the block's return value:
4263  *
4264  *    'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4265  *    'foo'.match(/x/) {|matchdata| matchdata } # => nil
4266  *    'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4267  *
4268  */
4269
4270 static VALUE
4271 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4272 {
4273     VALUE re, result;
4274     if (argc < 1)
4275         rb_check_arity(argc, 1, 2);
4276     re = argv[0];
4277     argv[0] = str;
4278     result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4279     if (!NIL_P(result) && rb_block_given_p()) {
4280         return rb_yield(result);
4281     }
4282     return result;
4283 }
4284
4285 /*
4286  *  call-seq:
4287  *    match?(pattern, offset = 0) -> true or false
4288  *
4289  *  Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4290  *
4291  *  Note: does not update
4292  *  {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4293  *
4294  *  Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4295  *    regexp = Regexp.new(pattern)
4296  *
4297  *  Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4298  *  +false+ otherwise:
4299  *
4300  *    'foo'.match?(/o/) # => true
4301  *    'foo'.match?('o') # => true
4302  *    'foo'.match?(/x/) # => false
4303  *
4304  *  If \Integer argument +offset+ is given, the search begins at index +offset+:
4305  *    'foo'.match?('f', 1) # => false
4306  *    'foo'.match?('o', 1) # => true
4307  *
4308  */
4309
4310 static VALUE
4311 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4312 {
4313     VALUE re;
4314     rb_check_arity(argc, 1, 2);
4315     re = get_pat(argv[0]);
4316     return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4317 }
4318
/* Result of stepping a character to its successor or predecessor
 * (see enc_succ_char, enc_pred_char and enc_succ_alnum_char). */
4319 enum neighbor_char {
4320     NEIGHBOR_NOT_CHAR, /* the bytes are not a usable character; no neighbor produced */
4321     NEIGHBOR_FOUND,    /* a neighbor of the same byte length was written in place */
4322     NEIGHBOR_WRAPPED   /* no neighbor of that length exists; the caller must carry */
4323 };
4324
/*
 * Overwrites the +len+ bytes at +p+ with the next character of the same
 * byte length in +enc+.
 *
 * Returns NEIGHBOR_FOUND when a successor was written, NEIGHBOR_WRAPPED
 * when no successor of +len+ bytes exists (the incremented code point
 * needs a different byte count, or every byte was 0xff -- in the latter
 * case the buffer is left as all zero bytes), and NEIGHBOR_NOT_CHAR when
 * the bytes do not form a character in a multi-byte-minimum encoding.
 */
4325 static enum neighbor_char
4326 enc_succ_char(char *p, long len, rb_encoding *enc)
4327 {
4328     long i;
4329     int l;
4330
4331     if (rb_enc_mbminlen(enc) > 1) {
4332         /* wchar, trivial case */
              /* Decode, add one to the code point, and re-encode in place. */
4333         int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4334         if (!MBCLEN_CHARFOUND_P(r)) {
4335             return NEIGHBOR_NOT_CHAR;
4336         }
4337         c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4338         l = rb_enc_code_to_mbclen(c, enc);
4339         if (!l) return NEIGHBOR_NOT_CHAR;
4340         if (l != len) return NEIGHBOR_WRAPPED;
4341         rb_enc_mbcput(c, p, enc);
              /* Re-validate what was just written. */
4342         r = rb_enc_precise_mbclen(p, p + len, enc);
4343         if (!MBCLEN_CHARFOUND_P(r)) {
4344             return NEIGHBOR_NOT_CHAR;
4345         }
4346         return NEIGHBOR_FOUND;
4347     }
          /* Otherwise treat the bytes as a big-endian counter and keep
           * incrementing until the result is a valid +len+-byte character. */
4348     while (1) {
              /* Trailing 0xff bytes roll over to 0; the carry lands on p[i]. */
4349         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4350             p[i] = '\0';
4351         if (i < 0)
4352             return NEIGHBOR_WRAPPED;
4353         ++((unsigned char*)p)[i];
4354         l = rb_enc_precise_mbclen(p, p+len, enc);
4355         if (MBCLEN_CHARFOUND_P(l)) {
4356             l = MBCLEN_CHARFOUND_LEN(l);
4357             if (l == len) {
4358                 return NEIGHBOR_FOUND;
4359             }
4360             else {
                      /* A shorter character was recognized: fill the tail
                       * with 0xff so the next pass carries into that prefix. */
4361                 memset(p+l, 0xff, len-l);
4362             }
4363         }
4364         if (MBCLEN_INVALID_P(l) && i < len-1) {
                  /* Invalid after bumping a non-final byte: find the longest
                   * prefix that is not reported invalid and fill everything
                   * after the following byte with 0xff (presumably to skip
                   * suffix values that cannot become valid). */
4365             long len2;
4366             int l2;
4367             for (len2 = len-1; 0 < len2; len2--) {
4368                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4369                 if (!MBCLEN_INVALID_P(l2))
4370                     break;
4371             }
4372             memset(p+len2+1, 0xff, len-(len2+1));
4373         }
4374     }
4375 }
4376
/*
 * Overwrites the +len+ bytes at +p+ with the previous character of the
 * same byte length in +enc+; the mirror image of enc_succ_char().
 *
 * Returns NEIGHBOR_FOUND when a predecessor was written, NEIGHBOR_WRAPPED
 * when no predecessor of +len+ bytes exists (the decremented code point
 * needs a different byte count, or every byte was 0 -- in the latter case
 * the buffer is left as all 0xff bytes), and NEIGHBOR_NOT_CHAR when the
 * bytes do not form a character, or the code point is already 0, in a
 * multi-byte-minimum encoding.
 */
4377 static enum neighbor_char
4378 enc_pred_char(char *p, long len, rb_encoding *enc)
4379 {
4380     long i;
4381     int l;
4382     if (rb_enc_mbminlen(enc) > 1) {
4383         /* wchar, trivial case */
              /* Decode, subtract one from the code point, and re-encode. */
4384         int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4385         if (!MBCLEN_CHARFOUND_P(r)) {
4386             return NEIGHBOR_NOT_CHAR;
4387         }
4388         c = rb_enc_mbc_to_codepoint(p, p + len, enc);
              /* Code point 0 has no predecessor. */
4389         if (!c) return NEIGHBOR_NOT_CHAR;
4390         --c;
4391         l = rb_enc_code_to_mbclen(c, enc);
4392         if (!l) return NEIGHBOR_NOT_CHAR;
4393         if (l != len) return NEIGHBOR_WRAPPED;
4394         rb_enc_mbcput(c, p, enc);
              /* Re-validate what was just written. */
4395         r = rb_enc_precise_mbclen(p, p + len, enc);
4396         if (!MBCLEN_CHARFOUND_P(r)) {
4397             return NEIGHBOR_NOT_CHAR;
4398         }
4399         return NEIGHBOR_FOUND;
4400     }
          /* Otherwise treat the bytes as a big-endian counter and keep
           * decrementing until the result is a valid +len+-byte character. */
4401     while (1) {
              /* Trailing 0 bytes borrow and become 0xff; the borrow lands on p[i]. */
4402         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4403             p[i] = '\xff';
4404         if (i < 0)
4405             return NEIGHBOR_WRAPPED;
4406         --((unsigned char*)p)[i];
4407         l = rb_enc_precise_mbclen(p, p+len, enc);
4408         if (MBCLEN_CHARFOUND_P(l)) {
4409             l = MBCLEN_CHARFOUND_LEN(l);
4410             if (l == len) {
4411                 return NEIGHBOR_FOUND;
4412             }
4413             else {
                      /* A shorter character was recognized: zero the tail
                       * so the next pass borrows from that prefix. */
4414                 memset(p+l, 0, len-l);
4415             }
4416         }
4417         if (MBCLEN_INVALID_P(l) && i < len-1) {
                  /* Invalid after lowering a non-final byte: find the longest
                   * prefix that is not reported invalid and zero everything
                   * after the following byte (presumably to skip suffix
                   * values that cannot become valid). */
4418             long len2;
4419             int l2;
4420             for (len2 = len-1; 0 < len2; len2--) {
4421                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4422                 if (!MBCLEN_INVALID_P(l2))
4423                     break;
4424             }
4425             memset(p+len2+1, 0, len-(len2+1));
4426         }
4427     }
4428 }
4429
4430 /*
4431   Overwrites +p+ with the succeeding letter in +enc+ and returns
4432   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4433   When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored
4434   into +carry+.  Assumes that each range is contiguous and that mbclen
4435   never changes within a range.
4436   NEIGHBOR_NOT_CHAR is returned if the character is neither a digit nor
4437   a letter, or if its range has only one character.
4438  */
4439 static enum neighbor_char
4440 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4441 {
4442     enum neighbor_char ret;
4443     unsigned int c;
4444     int ctype;
4445     int range;
4446     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4447
4448     /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4449     int try;
4450     const int max_gaps = 1;
4451
          /* Classify the character: only digits and letters are handled. */
4452     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4453     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4454         ctype = ONIGENC_CTYPE_DIGIT;
4455     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4456         ctype = ONIGENC_CTYPE_ALPHA;
4457     else
4458         return NEIGHBOR_NOT_CHAR;
4459
          /* Try the plain successor; allow up to max_gaps extra steps over
           * characters that fall outside the class. */
4460     MEMCPY(save, p, char, len);
4461     for (try = 0; try <= max_gaps; ++try) {
4462         ret = enc_succ_char(p, len, enc);
4463         if (ret == NEIGHBOR_FOUND) {
4464             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4465             if (rb_enc_isctype(c, ctype, enc))
4466                 return NEIGHBOR_FOUND;
4467         }
4468     }
          /* No same-class successor: restore the original character and walk
           * backwards to the first character of its range.  +range+ counts
           * the characters seen, starting character included. */
4469     MEMCPY(p, save, char, len);
4470     range = 1;
4471     while (1) {
4472         MEMCPY(save, p, char, len);
4473         ret = enc_pred_char(p, len, enc);
4474         if (ret == NEIGHBOR_FOUND) {
4475             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4476             if (!rb_enc_isctype(c, ctype, enc)) {
                      /* Stepped out of the class: undo the last step and stop. */
4477                 MEMCPY(p, save, char, len);
4478                 break;
4479             }
4480         }
4481         else {
4482             MEMCPY(p, save, char, len);
4483             break;
4484         }
4485         range++;
4486     }
          /* No same-class predecessor either: the range has a single member. */
4487     if (range == 1) {
4488         return NEIGHBOR_NOT_CHAR;
4489     }
4490
          /* +p+ now holds the first character of the range (the wrapped value).
           * For letters the carry is that same first character. */
4491     if (ctype != ONIGENC_CTYPE_DIGIT) {
4492         MEMCPY(carry, p, char, len);
4493         return NEIGHBOR_WRAPPED;
4494     }
4495
          /* For digits the carry is the successor of the range's first
           * character (presumably "1" when the range starts at "0"). */
4496     MEMCPY(carry, p, char, len);
4497     enc_succ_char(carry, len, enc);
4498     return NEIGHBOR_WRAPPED;
4499 }
4500
4501
4502 static VALUE str_succ(VALUE str);
4503
4504 /*
4505  *  call-seq:
4506  *    succ -> new_str
4507  *
4508  *  Returns the successor to +self+. The successor is calculated by
4509  *  incrementing characters.
4510  *
4511  *  The first character to be incremented is the rightmost alphanumeric,
4512  *  or, if there are no alphanumerics, the rightmost character:
4513  *
4514  *    'THX1138'.succ # => "THX1139"
4515  *    '<<koala>>'.succ # => "<<koalb>>"
4516  *    '***'.succ # => '**+'
4517  *
4518  *  The successor to a digit is another digit, "carrying" to the next-left
4519  *  character for a "rollover" from 9 to 0, and prepending another digit
4520  *  if necessary:
4521  *
4522  *    '00'.succ # => "01"
4523  *    '09'.succ # => "10"
4524  *    '99'.succ # => "100"
4525  *
4526  *  The successor to a letter is another letter of the same case,
4527  *  carrying to the next-left character for a rollover,
4528  *  and prepending another same-case letter if necessary:
4529  *
4530  *    'aa'.succ # => "ab"
4531  *    'az'.succ # => "ba"
4532  *    'zz'.succ # => "aaa"
4533  *    'AA'.succ # => "AB"
4534  *    'AZ'.succ # => "BA"
4535  *    'ZZ'.succ # => "AAA"
4536  *
4537  *  The successor to a non-alphanumeric character is the next character
4538  *  in the underlying character set's collating sequence,
4539  *  carrying to the next-left character for a rollover,
4540  *  and prepending another character if necessary:
4541  *
4542  *    s = 0.chr * 3
4543  *    s # => "\x00\x00\x00"
4544  *    s.succ # => "\x00\x00\x01"
4545  *    s = 255.chr * 3
4546  *    s # => "\xFF\xFF\xFF"
4547  *    s.succ # => "\x01\x00\x00\x00"
4548  *
4549  *  Carrying can occur between and among mixtures of alphanumeric characters:
4550  *
4551  *    s = 'zz99zz99'
4552  *    s.succ # => "aaa00aa00"
4553  *    s = '99zz99zz'
4554  *    s.succ # => "100aa00aa"
4555  *
4556  *  The successor to an empty \String is a new empty \String:
4557  *
4558  *    ''.succ # => ""
4559  *
4560  *  String#next is an alias for String#succ.
4561  */
4562
4563 VALUE
4564 rb_str_succ(VALUE orig)
4565 {
4566     VALUE str;
4567     str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4568     rb_enc_cr_str_copy_for_substr(str, orig);
4569     return str_succ(str);
4570 }
4571
/*
 * Replaces the contents of +str+ with its successor, in place, and
 * returns +str+.  The callers in this file pass either a fresh copy
 * (rb_str_succ) or a string already passed to rb_str_modify()
 * (rb_str_succ_bang).
 *
 * Pass 1 walks the characters from the end, incrementing the rightmost
 * alphanumeric with enc_succ_alnum_char() and carrying to the left for as
 * long as characters wrap.  If no alphanumeric was found, pass 2
 * increments the rightmost character with enc_succ_char() instead, again
 * carrying leftwards on wrap.  When the carry runs off the loop, +carry+
 * (+carry_len+ bytes) is inserted at byte offset +carry_pos+.
 */
4572 static VALUE
4573 str_succ(VALUE str)
4574 {
4575     rb_encoding *enc;
4576     char *sbeg, *s, *e, *last_alnum = 0;
4577     int found_alnum = 0;
4578     long l, slen;
          /* Default carry: the single byte 0x01, inserted at offset 0. */
4579     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4580     long carry_pos = 0, carry_len = 1;
4581     enum neighbor_char neighbor = NEIGHBOR_FOUND;
4582
4583     slen = RSTRING_LEN(str);
          /* The successor of an empty string is the empty string itself. */
4584     if (slen == 0) return str;
4585
4586     enc = STR_ENC_GET(str);
4587     sbeg = RSTRING_PTR(str);
4588     s = e = sbeg + slen;
4589
          /* Pass 1: alphanumerics, right to left. */
4590     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
              /* If the previous character was skipped as non-alphanumeric
               * after an alphanumeric had wrapped, stop carrying when this
               * character is of the other ASCII class (letter vs. digit)
               * than the last wrapped one. */
4591         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4592             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4593                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4594                 break;
4595             }
4596         }
4597         l = rb_enc_precise_mbclen(s, e, enc);
4598         if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4599         l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4600         neighbor = enc_succ_alnum_char(s, l, enc, carry);
4601         switch (neighbor) {
4602           case NEIGHBOR_NOT_CHAR:
                  /* Not a letter or digit: skip it. */
4603             continue;
4604           case NEIGHBOR_FOUND:
                  /* Incremented without wrapping: finished. */
4605             return str;
4606           case NEIGHBOR_WRAPPED:
                  /* Wrapped: remember it and keep carrying to the left. */
4607             last_alnum = s;
4608             break;
4609         }
4610         found_alnum = 1;
              /* If the carry is never absorbed, it goes in front of this character. */
4611         carry_pos = s - sbeg;
4612         carry_len = l;
4613     }
4614     if (!found_alnum) {         /* str contains no alnum */
              /* Pass 2: increment the rightmost character, whatever it is. */
4615         s = e;
4616         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4617             enum neighbor_char neighbor;
4618             char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4619             l = rb_enc_precise_mbclen(s, e, enc);
4620             if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4621             l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
                  /* Increment a scratch copy; on NEIGHBOR_NOT_CHAR the
                   * string itself is left unchanged. */
4622             MEMCPY(tmp, s, char, l);
4623             neighbor = enc_succ_char(tmp, l, enc);
4624             switch (neighbor) {
4625               case NEIGHBOR_FOUND:
4626                 MEMCPY(s, tmp, char, l);
4627                 return str;
4628                 break;
4629               case NEIGHBOR_WRAPPED:
4630                 MEMCPY(s, tmp, char, l);
4631                 break;
4632               case NEIGHBOR_NOT_CHAR:
4633                 break;
4634             }
4635             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4636                 /* wrapped to \0...\0.  search next valid char. */
4637                 enc_succ_char(s, l, enc);
4638             }
                  /* For encodings that are not ASCII-compatible the carry is
                   * a copy of the wrapped character instead of the default
                   * 0x01 byte. */
4639             if (!rb_enc_asciicompat(enc)) {
4640                 MEMCPY(carry, s, char, l);
4641                 carry_len = l;
4642             }
4643             carry_pos = s - sbeg;
4644         }
4645         ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
4646     }
          /* The carry was never absorbed: grow the string by carry_len bytes
           * and insert the carry at carry_pos.  The data pointer is
           * re-fetched after RESIZE_CAPA(). */
4647     RESIZE_CAPA(str, slen + carry_len);
4648     sbeg = RSTRING_PTR(str);
4649     s = sbeg + carry_pos;
4650     memmove(s + carry_len, s, slen - carry_pos);
4651     memmove(s, carry, carry_len);
4652     slen += carry_len;
4653     STR_SET_LEN(str, slen);
4654     TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4655     rb_enc_str_coderange(str);
4656     return str;
4657 }
4658
4659
4660 /*
4661  *  call-seq:
4662  *    succ! -> self
4663  *
4664  *  Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4665  *
4666  *  String#next! is an alias for String#succ!.
4667  */
4668
4669 static VALUE
4670 rb_str_succ_bang(VALUE str)
4671 {
4672     rb_str_modify(str);
4673     str_succ(str);
4674     return str;
4675 }
4676
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT
 * (trivially 1 when +len+ is zero or negative), otherwise 0.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
4686
4687 static int
4688 str_upto_i(VALUE str, VALUE arg)
4689 {
4690     rb_yield(str);
4691     return 0;
4692 }
4693
4694 /*
4695  *  call-seq:
4696  *    upto(other_string, exclusive = false) {|string| ... } -> self
4697  *    upto(other_string, exclusive = false) -> new_enumerator
4698  *
4699  *  With a block given, calls the block with each \String value
4700  *  returned by successive calls to String#succ;
4701  *  the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4702  *  the sequence terminates when value +other_string+ is reached;
4703  *  returns +self+:
4704  *
4705  *    'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4706  *  Output:
4707  *
4708  *    a8 a9 b0 b1 b2 b3 b4 b5 b6
4709  *
4710  *  If argument +exclusive+ is given as a truthy object, the last value is omitted:
4711  *
4712  *    'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4713  *
4714  *  Output:
4715  *
4716  *    a8 a9 b0 b1 b2 b3 b4 b5
4717  *
4718  *  If +other_string+ would not be reached, does not call the block:
4719  *
4720  *    '25'.upto('5') {|s| fail s }
4721  *    'aa'.upto('a') {|s| fail s }
4722  *
4723  *  With no block given, returns a new \Enumerator:
4724  *
4725  *    'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4726  *
4727  */
4728
4729 static VALUE
4730 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4731 {
4732     VALUE end, exclusive;
4733
4734     rb_scan_args(argc, argv, "11", &end, &exclusive);
4735     RETURN_ENUMERATOR(beg, argc, argv);
4736     return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4737 }
4738
/*
 * Calls each(string, arg) for every string from +beg+ up to +end+
 * (leaving +end+ out when +excl+ is non-zero), stopping early as soon as
 * +each+ returns non-zero.  Always returns +beg+.
 *
 * Three strategies are used, in this order:
 *  1. both edges are a single ASCII character: step the byte value;
 *  2. both edges consist only of ASCII digits: count numerically and
 *     format each value with the width of +beg+;
 *  3. otherwise: repeatedly call #succ, starting from a copy of +beg+.
 */
4739 VALUE
4740 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4741 {
4742     VALUE current, after_end;
4743     ID succ;
4744     int n, ascii;
4745     rb_encoding *enc;
4746
4747     CONST_ID(succ, "succ");
4748     StringValue(end);
4749     enc = rb_enc_check(beg, end);
4750     ascii = (is_ascii_string(beg) && is_ascii_string(end));
4751     /* single character */
4752     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4753         char c = RSTRING_PTR(beg)[0];
4754         char e = RSTRING_PTR(end)[0];
4755
              /* Empty range: nothing to yield. */
4756         if (c > e || (excl && c == e)) return beg;
4757         for (;;) {
4758             if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4759             if (!excl && c == e) break;
4760             c++;
4761             if (excl && c == e) break;
4762         }
4763         return beg;
4764     }
4765     /* both edges are all digits */
4766     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4767         all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4768         all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4769         VALUE b, e;
4770         int width;
4771
              /* The byte length of +beg+ is used as the precision of the
               * "%.*ld" / "%.*d" formats below. */
4772         width = RSTRING_LENINT(beg);
4773         b = rb_str_to_inum(beg, 10, FALSE);
4774         e = rb_str_to_inum(end, 10, FALSE);
4775         if (FIXNUM_P(b) && FIXNUM_P(e)) {
                  /* Both edges are Fixnums: iterate with C longs. */
4776             long bi = FIX2LONG(b);
4777             long ei = FIX2LONG(e);
4778             rb_encoding *usascii = rb_usascii_encoding();
4779
4780             while (bi <= ei) {
4781                 if (excl && bi == ei) break;
4782                 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4783                 bi++;
4784             }
4785         }
4786         else {
                  /* At least one edge is not a Fixnum: compare and step
                   * through method calls on the Integer objects. */
4787             ID op = excl ? '<' : idLE;
4788             VALUE args[2], fmt = rb_fstring_lit("%.*d");
4789
4790             args[0] = INT2FIX(width);
                  /* NOTE(review): the raw result of rb_funcall() is used as
                   * the C loop condition rather than RTEST() -- confirm that
                   * this is intended. */
4791             while (rb_funcall(b, op, 1, e)) {
4792                 args[1] = b;
4793                 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4794                 b = rb_funcallv(b, succ, 0, 0);
4795             }
4796         }
4797         return beg;
4798     }
4799     /* normal case */
4800     n = rb_str_cmp(beg, end);
4801     if (n > 0 || (excl && n == 0)) return beg;
4802
          /* +after_end+ (end.succ) ends the loop once +current+ reaches it. */
4803     after_end = rb_funcallv(end, succ, 0, 0);
4804     current = str_duplicate(rb_cString, beg);
4805     while (!rb_str_equal(current, after_end)) {
4806         VALUE next = Qnil;
              /* The successor is computed before +each+ is called; it is not
               * computed when +current+ is the inclusive end. */
4807         if (excl || !rb_str_equal(current, end))
4808             next = rb_funcallv(current, succ, 0, 0);
4809         if ((*each)(current, arg)) break;
4810         if (NIL_P(next)) break;
4811         current = next;
4812         StringValue(current);
4813         if (excl && rb_str_equal(current, end)) break;
              /* Stop when the value has become longer than +end+, or empty. */
4814         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4815             break;
4816     }
4817
4818     return beg;
4819 }
4820
4821 VALUE
4822 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4823 {
4824     VALUE current;
4825     ID succ;
4826
4827     CONST_ID(succ, "succ");
4828     /* both edges are all digits */
4829     if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4830         all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4831         VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4832         int width = RSTRING_LENINT(beg);
4833         b = rb_str_to_inum(beg, 10, FALSE);
4834         if (FIXNUM_P(b)) {
4835             long bi = FIX2LONG(b);
4836             rb_encoding *usascii = rb_usascii_encoding();
4837
4838             while (FIXABLE(bi)) {
4839                 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4840                 bi++;
4841             }
4842             b = LONG2NUM(bi);
4843         }
4844         args[0] = INT2FIX(width);
4845         while (1) {
4846             args[1] = b;
4847             if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4848             b = rb_funcallv(b, succ, 0, 0);
4849         }
4850     }
4851     /* normal case */
4852     current = str_duplicate(rb_cString, beg);
4853     while (1) {
4854         VALUE next = rb_funcallv(current, succ, 0, 0);
4855         if ((*each)(current, arg)) break;
4856         current = next;
4857         StringValue(current);
4858         if (RSTRING_LEN(current) == 0)
4859             break;
4860     }
4861
4862     return beg;
4863 }
4864
4865 static int
4866 include_range_i(VALUE str, VALUE arg)
4867 {
4868     VALUE *argp = (VALUE *)arg;
4869     if (!rb_equal(str, *argp)) return 0;
4870     *argp = Qnil;
4871     return 1;
4872 }
4873
4874 VALUE
4875 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4876 {
4877     beg = rb_str_new_frozen(beg);
4878     StringValue(end);
4879     end = rb_str_new_frozen(end);
4880     if (NIL_P(val)) return Qfalse;
4881     val = rb_check_string_type(val);
4882     if (NIL_P(val)) return Qfalse;
4883     if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4884         rb_enc_asciicompat(STR_ENC_GET(end)) &&
4885         rb_enc_asciicompat(STR_ENC_GET(val))) {
4886         const char *bp = RSTRING_PTR(beg);
4887         const char *ep = RSTRING_PTR(end);
4888         const char *vp = RSTRING_PTR(val);
4889         if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4890             if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4891                 return Qfalse;
4892             else {
4893                 char b = *bp;
4894                 char e = *ep;
4895                 char v = *vp;
4896
4897                 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4898                     if (b <= v && v < e) return Qtrue;
4899                     return RBOOL(!RTEST(exclusive) && v == e);
4900                 }
4901             }
4902         }
4903 #if 0
4904         /* both edges are all digits */
4905         if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4906             all_digits_p(bp, RSTRING_LEN(beg)) &&
4907             all_digits_p(ep, RSTRING_LEN(end))) {
4908             /* TODO */
4909         }
4910 #endif
4911     }
4912     rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4913
4914     return RBOOL(NIL_P(val));
4915 }
4916
4917 static VALUE
4918 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4919 {
4920     if (rb_reg_search(re, str, 0, 0) >= 0) {
4921         VALUE match = rb_backref_get();
4922         int nth = rb_reg_backref_number(match, backref);
4923         return rb_reg_nth_match(nth, match);
4924     }
4925     return Qnil;
4926 }
4927
4928 static VALUE
4929 rb_str_aref(VALUE str, VALUE indx)
4930 {
4931     long idx;
4932
4933     if (FIXNUM_P(indx)) {
4934         idx = FIX2LONG(indx);
4935     }
4936     else if (RB_TYPE_P(indx, T_REGEXP)) {
4937         return rb_str_subpat(str, indx, INT2FIX(0));
4938     }
4939     else if (RB_TYPE_P(indx, T_STRING)) {
4940         if (rb_str_index(str, indx, 0) != -1)
4941             return str_duplicate(rb_cString, indx);
4942         return Qnil;
4943     }
4944     else {
4945         /* check if indx is Range */
4946         long beg, len = str_strlen(str, NULL);
4947         switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4948           case Qfalse:
4949             break;
4950           case Qnil:
4951             return Qnil;
4952           default:
4953             return rb_str_substr(str, beg, len);
4954         }
4955         idx = NUM2LONG(indx);
4956     }
4957
4958     return str_substr(str, idx, 1, FALSE);
4959 }
4960
4961
4962 /*
4963  *  call-seq:
4964  *    string[index] -> new_string or nil
4965  *    string[start, length] -> new_string or nil
4966  *    string[range] -> new_string or nil
4967  *    string[regexp, capture = 0] -> new_string or nil
4968  *    string[substring] -> new_string or nil
4969  *
4970  *  Returns the substring of +self+ specified by the arguments.
4971  *
4972  *  When the single \Integer argument +index+ is given,
4973  *  returns the 1-character substring found in +self+ at offset +index+:
4974  *
4975  *    'bar'[2] # => "r"
4976  *
4977  *  Counts backward from the end of +self+ if +index+ is negative:
4978  *
4979  *    'foo'[-3] # => "f"
4980  *
4981  *  Returns +nil+ if +index+ is out of range:
4982  *
4983  *    'foo'[3] # => nil
4984  *    'foo'[-4] # => nil
4985  *
4986  *  When the two \Integer arguments  +start+ and +length+ are given,
4987  *  returns the substring of the given +length+ found in +self+ at offset +start+:
4988  *
4989  *    'foo'[0, 2] # => "fo"
4990  *    'foo'[0, 0] # => ""
4991  *
4992  *  Counts backward from the end of +self+ if +start+ is negative:
4993  *
4994  *    'foo'[-2, 2] # => "oo"
4995  *
4996  *  Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4997  *
4998  *    'foo'[3, 2] # => ""
4999  *
5000  *  Returns +nil+ if +start+ is out of range:
5001  *
5002  *    'foo'[4, 2] # => nil
5003  *    'foo'[-4, 2] # => nil
5004  *
5005  *  Returns the trailing substring of +self+ if +length+ is large:
5006  *
5007  *    'foo'[1, 50] # => "oo"
5008  *
5009  *  Returns +nil+ if +length+ is negative:
5010  *
5011  *    'foo'[0, -1] # => nil
5012  *
5013  *  When the single \Range argument +range+ is given,
5014  *  derives +start+ and +length+ values from the given +range+,
5015  *  and returns values as above:
5016  *
5017  *  - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5018  *  - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5019  *
5020  *  When the \Regexp argument +regexp+ is given,
5021  *  and the +capture+ argument is <tt>0</tt>,
5022  *  returns the first matching substring found in +self+,
5023  *  or +nil+ if none found:
5024  *
5025  *    'foo'[/o/] # => "o"
5026  *    'foo'[/x/] # => nil
5027  *    s = 'hello there'
5028  *    s[/[aeiou](.)\1/] # => "ell"
5029  *    s[/[aeiou](.)\1/, 0] # => "ell"
5030  *
5031  *  If argument +capture+ is given and not <tt>0</tt>,
5032  *  it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5033  *  the method call returns only the specified capture
5034  *  (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5035  *
5036  *    s = 'hello there'
5037  *    s[/[aeiou](.)\1/, 1] # => "l"
5038  *    s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5039  *    s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5040  *
5041  *  If an invalid capture group index is given, +nil+ is returned.  If an invalid
5042  *  capture group name is given, +IndexError+ is raised.
5043  *
5044  *  When the single \String argument +substring+ is given,
5045  *  returns the substring from +self+ if found, otherwise +nil+:
5046  *
5047  *    'foo'['oo'] # => "oo"
5048  *    'foo'['xx'] # => nil
5049  *
5050  *  String#slice is an alias for String#[].
5051  */
5052
5053 static VALUE
5054 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5055 {
5056     if (argc == 2) {
5057         if (RB_TYPE_P(argv[0], T_REGEXP)) {
5058             return rb_str_subpat(str, argv[0], argv[1]);
5059         }
5060         else {
5061             long beg = NUM2LONG(argv[0]);
5062             long len = NUM2LONG(argv[1]);
5063             return rb_str_substr(str, beg, len);
5064         }
5065     }
5066     rb_check_arity(argc, 1, 2);
5067     return rb_str_aref(str, argv[0]);
5068 }
5069
/*
 * Removes the first +len+ bytes of +str+ in place and returns +str+.
 *
 * +len+ is capped at the current byte length.  If the remaining bytes plus
 * the terminator fit into the embedded capacity, the tail is copied into the
 * embedded buffer; otherwise the heap pointer is advanced past the dropped
 * bytes.  The cached code range is cleared on both paths.
 */
5070 VALUE
5071 rb_str_drop_bytes(VALUE str, long len)
5072 {
5073     char *ptr = RSTRING_PTR(str);
5074     long olen = RSTRING_LEN(str), nlen;
5075
5076     str_modifiable(str);
5077     if (len > olen) len = olen;
5078     nlen = olen - len;
5079     if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5080         char *oldptr = ptr;
             /* Capture the storage flags before STR_SET_EMBED changes them. */
5081         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5082         STR_SET_EMBED(str);
5083         STR_SET_EMBED_LEN(str, nlen);
5084         ptr = RSTRING(str)->as.embed.ary;
5085         memmove(ptr, oldptr + len, nlen);
             /* Release the old buffer only when the flags were STR_NOEMBED
              * alone, i.e. neither STR_SHARED nor STR_NOFREE was set. */
5086         if (fl == STR_NOEMBED) xfree(oldptr);
5087     }
5088     else {
5089         if (!STR_SHARED_P(str)) {
                 /* NOTE(review): heap_str_make_shared presumably makes str share
                  * its buffer with `shared`, so advancing the pointer below does
                  * not lose the start of the allocation -- confirm. */
5090             VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5091             rb_enc_cr_str_exact_copy(shared, str);
5092             OBJ_FREEZE(shared);
5093         }
             /* Drop the prefix by moving the start pointer; no bytes are copied. */
5094         ptr = RSTRING(str)->as.heap.ptr += len;
5095         RSTRING(str)->as.heap.len = nlen;
5096     }
         /* Writes a single zero byte after the remaining content. */
5097     ptr[nlen] = 0;
5098     ENC_CODERANGE_CLEAR(str);
5099     return str;
5100 }
5101
/*
 * Byte-level splice: replaces the +len+ bytes of +str+ starting at byte
 * offset +beg+ with the bytes of +val+.
 *
 * Both +beg+ and +len+ are byte quantities here; the callers in this file
 * compute them and associate the resulting encoding afterwards.  The code
 * range stored on +str+ is the code range of +val+ when +str+ was 7-bit
 * before the call, and unknown otherwise.
 */
5102 static void
5103 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5104 {
5105     char *sptr;
5106     long slen, vlen = RSTRING_LEN(val);
5107     int cr;
5108
         /* Deleting a prefix (nothing inserted at offset 0) is delegated. */
5109     if (beg == 0 && vlen == 0) {
5110         rb_str_drop_bytes(str, len);
5111         return;
5112     }
5113
5114     str_modify_keep_cr(str);
5115     RSTRING_GETMEM(str, sptr, slen);
5116     if (len < vlen) {
5117         /* expand string */
5118         RESIZE_CAPA(str, slen + vlen - len);
             /* Re-read the pointer after resizing. */
5119         sptr = RSTRING_PTR(str);
5120     }
5121
5122     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5123         cr = rb_enc_str_coderange(val);
5124     else
5125         cr = ENC_CODERANGE_UNKNOWN;
5126
         /* Shift the tail that follows the replaced region to its new place. */
5127     if (vlen != len) {
5128         memmove(sptr + beg + vlen,
5129                 sptr + beg + len,
5130                 slen - (beg + len));
5131     }
         /* NOTE(review): only reachable with a negative len; the callers
          * visible in this part of the file do not appear to pass one. */
5132     if (vlen < beg && len < 0) {
5133         MEMZERO(sptr + slen, char, -len);
5134     }
         /* Copy the replacement bytes into the gap. */
5135     if (vlen > 0) {
5136         memmove(sptr + beg, RSTRING_PTR(val), vlen);
5137     }
5138     slen += vlen - len;
5139     STR_SET_LEN(str, slen);
5140     TERM_FILL(&sptr[slen], TERM_LEN(str));
5141     ENC_CODERANGE_SET(str, cr);
5142 }
5143
/*
 * Replaces +len+ characters of +str+, starting at character index +beg+,
 * with the contents of +val+ (converted with StringValue).
 *
 * Raises IndexError when +len+ is negative or when +beg+ lies outside the
 * string.  A negative +beg+ counts from the end, and +len+ is clamped to
 * the number of characters available after +beg+.  The character range is
 * translated to a byte range and handed to rb_str_splice_0.
 */
5144 void
5145 rb_str_update(VALUE str, long beg, long len, VALUE val)
5146 {
5147     long slen;
5148     char *p, *e;
5149     rb_encoding *enc;
5150     int singlebyte = single_byte_optimizable(str);
5151     int cr;
5152
5153     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5154
5155     StringValue(val);
5156     enc = rb_enc_check(str, val);
5157     slen = str_strlen(str, enc); /* rb_enc_check */
5158
         /* beg may equal slen (append position) but may not exceed it;
          * a negative beg may reach back at most slen characters. */
5159     if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5160         rb_raise(rb_eIndexError, "index %ld out of string", beg);
5161     }
5162     if (beg < 0) {
5163         beg += slen;
5164     }
5165     assert(beg >= 0);
5166     assert(beg <= slen);
5167     if (len > slen - beg) {
5168         len = slen - beg;
5169     }
5170     str_modify_keep_cr(str);
         /* Locate the byte positions of the first replaced character (p) and of
          * the character just past the replaced run (e); a NULL result from
          * str_nth is treated as the end of the string. */
5171     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5172     if (!p) p = RSTRING_END(str);
5173     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5174     if (!e) e = RSTRING_END(str);
5175     /* error check */
5176     beg = p - RSTRING_PTR(str); /* physical position */
5177     len = e - p;                /* physical length */
5178     rb_str_splice_0(str, beg, len, val);
5179     rb_enc_associate(str, enc);
         /* Combine the code ranges of the result and of val; the combined value
          * is stored unless it is ENC_CODERANGE_BROKEN. */
5180     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
5181     if (cr != ENC_CODERANGE_BROKEN)
5182         ENC_CODERANGE_SET(str, cr);
5183 }
5184
/* File-local shorthand: rb_str_splice() expands to rb_str_update() with the same arguments. */
5185 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5186
5187 static void
5188 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5189 {
5190     int nth;
5191     VALUE match;
5192     long start, end, len;
5193     rb_encoding *enc;
5194     struct re_registers *regs;
5195
5196     if (rb_reg_search(re, str, 0, 0) < 0) {
5197         rb_raise(rb_eIndexError, "regexp not matched");
5198     }
5199     match = rb_backref_get();
5200     nth = rb_reg_backref_number(match, backref);
5201     regs = RMATCH_REGS(match);
5202     if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5203         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5204     }
5205     if (nth < 0) {
5206         nth += regs->num_regs;
5207     }
5208
5209     start = BEG(nth);
5210     if (start == -1) {
5211         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5212     }
5213     end = END(nth);
5214     len = end - start;
5215     StringValue(val);
5216     enc = rb_enc_check_str(str, val);
5217     rb_str_splice_0(str, start, len, val);
5218     rb_enc_associate(str, enc);
5219 }
5220
5221 static VALUE
5222 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5223 {
5224     long idx, beg;
5225
5226     switch (TYPE(indx)) {
5227       case T_REGEXP:
5228         rb_str_subpat_set(str, indx, INT2FIX(0), val);
5229         return val;
5230
5231       case T_STRING:
5232         beg = rb_str_index(str, indx, 0);
5233         if (beg < 0) {
5234             rb_raise(rb_eIndexError, "string not matched");
5235         }
5236         beg = rb_str_sublen(str, beg);
5237         rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5238         return val;
5239
5240       default:
5241         /* check if indx is Range */
5242         {
5243             long beg, len;
5244             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5245                 rb_str_splice(str, beg, len, val);
5246                 return val;
5247             }
5248         }
5249         /* FALLTHROUGH */
5250
5251       case T_FIXNUM:
5252         idx = NUM2LONG(indx);
5253         rb_str_splice(str, idx, 1, val);
5254         return val;
5255     }
5256 }
5257
5258 /*
5259  *  call-seq:
5260  *     str[integer] = new_str
5261  *     str[integer, integer] = new_str
5262  *     str[range] = new_str
5263  *     str[regexp] = new_str
5264  *     str[regexp, integer] = new_str
5265  *     str[regexp, name] = new_str
5266  *     str[other_str] = new_str
5267  *
5268  *  Element Assignment---Replaces some or all of the content of
5269  *  <i>str</i>. The portion of the string affected is determined using
5270  *  the same criteria as String#[]. If the replacement string is not
5271  *  the same length as the text it is replacing, the string will be
5272  *  adjusted accordingly. If the regular expression or string used
5273  *  as the index doesn't match a position in the string, IndexError is
5274  *  raised. If the regular expression form is used, the optional
5275  *  second Integer allows you to specify which portion of the match to
5276  *  replace (effectively using the MatchData indexing rules). The forms
5277  *  that take an Integer will raise an IndexError if the value is out
5278  *  of range; the Range form will raise a RangeError, and the Regexp
5279  *  and String forms will raise an IndexError if there is no match.
5280  */
5281
5282 static VALUE
5283 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5284 {
5285     if (argc == 3) {
5286         if (RB_TYPE_P(argv[0], T_REGEXP)) {
5287             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5288         }
5289         else {
5290             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5291         }
5292         return argv[2];
5293     }
5294     rb_check_arity(argc, 2, 3);
5295     return rb_str_aset(str, argv[0], argv[1]);
5296 }
5297
5298 /*
5299  *  call-seq:
5300  *    insert(index, other_string) -> self
5301  *
5302  *  Inserts the given +other_string+ into +self+; returns +self+.
5303  *
5304  *  If the \Integer +index+ is non-negative, inserts +other_string+ at offset +index+:
5305  *
5306  *    'foo'.insert(1, 'bar') # => "fbaroo"
5307  *
5308  *  If the \Integer +index+ is negative, counts backward from the end of +self+
5309  *  and inserts +other_string+ at offset <tt>index+1</tt>
5310  *  (that is, _after_ <tt>self[index]</tt>):
5311  *
5312  *    'foo'.insert(-2, 'bar') # => "fobaro"
5313  *
5314  */
5315
5316 static VALUE
5317 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5318 {
5319     long pos = NUM2LONG(idx);
5320
5321     if (pos == -1) {
5322         return rb_str_append(str, str2);
5323     }
5324     else if (pos < 0) {
5325         pos++;
5326     }
5327     rb_str_splice(str, pos, 0, str2);
5328     return str;
5329 }
5330
5331
5332 /*
5333  *  call-seq:
5334  *     slice!(index)               -> new_string or nil
5335  *     slice!(start, length)       -> new_string or nil
5336  *     slice!(range)               -> new_string or nil
5337  *     slice!(regexp, capture = 0) -> new_string or nil
5338  *     slice!(substring)           -> new_string or nil
5339  *
5340  *  Removes the substring of +self+ specified by the arguments;
5341  *  returns the removed substring.
5342  *
5343  *  See String#[] for details about the arguments that specify the substring.
5344  *
5345  *  A few examples:
5346  *
5347  *     string = "This is a string"
5348  *     string.slice!(2)        #=> "i"
5349  *     string.slice!(3..6)     #=> " is "
5350  *     string.slice!(/s.*t/)   #=> "sa st"
5351  *     string.slice!("r")      #=> "r"
5352  *     string                  #=> "Thing"
5353  *
5354  */
5355
5356 static VALUE
5357 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5358 {
         /*
          * Works in up to three stages, entered through the labels below:
          *   num_index - turn a character offset/length into a byte range,
          *   subseq    - copy that byte range out of str as the result,
          *   squash    - delete the byte range from str.
          * Each argument form jumps to the first stage it still needs.
          * From subseq onwards, beg and len are byte quantities.
          */
5359     VALUE result = Qnil;
5360     VALUE indx;
         /* len defaults to 1 for the single-index forms. */
5361     long beg, len = 1;
5362     char *p;
5363
5364     rb_check_arity(argc, 1, 2);
5365     str_modify_keep_cr(str);
5366     indx = argv[0];
5367     if (RB_TYPE_P(indx, T_REGEXP)) {
5368         if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5369         VALUE match = rb_backref_get();
5370         struct re_registers *regs = RMATCH_REGS(match);
5371         int nth = 0;
             /* A negative capture index counts back from num_regs and must
              * resolve to a value above 0; a non-negative one must be below
              * num_regs.  Otherwise nothing is removed and nil is returned. */
5372         if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5373             if ((nth += regs->num_regs) <= 0) return Qnil;
5374         }
5375         else if (nth >= regs->num_regs) return Qnil;
             /* NOTE(review): a group that did not participate would presumably
              * have BEG(nth) == -1, giving len == 0 and an empty result string
              * rather than nil -- confirm whether that is intended. */
5376         beg = BEG(nth);
5377         len = END(nth) - beg;
5378         goto subseq;
5379     }
5380     else if (argc == 2) {
             /* (start, length) given as character counts. */
5381         beg = NUM2LONG(indx);
5382         len = NUM2LONG(argv[1]);
5383         goto num_index;
5384     }
5385     else if (FIXNUM_P(indx)) {
             /* Single character: a NULL position or a zero length means the
              * index is out of range, so nil is returned. */
5386         beg = FIX2LONG(indx);
5387         if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5388         if (!len) return Qnil;
5389         beg = p - RSTRING_PTR(str);
5390         goto subseq;
5391     }
5392     else if (RB_TYPE_P(indx, T_STRING)) {
5393         beg = rb_str_index(str, indx, 0);
5394         if (beg == -1) return Qnil;
5395         len = RSTRING_LEN(indx);
             /* The result is a duplicate of the argument string itself, so the
              * copy-out stage is skipped. */
5396         result = str_duplicate(rb_cString, indx);
5397         goto squash;
5398     }
5399     else {
             /* Qnil: a range that is out of bounds; Qfalse: not a range at all,
              * so fall back to integer conversion; anything else: beg/len were
              * filled in from the range. */
5400         switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5401           case Qnil:
5402             return Qnil;
5403           case Qfalse:
5404             beg = NUM2LONG(indx);
5405             if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5406             if (!len) return Qnil;
5407             beg = p - RSTRING_PTR(str);
5408             goto subseq;
5409           default:
5410             goto num_index;
5411         }
5412     }
5413
5414   num_index:
5415     if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5416     beg = p - RSTRING_PTR(str);
5417
5418   subseq:
5419     result = rb_str_new(RSTRING_PTR(str)+beg, len);
5420     rb_enc_cr_str_copy_for_substr(result, str);
5421
5422   squash:
5423     if (len > 0) {
5424         if (beg == 0) {
                 /* Removing a prefix is handled by rb_str_drop_bytes. */
5425             rb_str_drop_bytes(str, len);
5426         }
5427         else {
5428             char *sptr = RSTRING_PTR(str);
5429             long slen = RSTRING_LEN(str);
5430             if (beg + len > slen) /* pathological check */
5431                 len = slen - beg;
                 /* Close the gap by moving the tail down over the removed bytes. */
5432             memmove(sptr + beg,
5433                     sptr + beg + len,
5434                     slen - (beg + len));
5435             slen -= len;
5436             STR_SET_LEN(str, slen);
5437             TERM_FILL(&sptr[slen], TERM_LEN(str));
5438         }
5439     }
5440     return result;
5441 }
5442
5443 static VALUE
5444 get_pat(VALUE pat)
5445 {
5446     VALUE val;
5447
5448     switch (OBJ_BUILTIN_TYPE(pat)) {
5449       case T_REGEXP:
5450         return pat;
5451
5452       case T_STRING:
5453         break;
5454
5455       default:
5456         val = rb_check_string_type(pat);
5457         if (NIL_P(val)) {
5458             Check_Type(pat, T_REGEXP);
5459         }
5460         pat = val;
5461     }
5462
5463     return rb_reg_regcomp(pat);
5464 }
5465
5466 static VALUE
5467 get_pat_quoted(VALUE pat, int check)
5468 {
5469     VALUE val;
5470
5471     switch (OBJ_BUILTIN_TYPE(pat)) {
5472       case T_REGEXP:
5473         return pat;
5474
5475       case T_STRING:
5476         break;
5477
5478       default:
5479         val = rb_check_string_type(pat);
5480         if (NIL_P(val)) {
5481             Check_Type(pat, T_REGEXP);
5482         }
5483         pat = val;
5484     }
5485     if (check && is_broken_string(pat)) {
5486         rb_exc_raise(rb_reg_check_preprocess(pat));
5487     }
5488     return pat;
5489 }
5490
5491 static long
5492 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5493 {
5494     if (BUILTIN_TYPE(pat) == T_STRING) {
5495         pos = rb_strseq_index(str, pat, pos, 1);
5496         if (set_backref_str) {
5497             if (pos >= 0) {
5498                 str = rb_str_new_frozen_String(str);
5499                 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5500             }
5501             else {
5502                 rb_backref_set(Qnil);
5503             }
5504         }
5505         return pos;
5506     }
5507     else {
5508         return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5509     }
5510 }
5511
5512
5513 /*
5514  *  call-seq:
5515  *    sub!(pattern, replacement)   -> self or nil
5516  *    sub!(pattern) {|match| ... } -> self or nil
5517  *
5518  *  Returns +self+ with only the first occurrence
5519  *  (not all occurrences) of the given +pattern+ replaced.
5520  *
5521  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5522  *
5523  *  Related: String#sub, String#gsub, String#gsub!.
5524  *
5525  */
5526
5527 static VALUE
5528 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5529 {
         /*
          * Replaces the first match of the pattern in str, in place.
          * Returns str when a replacement happened and nil when the pattern
          * did not match.  The replacement comes from the block (one
          * argument), from a Hash lookup, or from a replacement String.
          */
5530     VALUE pat, repl, hash = Qnil;
5531     int iter = 0;
5532     long plen;
         /* With a block only the pattern is required. */
5533     int min_arity = rb_block_given_p() ? 1 : 2;
5534     long beg;
5535
5536     rb_check_arity(argc, min_arity, 2);
5537     if (argc == 1) {
5538         iter = 1;
5539     }
5540     else {
             /* The second argument is used as a Hash if it converts to one;
              * otherwise it must be usable as a String. */
5541         repl = argv[1];
5542         hash = rb_check_hash_type(argv[1]);
5543         if (NIL_P(hash)) {
5544             StringValue(repl);
5545         }
5546     }
5547
5548     pat = get_pat_quoted(argv[0], 1);
5549
5550     str_modifiable(str);
         /* Search from the start and record the match as the back-reference. */
5551     beg = rb_pat_search(pat, str, 0, 1);
5552     if (beg >= 0) {
5553         rb_encoding *enc;
5554         int cr = ENC_CODERANGE(str);
5555         long beg0, end0;
5556         VALUE match, match0 = Qnil;
5557         struct re_registers *regs;
5558         char *p, *rp;
5559         long len, rlen;
5560
5561         match = rb_backref_get();
5562         regs = RMATCH_REGS(match);
             /* [beg0, end0) is the byte range of the whole match. */
5563         if (RB_TYPE_P(pat, T_STRING)) {
5564             beg0 = beg;
5565             end0 = beg0 + RSTRING_LEN(pat);
5566             match0 = pat;
5567         }
5568         else {
5569             beg0 = BEG(0);
5570             end0 = END(0);
5571             if (iter) match0 = rb_reg_nth_match(0, match);
5572         }
5573
5574         if (iter || !NIL_P(hash)) {
                 /* Remember the buffer and length before running user code.
                  * NOTE(review): str_mod_check presumably raises if the block
                  * or hash lookup modified str in the meantime -- confirm. */
5575             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5576
5577             if (iter) {
5578                 repl = rb_obj_as_string(rb_yield(match0));
5579             }
5580             else {
5581                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5582                 repl = rb_obj_as_string(repl);
5583             }
5584             str_mod_check(str, p, len);
5585             rb_check_frozen(str);
5586         }
5587         else {
                 /* String replacement; for a String pattern nil is passed in
                  * place of the regexp. */
5588             repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5589         }
5590
5591         enc = rb_enc_compatible(str, repl);
5592         if (!enc) {
                 /* Incompatible encodings are tolerated only when the text
                  * before and after the match scans as 7-bit; the result then
                  * takes the replacement's encoding. */
5593             rb_encoding *str_enc = STR_ENC_GET(str);
5594             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5595             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5596                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5597                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5598                          rb_enc_name(str_enc),
5599                          rb_enc_name(STR_ENC_GET(repl)));
5600             }
5601             enc = STR_ENC_GET(repl);
5602         }
5603         rb_str_modify(str);
5604         rb_enc_associate(str, enc);
             /* Only a known, non-broken code range of str (strictly between
              * UNKNOWN and BROKEN) is combined with the replacement's;
              * otherwise cr is stored back unchanged. */
5605         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
5606             int cr2 = ENC_CODERANGE(repl);
5607             if (cr2 == ENC_CODERANGE_BROKEN ||
5608                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5609                 cr = ENC_CODERANGE_UNKNOWN;
5610             else
5611                 cr = cr2;
5612         }
5613         plen = end0 - beg0;
5614         rlen = RSTRING_LEN(repl);
5615         len = RSTRING_LEN(str);
             /* Grow the buffer first when the replacement is longer than the match. */
5616         if (rlen > plen) {
5617             RESIZE_CAPA(str, len + rlen - plen);
5618         }
5619         p = RSTRING_PTR(str);
             /* Move the text after the match to its final position, then copy
              * the replacement into the gap. */
5620         if (rlen != plen) {
5621             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5622         }
5623         rp = RSTRING_PTR(repl);
5624         memmove(p + beg0, rp, rlen);
5625         len += rlen - plen;
5626         STR_SET_LEN(str, len);
5627         TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5628         ENC_CODERANGE_SET(str, cr);
5629
5630         return str;
5631     }
5632     return Qnil;
5633 }
5634
5635
5636 /*
5637  *  call-seq:
5638  *    sub(pattern, replacement)   -> new_string
5639  *    sub(pattern) {|match| ... } -> new_string
5640  *
5641  *  Returns a copy of +self+ with only the first occurrence
5642  *  (not all occurrences) of the given +pattern+ replaced.
5643  *
5644  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5645  *
5646  *  Related: String#sub!, String#gsub, String#gsub!.
5647  *
5648  */
5649
5650 static VALUE
5651 rb_str_sub(int argc, VALUE *argv, VALUE str)
5652 {
5653     str = str_duplicate(rb_cString, str);
5654     rb_str_sub_bang(argc, argv, str);
5655     return str;
5656 }
5657
/*
 * Shared implementation of gsub (+bang+ == 0) and gsub! (+bang+ != 0).
 *
 * Builds the result in a fresh buffer +dest+ by alternately copying the
 * text between matches and the replacement for each match.  With +bang+ the
 * contents of +dest+ replace those of +str+ and +str+ is returned (nil if
 * nothing matched); without it +dest+ is returned (a duplicate of +str+ if
 * nothing matched).  With one argument and no block an enumerator is
 * returned through RETURN_ENUMERATOR.
 */
5658 static VALUE
5659 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5660 {
5661     VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5662     struct re_registers *regs;
5663     long beg, beg0, end0;
5664     long offset, blen, slen, len, last;
         /* STR: replacement string, ITER: block, MAP: hash lookup.
          * STR is 0, so `if (mode)` below selects ITER and MAP. */
5665     enum {STR, ITER, MAP} mode = STR;
5666     char *sp, *cp;
         /* -1: not decided yet; set after the first rb_reg_regsub call to
          * whether it returned something other than repl itself. */
5667     int need_backref = -1;
5668     rb_encoding *str_enc;
5669
5670     switch (argc) {
5671       case 1:
5672         RETURN_ENUMERATOR(str, argc, argv);
5673         mode = ITER;
5674         break;
5675       case 2:
5676         repl = argv[1];
5677         hash = rb_check_hash_type(argv[1]);
5678         if (NIL_P(hash)) {
5679             StringValue(repl);
5680         }
5681         else {
5682             mode = MAP;
5683         }
5684         break;
5685       default:
5686         rb_error_arity(argc, 1, 2);
5687     }
5688
5689     pat = get_pat_quoted(argv[0], 1);
5690     beg = rb_pat_search(pat, str, 0, need_backref);
5691     if (beg < 0) {
5692         if (bang) return Qnil;  /* no match, no substitution */
5693         return str_duplicate(rb_cString, str);
5694     }
5695
         /* offset: byte position in str up to which input has been consumed;
          * cp: pointer to that position; sp/slen: the buffer and length seen
          * before any user code runs, used by str_mod_check below. */
5696     offset = 0;
5697     blen = RSTRING_LEN(str) + 30; /* len + margin */
5698     dest = rb_str_buf_new(blen);
5699     sp = RSTRING_PTR(str);
5700     slen = RSTRING_LEN(str);
5701     cp = sp;
5702     str_enc = STR_ENC_GET(str);
5703     rb_enc_associate(dest, str_enc);
5704     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5705
5706     do {
5707         match = rb_backref_get();
5708         regs = RMATCH_REGS(match);
             /* [beg0, end0) is the byte range of the current match. */
5709         if (RB_TYPE_P(pat, T_STRING)) {
5710             beg0 = beg;
5711             end0 = beg0 + RSTRING_LEN(pat);
5712             match0 = pat;
5713         }
5714         else {
5715             beg0 = BEG(0);
5716             end0 = END(0);
5717             if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5718         }
5719
5720         if (mode) {
5721             if (mode == ITER) {
5722                 val = rb_obj_as_string(rb_yield(match0));
5723             }
5724             else {
5725                 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5726                 val = rb_obj_as_string(val);
5727             }
                 /* NOTE(review): str_mod_check presumably raises if user code
                  * changed str while the loop holds sp/slen -- confirm. */
5728             str_mod_check(str, sp, slen);
5729             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
5730                 rb_raise(rb_eRuntimeError, "block should not cheat");
5731             }
5732         }
5733         else if (need_backref) {
5734             val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
                 /* First substitution only: if regsub handed back repl itself,
                  * later iterations use repl directly and search without
                  * setting the back-reference. */
5735             if (need_backref < 0) {
5736                 need_backref = val != repl;
5737             }
5738         }
5739         else {
5740             val = repl;
5741         }
5742
5743         len = beg0 - offset;    /* copy pre-match substr */
5744         if (len) {
5745             rb_enc_str_buf_cat(dest, cp, len, str_enc);
5746         }
5747
5748         rb_str_buf_append(dest, val);
5749
             /* last keeps the offset the current match was searched from; it
              * is used for the final search after the loop. */
5750         last = offset;
5751         offset = end0;
5752         if (beg0 == end0) {
5753             /*
5754              * Always consume at least one character of the input string
5755              * in order to prevent infinite loops.
5756              */
5757             if (RSTRING_LEN(str) <= end0) break;
5758             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5759             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5760             offset = end0 + len;
5761         }
5762         cp = RSTRING_PTR(str) + offset;
5763         if (offset > RSTRING_LEN(str)) break;
5764         beg = rb_pat_search(pat, str, offset, need_backref);
5765     } while (beg >= 0);
         /* Copy whatever follows the last match. */
5766     if (RSTRING_LEN(str) > offset) {
5767         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5768     }
         /* Search once more from `last` with the back-reference flag forced on.
          * NOTE(review): presumably so the back-reference ends up describing the
          * final successful match even when the loop searched without setting
          * it -- confirm. */
5769     rb_pat_search(pat, str, last, 1);
5770     if (bang) {
5771         str_shared_replace(str, dest);
5772     }
5773     else {
5774         str = dest;
5775     }
5776
5777     return str;
5778 }
5779
5780
5781 /*
5782  *  call-seq:
5783  *     gsub!(pattern, replacement)   -> self or nil
5784  *     gsub!(pattern) {|match| ... } -> self or nil
5785  *     gsub!(pattern)                -> an_enumerator
5786  *
5787  *  Performs the specified substring replacement(s) on +self+;
5788  *  returns +self+ if any replacement occurred, +nil+ otherwise.
5789  *
5790  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5791  *
5792  *  Returns an Enumerator if no +replacement+ and no block given.
5793  *
5794  *  Related: String#sub, String#gsub, String#sub!.
5795  *
5796  */
5797
5798 static VALUE
5799 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5800 {
5801     str_modify_keep_cr(str);
5802     return str_gsub(argc, argv, str, 1);
5803 }
5804
5805
5806 /*
5807  *  call-seq:
5808  *     gsub(pattern, replacement)   -> new_string
5809  *     gsub(pattern) {|match| ... } -> new_string
5810  *     gsub(pattern)                -> enumerator
5811  *
5812  *  Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5813  *
5814  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5815  *
5816  *  Returns an Enumerator if no +replacement+ and no block given.
5817  *
5818  *  Related: String#sub, String#sub!, String#gsub!.
5819  *
5820  */
5821
5822 static VALUE
5823 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5824 {
5825     return str_gsub(argc, argv, str, 0);
5826 }
5827
5828
5829 /*
5830  *  call-seq:
5831  *    replace(other_string) -> self
5832  *
5833  *  Replaces the contents of +self+ with the contents of +other_string+:
5834  *
5835  *    s = 'foo'        # => "foo"
5836  *    s.replace('bar') # => "bar"
5837  *
5838  */
5839
5840 VALUE
5841 rb_str_replace(VALUE str, VALUE str2)
5842 {
5843     str_modifiable(str);
5844     if (str == str2) return str;
5845
5846     StringValue(str2);
5847     str_discard(str);
5848     return str_replace(str, str2);
5849 }
5850
5851 /*
5852  *  call-seq:
5853  *    clear -> self
5854  *
5855  *  Removes the contents of +self+:
5856  *
5857  *    s = 'foo' # => "foo"
5858  *    s.clear   # => ""
5859  *
5860  */
5861
5862 static VALUE
5863 rb_str_clear(VALUE str)
5864 {
5865     str_discard(str);
5866     STR_SET_EMBED(str);
5867     STR_SET_EMBED_LEN(str, 0);
5868     RSTRING_PTR(str)[0] = 0;
5869     if (rb_enc_asciicompat(STR_ENC_GET(str)))
5870         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
5871     else
5872         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5873     return str;
5874 }
5875
5876 /*
5877  *  call-seq:
5878  *    chr -> string
5879  *
5880  *  Returns a string containing the first character of +self+:
5881  *
5882  *    s = 'foo' # => "foo"
5883  *    s.chr     # => "f"
5884  *
5885  */
5886
5887 static VALUE
5888 rb_str_chr(VALUE str)
5889 {
5890     return rb_str_substr(str, 0, 1);
5891 }
5892
5893 /*
5894  *  call-seq:
5895  *    getbyte(index) -> integer
5896  *
5897  *  Returns the byte at zero-based +index+ as an integer:
5898  *
5899  *    s = 'abcde'  # => "abcde"
5900  *    s.getbyte(0) # => 97
5901  *    s.getbyte(1) # => 98
5902  *
5903  *  Related: String#setbyte.
5904  */
5905 static VALUE
5906 rb_str_getbyte(VALUE str, VALUE index)
5907 {
5908     long pos = NUM2LONG(index);
5909
5910     if (pos < 0)
5911         pos += RSTRING_LEN(str);
5912     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
5913         return Qnil;
5914
5915     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5916 }
5917
5918 /*
5919  *  call-seq:
5920  *    setbyte(index, integer) -> integer
5921  *
5922  *  Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5923  *
5924  *    s = 'abcde'      # => "abcde"
5925  *    s.setbyte(0, 98) # => 98
5926  *    s                # => "bbcde"
5927  *
5928  *  Related: String#getbyte.
5929  */
5930 static VALUE
5931 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5932 {
         /*
          * Stores the low 8 bits of value at byte position index (negative
          * positions count from the end) and returns value as given.
          * Raises IndexError when the position is outside the string.
          * For non-embedded strings with a known code range, the cached
          * range is updated to match the write; in every other case it is
          * cleared.
          */
5933     long pos = NUM2LONG(index);
5934     long len = RSTRING_LEN(str);
5935     char *ptr, *head, *left = 0;
5936     rb_encoding *enc;
5937     int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5938
5939     if (pos < -len || len <= pos)
5940         rb_raise(rb_eIndexError, "index %ld out of string", pos);
5941     if (pos < 0)
5942         pos += len;
5943
         /* Convert to an Integer, then mask to one byte with Integer#& so that
          * values outside the int range are handled before NUM2INT. */
5944     VALUE v = rb_to_int(value);
5945     VALUE w = rb_int_and(v, INT2FIX(0xff));
5946     char byte = (char)(NUM2INT(w) & 0xFF);
5947
5948     if (!str_independent(str))
5949         str_make_independent(str);
5950     enc = STR_ENC_GET(str);
5951     head = RSTRING_PTR(str);
5952     ptr = &head[pos];
5953     if (!STR_EMBED_P(str)) {
5954         cr = ENC_CODERANGE(str);
5955         switch (cr) {
5956           case ENC_CODERANGE_7BIT:
                 /* All bytes were ASCII: an ASCII byte keeps the range as is;
                  * otherwise the character now starting at ptr decides between
                  * BROKEN and VALID. */
5957             left = ptr;
5958             *ptr = byte;
5959             if (ISASCII(byte)) goto end;
5960             nlen = rb_enc_precise_mbclen(left, head+len, enc);
5961             if (!MBCLEN_CHARFOUND_P(nlen))
5962                 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5963             else
5964                 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5965             goto end;
5966           case ENC_CODERANGE_VALID:
                 /* Measure the character containing ptr before and after the
                  * write: no character found afterwards means BROKEN; a changed
                  * width or an ASCII byte means the range must be recomputed
                  * later; otherwise VALID is kept. */
5967             left = rb_enc_left_char_head(head, ptr, head+len, enc);
5968             width = rb_enc_precise_mbclen(left, head+len, enc);
5969             *ptr = byte;
5970             nlen = rb_enc_precise_mbclen(left, head+len, enc);
5971             if (!MBCLEN_CHARFOUND_P(nlen))
5972                 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5973             else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5974                 ENC_CODERANGE_CLEAR(str);
5975             goto end;
5976         }
5977     }
         /* Embedded strings and the remaining code ranges: forget the cached
          * range and write the byte. */
5978     ENC_CODERANGE_CLEAR(str);
5979     *ptr = byte;
5980
5981   end:
5982     return value;
5983 }
5984
5985 static VALUE
5986 str_byte_substr(VALUE str, long beg, long len, int empty)
5987 {
5988     char *p, *s = RSTRING_PTR(str);
5989     long n = RSTRING_LEN(str);
5990     VALUE str2;
5991
5992     if (beg > n || len < 0) return Qnil;
5993     if (beg < 0) {
5994         beg += n;
5995         if (beg < 0) return Qnil;
5996     }
5997     if (len > n - beg)
5998         len = n - beg;
5999     if (len <= 0) {
6000         if (!empty) return Qnil;
6001         len = 0;
6002         p = 0;
6003     }
6004     else
6005         p = s + beg;
6006
6007     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
6008         str2 = rb_str_new_frozen(str);
6009         str2 = str_new_shared(rb_cString, str2);
6010         RSTRING(str2)->as.heap.ptr += beg;
6011         RSTRING(str2)->as.heap.len = len;
6012     }
6013     else {
6014         str2 = rb_str_new(p, len);
6015     }
6016
6017     str_enc_copy(str2, str);
6018
6019     if (RSTRING_LEN(str2) == 0) {
6020         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6021             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
6022         else
6023             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6024     }
6025     else {
6026         switch (ENC_CODERANGE(str)) {
6027           case ENC_CODERANGE_7BIT:
6028             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6029             break;
6030           default:
6031             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
6032             break;
6033         }
6034     }
6035
6036     return str2;
6037 }
6038
6039 static VALUE
6040 str_byte_aref(VALUE str, VALUE indx)
6041 {
6042     long idx;
6043     if (FIXNUM_P(indx)) {
6044         idx = FIX2LONG(indx);
6045     }
6046     else {
6047         /* check if indx is Range */
6048         long beg, len = RSTRING_LEN(str);
6049
6050         switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6051           case Qfalse:
6052             break;
6053           case Qnil:
6054             return Qnil;
6055           default:
6056             return str_byte_substr(str, beg, len, TRUE);
6057         }
6058
6059         idx = NUM2LONG(indx);
6060     }
6061     return str_byte_substr(str, idx, 1, FALSE);
6062 }
6063
6064 /*
6065  *  call-seq:
6066  *    byteslice(index, length = 1) -> string or nil
6067  *    byteslice(range)             -> string or nil
6068  *
6069  *  Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6070  *
6071  *  With integer arguments +index+ and +length+ given,
6072  *  returns the substring beginning at the given +index+
6073  *  of the given +length+ (if possible),
6074  *  or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6075  *
6076  *    s = '0123456789' # => "0123456789"
6077  *    s.byteslice(2)   # => "2"
6078  *    s.byteslice(200) # => nil
6079  *    s.byteslice(4, 3)  # => "456"
6080  *    s.byteslice(4, 30) # => "456789"
6081  *    s.byteslice(4, -1) # => nil
6082  *    s.byteslice(40, 2) # => nil
6083  *
6084  *  In either case above, counts backwards from the end of +self+
6085  *  if +index+ is negative:
6086  *
6087  *    s = '0123456789'   # => "0123456789"
6088  *    s.byteslice(-4)    # => "6"
6089  *    s.byteslice(-4, 3) # => "678"
6090  *
6091  *  With Range argument +range+ given, returns
6092  *  <tt>byteslice(range.begin, range.size)</tt>:
6093  *
6094  *    s = '0123456789'    # => "0123456789"
6095  *    s.byteslice(4..6)   # => "456"
6096  *    s.byteslice(-6..-4) # => "456"
6097  *    s.byteslice(5..2)   # => "" # range.size is zero.
6098  *    s.byteslice(40..42) # => nil
6099  *
6100  *  In all cases, a returned string has the same encoding as +self+:
6101  *
6102  *    s.encoding              # => #<Encoding:UTF-8>
6103  *    s.byteslice(4).encoding # => #<Encoding:UTF-8>
6104  *
6105  */
6106
6107 static VALUE
6108 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6109 {
6110     if (argc == 2) {
6111         long beg = NUM2LONG(argv[0]);
6112         long end = NUM2LONG(argv[1]);
6113         return str_byte_substr(str, beg, end, TRUE);
6114     }
6115     rb_check_arity(argc, 1, 2);
6116     return str_byte_aref(str, argv[0]);
6117 }
6118
6119 /*
6120  *  call-seq:
6121  *    reverse -> string
6122  *
6123  *  Returns a new string with the characters from +self+ in reverse order.
6124  *
6125  *    'stressed'.reverse # => "desserts"
6126  *
6127  */
6128
6129 static VALUE
6130 rb_str_reverse(VALUE str)
6131 {
6132     rb_encoding *enc;
6133     VALUE rev;
6134     char *s, *e, *p;
6135     int cr;
6136
6137     if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6138     enc = STR_ENC_GET(str);
6139     rev = rb_str_new(0, RSTRING_LEN(str));
6140     s = RSTRING_PTR(str); e = RSTRING_END(str);
6141     p = RSTRING_END(rev);
6142     cr = ENC_CODERANGE(str);
6143
6144     if (RSTRING_LEN(str) > 1) {
6145         if (single_byte_optimizable(str)) {
6146             while (s < e) {
6147                 *--p = *s++;
6148             }
6149         }
6150         else if (cr == ENC_CODERANGE_VALID) {
6151             while (s < e) {
6152                 int clen = rb_enc_fast_mbclen(s, e, enc);
6153
6154                 p -= clen;
6155                 memcpy(p, s, clen);
6156                 s += clen;
6157             }
6158         }
6159         else {
6160             cr = rb_enc_asciicompat(enc) ?
6161                 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
6162             while (s < e) {
6163                 int clen = rb_enc_mbclen(s, e, enc);
6164
6165                 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6166                 p -= clen;
6167                 memcpy(p, s, clen);
6168                 s += clen;
6169             }
6170         }
6171     }
6172     STR_SET_LEN(rev, RSTRING_LEN(str));
6173     str_enc_copy(rev, str);
6174     ENC_CODERANGE_SET(rev, cr);
6175
6176     return rev;
6177 }
6178
6179
6180 /*
6181  *  call-seq:
6182  *    reverse! -> self
6183  *
6184  *  Returns +self+ with its characters reversed:
6185  *
6186  *    s = 'stressed'
6187  *    s.reverse! # => "desserts"
6188  *    s          # => "desserts"
6189  *
6190  */
6191
6192 static VALUE
6193 rb_str_reverse_bang(VALUE str)
6194 {
6195     if (RSTRING_LEN(str) > 1) {
6196         if (single_byte_optimizable(str)) {
6197             char *s, *e, c;
6198
6199             str_modify_keep_cr(str);
6200             s = RSTRING_PTR(str);
6201             e = RSTRING_END(str) - 1;
6202             while (s < e) {
6203                 c = *s;
6204                 *s++ = *e;
6205                 *e-- = c;
6206             }
6207         }
6208         else {
6209             str_shared_replace(str, rb_str_reverse(str));
6210         }
6211     }
6212     else {
6213         str_modify_keep_cr(str);
6214     }
6215     return str;
6216 }
6217
6218
6219 /*
6220  *  call-seq:
6221  *    include? other_string -> true or false
6222  *
6223  *  Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6224  *
6225  *    s = 'foo'
6226  *    s.include?('f')    # => true
6227  *    s.include?('fo')   # => true
6228  *    s.include?('food') # => false
6229  *
6230  */
6231
6232 static VALUE
6233 rb_str_include(VALUE str, VALUE arg)
6234 {
6235     long i;
6236
6237     StringValue(arg);
6238     i = rb_str_index(str, arg, 0);
6239
6240     return RBOOL(i != -1);
6241 }
6242
6243
6244 /*
6245  *  call-seq:
6246  *    to_i(base = 10) -> integer
6247  *
6248  *  Returns the result of interpreting leading characters in +self+
6249  *  as an integer in the given +base+ (which must be in (2..36)):
6250  *
6251  *    '123456'.to_i     # => 123456
6252  *    '123def'.to_i(16) # => 1195503
6253  *
6254  *  Characters past a leading valid number (in the given +base+) are ignored:
6255  *
6256  *    '12.345'.to_i   # => 12
6257  *    '12345'.to_i(2) # => 1
6258  *
6259  *  Returns zero if there is no leading valid number:
6260  *
6261  *    'abcdef'.to_i # => 0
6262  *    '2'.to_i(2)   # => 0
6263  *
6264  */
6265
6266 static VALUE
6267 rb_str_to_i(int argc, VALUE *argv, VALUE str)
6268 {
6269     int base = 10;
6270
6271     if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6272         rb_raise(rb_eArgError, "invalid radix %d", base);
6273     }
6274     return rb_str_to_inum(str, base, FALSE);
6275 }
6276
6277
6278 /*
6279  *  call-seq:
6280  *    to_f -> float
6281  *
6282  *  Returns the result of interpreting leading characters in +self+ as a Float:
6283  *
6284  *    '3.14159'.to_f  # => 3.14159
 *    '1.234e-2'.to_f # => 0.01234
6286  *
 *  Characters past a leading valid number are ignored:
6288  *
6289  *    '3.14 (pi to two places)'.to_f # => 3.14
6290  *
6291  *  Returns zero if there is no leading valid number:
6292  *
6293  *    'abcdef'.to_f # => 0.0
6294  *
6295  */
6296
6297 static VALUE
6298 rb_str_to_f(VALUE str)
6299 {
6300     return DBL2NUM(rb_str_to_dbl(str, FALSE));
6301 }
6302
6303
6304 /*
6305  *  call-seq:
6306  *    to_s -> self or string
6307  *
6308  *  Returns +self+ if +self+ is a \String,
6309  *  or +self+ converted to a \String if +self+ is a subclass of \String.
6310  *
6311  *  String#to_str is an alias for String#to_s.
6312  *
6313  */
6314
6315 static VALUE
6316 rb_str_to_s(VALUE str)
6317 {
6318     if (rb_obj_class(str) != rb_cString) {
6319         return str_duplicate(rb_cString, str);
6320     }
6321     return str;
6322 }
6323
#if 0
/*
 * Disabled by the surrounding "#if 0": this helper is not compiled.
 *
 * Encodes code point +c+ in +enc+ into a local buffer with rb_enc_mbcput and
 * appends the rb_enc_codelen(c, enc) resulting bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
6335
6336 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6337
6338 int
6339 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6340 {
6341     char buf[CHAR_ESC_LEN + 1];
6342     int l;
6343
6344 #if SIZEOF_INT > 4
6345     c &= 0xffffffff;
6346 #endif
6347     if (unicode_p) {
6348         if (c < 0x7F && ISPRINT(c)) {
6349             snprintf(buf, CHAR_ESC_LEN, "%c", c);
6350         }
6351         else if (c < 0x10000) {
6352             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6353         }
6354         else {
6355             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6356         }
6357     }
6358     else {
6359         if (c < 0x100) {
6360             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6361         }
6362         else {
6363             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6364         }
6365     }
6366     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
6367     rb_str_buf_cat(result, buf, l);
6368     return l;
6369 }
6370
/*
 * Returns the backslash escape sequence used for control character +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no dedicated
 * escape sequence.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].escape;
    }
    return NULL;
}
6388
/*
 * Returns a new string holding the contents of +str+ with special and
 * non-printable characters replaced by escape sequences.  The result is
 * tagged as US-ASCII with a 7BIT code range.
 *
 * The loop keeps +prev+ at the start of the pending run of characters that
 * can be copied verbatim; the run is flushed to +result+ right before an
 * escape is emitted and once more after the loop.
 */
VALUE
rb_str_escape(VALUE str)
{
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    const char *p = RSTRING_PTR(str);
    const char *pend = RSTRING_END(str);
    const char *prev = p;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    while (p < pend) {
        unsigned int c;
        const char *cc;
        int n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /* no valid character here: flush the pending run, then emit
             * up to mbminlen raw bytes (bounded by the end) as \xNN */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;         /* p now points past the character; it began at p - n */
        cc = ruby_escaped_char(c);
        if (cc) {
            /* character with a dedicated escape sequence */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat(result, cc, strlen(cc));
            prev = p;
        }
        else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
            /* printable ASCII: stays in the pending verbatim run */
        }
        else {
            /* everything else is written as a numeric escape */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
        }
    }
    /* flush whatever verbatim run is left */
    if (p > prev) str_buf_cat(result, prev, p - prev);
    ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);

    return result;
}
6440
6441 /*
6442  *  call-seq:
6443  *    inspect -> string
6444  *
6445  *  Returns a printable version of +self+, enclosed in double-quotes,
6446  *  and with special characters escaped:
6447  *
6448  *    s = "foo\tbar\tbaz\n"
6449  *    # => "foo\tbar\tbaz\n"
6450  *    s.inspect
6451  *    # => "\"foo\\tbar\\tbaz\\n\""
6452  *
6453  */
6454
VALUE
rb_str_inspect(VALUE str)
{
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
    const char *p, *pend, *prev;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    rb_encoding *resenc = rb_default_internal_encoding();
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    /* Result encoding: default internal, else default external; fall back to
     * US-ASCII when the chosen one is not ASCII compatible. */
    if (resenc == NULL) resenc = rb_default_external_encoding();
    if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
    rb_enc_associate(result, resenc);
    str_buf_cat2(result, "\"");

    /* prev marks the start of the pending run of bytes to copy verbatim */
    p = RSTRING_PTR(str); pend = RSTRING_END(str);
    prev = p;
    /* scan with the encoding reported by get_actual_encoding when it
     * differs from the declared one */
    actenc = get_actual_encoding(encidx, str);
    if (actenc != enc) {
        enc = actenc;
        if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
    }
    while (p < pend) {
        unsigned int c, cc;
        int n;

        n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /* no valid character here: flush the pending run, then emit
             * up to mbminlen raw bytes (bounded by the end) as \xNN */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;         /* p now points past the character; it began at p - n */
        /* A double quote, a backslash, or a '#' directly followed by '$',
         * '@' or '{' gets a backslash in front of it. */
        if ((asciicompat || unicode_p) &&
          (c == '"'|| c == '\\' ||
            (c == '#' &&
             p < pend &&
             MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
             (cc = rb_enc_codepoint(p,pend,enc),
              (cc == '$' || cc == '@' || cc == '{'))))) {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat2(result, "\\");
            if (asciicompat || enc == resenc) {
                /* restart the verbatim run at the character itself so it
                 * is copied right after the backslash */
                prev = p - n;
                continue;
            }
        }
        /* control characters that have a one-letter escape */
        switch (c) {
          case '\n': cc = 'n'; break;
          case '\r': cc = 'r'; break;
          case '\t': cc = 't'; break;
          case '\f': cc = 'f'; break;
          case '\013': cc = 'v'; break;
          case '\010': cc = 'b'; break;
          case '\007': cc = 'a'; break;
          case 033: cc = 'e'; break;
          default: cc = 0; break;
        }
        if (cc) {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            buf[0] = '\\';
            buf[1] = (char)cc;
            str_buf_cat(result, buf, 2);
            prev = p;
            continue;
        }
        if ((enc == resenc && rb_enc_isprint(c, enc)) ||
            (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
            /* printable in the result encoding, or printable ASCII:
             * leave it in the pending verbatim run */
            continue;
        }
        else {
            /* anything else is written as a numeric escape */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
            continue;
        }
    }
    /* flush the remaining verbatim run and close the quote */
    if (p > prev) str_buf_cat(result, prev, p - prev);
    str_buf_cat2(result, "\"");

    return result;
}
6548
6549 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6550
6551 /*
6552  *  call-seq:
6553  *    dump -> string
6554  *
6555  *  Returns a printable version of +self+, enclosed in double-quotes,
6556  *  with special characters escaped, and with non-printing characters
6557  *  replaced by hexadecimal notation:
6558  *
6559  *    "hello \n ''".dump    # => "\"hello \\n ''\""
6560  *    "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6561  *
6562  *  Related: String#undump (inverse of String#dump).
6563  *
6564  */
6565
VALUE
rb_str_dump(VALUE str)
{
    int encidx = rb_enc_get_index(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    long len;
    const char *p, *pend;
    char *q, *qend;
    VALUE result;
    int u8 = (encidx == rb_utf8_encindex());
    static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";

    /*
     * Pass 1: compute the exact length of the dumped text so the result can
     * be allocated once.  The per-byte decisions here must stay in step
     * with what pass 2 writes.
     */
    len = 2;                    /* "" */
    if (!rb_enc_asciicompat(enc)) {
        /* room for the suffix with "%s" replaced by the encoding name */
        len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
        len += strlen(enc->name);
    }

    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    while (p < pend) {
        int clen;
        unsigned char c = *p++;

        switch (c) {
          case '"':  case '\\':
          case '\n': case '\r':
          case '\t': case '\f':
          case '\013': case '\010': case '\007': case '\033':
            clen = 2;
            break;

          case '#':
            /* '#' needs a backslash only in front of '$', '@' or '{' */
            clen = IS_EVSTR(p, pend) ? 2 : 1;
            break;

          default:
            if (ISPRINT(c)) {
                clen = 1;
            }
            else {
                if (u8 && c > 0x7F) {   /* \u notation */
                    int n = rb_enc_precise_mbclen(p-1, pend, enc);
                    if (MBCLEN_CHARFOUND_P(n)) {
                        unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                        if (cc <= 0xFFFF)
                            clen = 6;  /* \uXXXX */
                        else if (cc <= 0xFFFFF)
                            clen = 9;  /* \u{XXXXX} */
                        else
                            clen = 10; /* \u{XXXXXX} */
                        /* skip the remaining bytes of this character */
                        p += MBCLEN_CHARFOUND_LEN(n)-1;
                        break;
                    }
                }
                clen = 4;       /* \xNN */
            }
            break;
        }

        /* guard the running total against overflowing long */
        if (clen > LONG_MAX - len) {
            rb_raise(rb_eRuntimeError, "string size too big");
        }
        len += clen;
    }

    /*
     * Pass 2: write the dumped text into the preallocated result.
     * qend is the limit handed to snprintf (len bytes plus one for the
     * terminator it writes).
     */
    result = rb_str_new(0, len);
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    q = RSTRING_PTR(result); qend = q + len + 1;

    *q++ = '"';
    while (p < pend) {
        unsigned char c = *p++;

        if (c == '"' || c == '\\') {
            *q++ = '\\';
            *q++ = c;
        }
        else if (c == '#') {
            if (IS_EVSTR(p, pend)) *q++ = '\\';
            *q++ = '#';
        }
        else if (c == '\n') {
            *q++ = '\\';
            *q++ = 'n';
        }
        else if (c == '\r') {
            *q++ = '\\';
            *q++ = 'r';
        }
        else if (c == '\t') {
            *q++ = '\\';
            *q++ = 't';
        }
        else if (c == '\f') {
            *q++ = '\\';
            *q++ = 'f';
        }
        else if (c == '\013') {
            *q++ = '\\';
            *q++ = 'v';
        }
        else if (c == '\010') {
            *q++ = '\\';
            *q++ = 'b';
        }
        else if (c == '\007') {
            *q++ = '\\';
            *q++ = 'a';
        }
        else if (c == '\033') {
            *q++ = '\\';
            *q++ = 'e';
        }
        else if (ISPRINT(c)) {
            *q++ = c;
        }
        else {
            *q++ = '\\';
            if (u8) {
                /* n counts the bytes after the lead byte.
                 * NOTE(review): this appears to rely on MBCLEN_CHARFOUND_P
                 * rejecting 0, so that one-byte characters take the \xNN
                 * path just as the "c > 0x7F" test does in pass 1 - confirm. */
                int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
                if (MBCLEN_CHARFOUND_P(n)) {
                    int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                    p += n;
                    if (cc <= 0xFFFF)
                        snprintf(q, qend-q, "u%04X", cc);    /* \uXXXX */
                    else
                        snprintf(q, qend-q, "u{%X}", cc);  /* \u{XXXXX} or \u{XXXXXX} */
                    q += strlen(q);
                    continue;
                }
            }
            snprintf(q, qend-q, "x%02X", c);
            q += 3;     /* 'x' plus two hex digits */
        }
    }
    *q++ = '"';
    *q = '\0';
    if (!rb_enc_asciicompat(enc)) {
        /* append the suffix naming the original encoding and tag the
         * result with the ASCII-8BIT encoding index */
        snprintf(q, qend-q, nonascii_suffix, enc->name);
        encidx = rb_ascii8bit_encindex();
    }
    /* result from dump is ASCII */
    rb_enc_associate_index(result, encidx);
    ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
    return result;
}
6712
/*
 * Maps the letter of a one-character backslash escape (n, r, t, f, v, b, a,
 * e) to the control character it denotes.  Callers are expected to pass
 * only those letters; any other value hits UNREACHABLE_RETURN(-1).
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'f': return '\f';
      case 'v': return '\v';
      case 'b': return '\b';
      case 'a': return '\a';
      case 'e': return '\033';
    }
    UNREACHABLE_RETURN(-1);
}
6736
/*
 * Decodes one escape sequence of a dumped string and appends the decoded
 * bytes to +undumped+.
 *
 * On entry *ss points at the character right after the backslash; on return
 * it points past the whole escape.  +s_end+ is the end of the input.
 * *penc is the encoding currently associated with +undumped+; a \u escape
 * switches it (and +undumped+) to UTF-8.  *utf8 and *binary record whether a
 * \u or a \x escape has been seen so far; mixing the two raises
 * RuntimeError, as does any malformed escape.
 */
static void
undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
{
    const char *s = *ss;
    unsigned int c;
    int codelen;
    size_t hexlen;
    unsigned char buf[6];
    static rb_encoding *enc_utf8 = NULL;

    switch (*s) {
      case '\\':
      case '"':
      case '#':
        rb_str_cat(undumped, s, 1); /* cat itself */
        s++;
        break;
      case 'n':
      case 'r':
      case 't':
      case 'f':
      case 'v':
      case 'b':
      case 'a':
      case 'e':
        /* one-letter control character escapes */
        *buf = unescape_ascii(*s);
        rb_str_cat(undumped, (char *)buf, 1);
        s++;
        break;
      case 'u':
        if (*binary) {
            rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
        }
        *utf8 = true;
        if (++s >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid Unicode escape");
        }
        /* look up the UTF-8 encoding once and cache it in the static */
        if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
        if (*penc != enc_utf8) {
            *penc = enc_utf8;
            rb_enc_associate(undumped, enc_utf8);
        }
        if (*s == '{') { /* handle \u{...} form */
            s++;
            /* the braces may hold several code points separated by spaces */
            for (;;) {
                if (s >= s_end) {
                    rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
                }
                if (*s == '}') {
                    s++;
                    break;
                }
                if (ISSPACE(*s)) {
                    s++;
                    continue;
                }
                /* hexlen receives the number of hex digits consumed */
                c = scan_hex(s, s_end-s, &hexlen);
                if (hexlen == 0 || hexlen > 6) {
                    rb_raise(rb_eRuntimeError, "invalid Unicode escape");
                }
                if (c > 0x10ffff) {
                    rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
                }
                /* surrogate code points are rejected */
                if (0xd800 <= c && c <= 0xdfff) {
                    rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
                }
                codelen = rb_enc_mbcput(c, (char *)buf, *penc);
                rb_str_cat(undumped, (char *)buf, codelen);
                s += hexlen;
            }
        }
        else { /* handle \uXXXX form */
            /* exactly four hex digits are required */
            c = scan_hex(s, 4, &hexlen);
            if (hexlen != 4) {
                rb_raise(rb_eRuntimeError, "invalid Unicode escape");
            }
            if (0xd800 <= c && c <= 0xdfff) {
                rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
            }
            codelen = rb_enc_mbcput(c, (char *)buf, *penc);
            rb_str_cat(undumped, (char *)buf, codelen);
            s += hexlen;
        }
        break;
      case 'x':
        if (*utf8) {
            rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
        }
        *binary = true;
        if (++s >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        /* exactly two hex digits give one raw byte */
        *buf = scan_hex(s, 2, &hexlen);
        if (hexlen != 2) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        rb_str_cat(undumped, (char *)buf, 1);
        s += hexlen;
        break;
      default:
        /* unknown escape: keep the backslash and the character as they are */
        rb_str_cat(undumped, s-1, 2);
        s++;
    }

    *ss = s;
}
6843
6844 static VALUE rb_str_is_ascii_only_p(VALUE str);
6845
6846 /*
6847  *  call-seq:
6848  *    undump -> string
6849  *
6850  *  Returns an unescaped version of +self+:
6851  *
6852  *    s_orig = "\f\x00\xff\\\""    # => "\f\u0000\xFF\\\""
6853  *    s_dumped = s_orig.dump       # => "\"\\f\\x00\\xFF\\\\\\\"\""
6854  *    s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6855  *    s_undumped == s_orig         # => true
6856  *
6857  *  Related: String#dump (inverse of String#undump).
6858  *
6859  */
6860
6861 static VALUE
6862 str_undump(VALUE str)
6863 {
6864     const char *s = RSTRING_PTR(str);
6865     const char *s_end = RSTRING_END(str);
6866     rb_encoding *enc = rb_enc_get(str);
6867     VALUE undumped = rb_enc_str_new(s, 0L, enc);
6868     bool utf8 = false;
6869     bool binary = false;
6870     int w;
6871
6872     rb_must_asciicompat(str);
6873     if (rb_str_is_ascii_only_p(str) == Qfalse) {
6874         rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6875     }
6876     if (!str_null_check(str, &w)) {
6877         rb_raise(rb_eRuntimeError, "string contains null byte");
6878     }
6879     if (RSTRING_LEN(str) < 2) goto invalid_format;
6880     if (*s != '"') goto invalid_format;
6881
6882     /* strip '"' at the start */
6883     s++;
6884
6885     for (;;) {
6886         if (s >= s_end) {
6887             rb_raise(rb_eRuntimeError, "unterminated dumped string");
6888         }
6889
6890         if (*s == '"') {
6891             /* epilogue */
6892             s++;
6893             if (s == s_end) {
6894                 /* ascii compatible dumped string */
6895                 break;
6896             }
6897             else {
6898                 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6899                 static const char dup_suffix[] = ".dup";
6900                 const char *encname;
6901                 int encidx;
6902                 ptrdiff_t size;
6903
6904                 /* check separately for strings dumped by older versions */
6905                 size = sizeof(dup_suffix) - 1;
6906                 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6907
6908                 size = sizeof(force_encoding_suffix) - 1;
6909                 if (s_end - s <= size) goto invalid_format;
6910                 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6911                 s += size;
6912
6913                 if (utf8) {
6914                     rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6915                 }
6916
6917                 encname = s;
6918                 s = memchr(s, '"', s_end-s);
6919                 size = s - encname;
6920                 if (!s) goto invalid_format;
6921                 if (s_end - s != 2) goto invalid_format;
6922                 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6923
6924                 encidx = rb_enc_find_index2(encname, (long)size);
6925                 if (encidx < 0) {
6926                     rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6927                 }
6928                 rb_enc_associate_index(undumped, encidx);
6929             }
6930             break;
6931         }
6932
6933         if (*s == '\\') {
6934             s++;
6935             if (s >= s_end) {
6936                 rb_raise(rb_eRuntimeError, "invalid escape");
6937             }
6938             undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6939         }
6940         else {
6941             rb_str_cat(undumped, s++, 1);
6942         }
6943     }
6944
6945     return undumped;
6946 invalid_format:
6947     rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6948 }
6949
6950 static void
6951 rb_str_check_dummy_enc(rb_encoding *enc)
6952 {
6953     if (rb_enc_dummy_p(enc)) {
6954         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6955                  rb_enc_name(enc));
6956     }
6957 }
6958
6959 static rb_encoding *
6960 str_true_enc(VALUE str)
6961 {
6962     rb_encoding *enc = STR_ENC_GET(str);
6963     rb_str_check_dummy_enc(enc);
6964     return enc;
6965 }
6966
6967 static OnigCaseFoldType
6968 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6969 {
6970     if (argc==0)
6971         return flags;
6972     if (argc>2)
6973         rb_raise(rb_eArgError, "too many options");
6974     if (argv[0]==sym_turkic) {
6975         flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6976         if (argc==2) {
6977             if (argv[1]==sym_lithuanian)
6978                 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6979             else
6980                 rb_raise(rb_eArgError, "invalid second option");
6981         }
6982     }
6983     else if (argv[0]==sym_lithuanian) {
6984         flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6985         if (argc==2) {
6986             if (argv[1]==sym_turkic)
6987                 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6988             else
6989                 rb_raise(rb_eArgError, "invalid second option");
6990         }
6991     }
6992     else if (argc>1)
6993         rb_raise(rb_eArgError, "too many options");
6994     else if (argv[0]==sym_ascii)
6995         flags |= ONIGENC_CASE_ASCII_ONLY;
6996     else if (argv[0]==sym_fold) {
6997         if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6998             flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6999         else
7000             rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7001     }
7002     else
7003         rb_raise(rb_eArgError, "invalid option");
7004     return flags;
7005 }
7006
7007 static inline bool
7008 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7009 {
7010     if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7011         return true;
7012     return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7013 }
7014
/* 16 bytes should be long enough to absorb any kind of single character length increase; the value below (20) leaves some extra headroom */
7016 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7017 #ifndef CASEMAP_DEBUG
7018 # define CASEMAP_DEBUG 0
7019 #endif
7020
/*
 * One node of the chain of output buffers used by rb_str_casemap.  Each node
 * is allocated with offsetof(mapping_buffer, space) + capa bytes.
 */
struct mapping_buffer;
typedef struct mapping_buffer {
    size_t capa;                    /* number of bytes available in space[] */
    size_t used;                    /* NOTE(review): presumably the bytes of space[] in use; set outside this view - confirm */
    struct mapping_buffer *next;    /* following node, or NULL for the last one */
    OnigUChar space[FLEX_ARY_LEN];  /* trailing storage for case-mapped output */
} mapping_buffer;
7028
7029 static void
7030 mapping_buffer_free(void *p)
7031 {
7032     mapping_buffer *previous_buffer;
7033     mapping_buffer *current_buffer = p;
7034     while (current_buffer) {
7035         previous_buffer = current_buffer;
7036         current_buffer  = current_buffer->next;
7037         ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7038     }
7039 }
7040
/*
 * Typed-data descriptor for the object that anchors a chain of mapping
 * buffers while rb_str_casemap runs.  mapping_buffer_free is installed in
 * the second slot of the function table; the first slot is left 0.
 * NOTE(review): slot order is presumably {mark, free} -- confirm against
 * the rb_data_type_t declaration.
 */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",
    {0, mapping_buffer_free,}
};
7045
7046 static VALUE
7047 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7048 {
7049     VALUE target;
7050
7051     const OnigUChar *source_current, *source_end;
7052     int target_length = 0;
7053     VALUE buffer_anchor;
7054     mapping_buffer *current_buffer = 0;
7055     mapping_buffer **pre_buffer;
7056     size_t buffer_count = 0;
7057     int buffer_length_or_invalid;
7058
7059     if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7060
7061     source_current = (OnigUChar*)RSTRING_PTR(source);
7062     source_end = (OnigUChar*)RSTRING_END(source);
7063
7064     buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7065     pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7066     while (source_current < source_end) {
7067         /* increase multiplier using buffer count to converge quickly */
7068         size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7069         if (CASEMAP_DEBUG) {
7070             fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7071         }
7072         current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7073         *pre_buffer = current_buffer;
7074         pre_buffer = &current_buffer->next;
7075         current_buffer->next = NULL;
7076         current_buffer->capa = capa;
7077         buffer_length_or_invalid = enc->case_map(flags,
7078                                    &source_current, source_end,
7079                                    current_buffer->space,
7080                                    current_buffer->space+current_buffer->capa,
7081                                    enc);
7082         if (buffer_length_or_invalid < 0) {
7083             current_buffer = DATA_PTR(buffer_anchor);
7084             DATA_PTR(buffer_anchor) = 0;
7085             mapping_buffer_free(current_buffer);
7086             rb_raise(rb_eArgError, "input string invalid");
7087         }
7088         target_length  += current_buffer->used = buffer_length_or_invalid;
7089     }
7090     if (CASEMAP_DEBUG) {
7091         fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7092     }
7093
7094     if (buffer_count==1) {
7095         target = rb_str_new((const char*)current_buffer->space, target_length);
7096     }
7097     else {
7098         char *target_current;
7099
7100         target = rb_str_new(0, target_length);
7101         target_current = RSTRING_PTR(target);
7102         current_buffer = DATA_PTR(buffer_anchor);
7103         while (current_buffer) {
7104             memcpy(target_current, current_buffer->space, current_buffer->used);
7105             target_current += current_buffer->used;
7106             current_buffer  = current_buffer->next;
7107         }
7108     }
7109     current_buffer = DATA_PTR(buffer_anchor);
7110     DATA_PTR(buffer_anchor) = 0;
7111     mapping_buffer_free(current_buffer);
7112
7113     /* TODO: check about string terminator character */
7114     str_enc_copy(target, source);
7115     /*ENC_CODERANGE_SET(mapped, cr);*/
7116
7117     return target;
7118 }
7119
7120 static VALUE
7121 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7122 {
7123     const OnigUChar *source_current, *source_end;
7124     OnigUChar *target_current, *target_end;
7125     long old_length = RSTRING_LEN(source);
7126     int length_or_invalid;
7127
7128     if (old_length == 0) return Qnil;
7129
7130     source_current = (OnigUChar*)RSTRING_PTR(source);
7131     source_end = (OnigUChar*)RSTRING_END(source);
7132     if (source == target) {
7133         target_current = (OnigUChar*)source_current;
7134         target_end = (OnigUChar*)source_end;
7135     }
7136     else {
7137         target_current = (OnigUChar*)RSTRING_PTR(target);
7138         target_end = (OnigUChar*)RSTRING_END(target);
7139     }
7140
7141     length_or_invalid = onigenc_ascii_only_case_map(flags,
7142                                &source_current, source_end,
7143                                target_current, target_end, enc);
7144     if (length_or_invalid < 0)
7145         rb_raise(rb_eArgError, "input string invalid");
7146     if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7147         fprintf(stderr, "problem with rb_str_ascii_casemap"
7148                 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7149         rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7150                  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7151     }
7152
7153     str_enc_copy(target, source);
7154
7155     return target;
7156 }
7157
7158 static bool
7159 upcase_single(VALUE str)
7160 {
7161     char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7162     bool modified = false;
7163
7164     while (s < send) {
7165         unsigned int c = *(unsigned char*)s;
7166
7167         if ('a' <= c && c <= 'z') {
7168             *s = 'A' + (c - 'a');
7169             modified = true;
7170         }
7171         s++;
7172     }
7173     return modified;
7174 }
7175
7176 /*
7177  *  call-seq:
7178  *    upcase!(*options) -> self or nil
7179  *
7180  *  Upcases the characters in +self+;
7181  *  returns +self+ if any changes were made, +nil+ otherwise:
7182  *
7183  *    s = 'Hello World!' # => "Hello World!"
7184  *    s.upcase!          # => "HELLO WORLD!"
7185  *    s                  # => "HELLO WORLD!"
7186  *    s.upcase!          # => nil
7187  *
7188  *  The casing may be affected by the given +options+;
7189  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7190  *
7191  *  Related: String#upcase, String#downcase, String#downcase!.
7192  *
7193  */
7194
7195 static VALUE
7196 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7197 {
7198     rb_encoding *enc;
7199     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7200
7201     flags = check_case_options(argc, argv, flags);
7202     str_modify_keep_cr(str);
7203     enc = str_true_enc(str);
7204     if (case_option_single_p(flags, enc, str)) {
7205         if (upcase_single(str))
7206             flags |= ONIGENC_CASE_MODIFIED;
7207     }
7208     else if (flags&ONIGENC_CASE_ASCII_ONLY)
7209         rb_str_ascii_casemap(str, str, &flags, enc);
7210     else
7211         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7212
7213     if (ONIGENC_CASE_MODIFIED&flags) return str;
7214     return Qnil;
7215 }
7216
7217
7218 /*
7219  *  call-seq:
7220  *    upcase(*options) -> string
7221  *
7222  *  Returns a string containing the upcased characters in +self+:
7223  *
7224  *     s = 'Hello World!' # => "Hello World!"
7225  *     s.upcase           # => "HELLO WORLD!"
7226  *
7227  *  The casing may be affected by the given +options+;
7228  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7229  *
7230  *  Related: String#upcase!, String#downcase, String#downcase!.
7231  *
7232  */
7233
7234 static VALUE
7235 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7236 {
7237     rb_encoding *enc;
7238     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7239     VALUE ret;
7240
7241     flags = check_case_options(argc, argv, flags);
7242     enc = str_true_enc(str);
7243     if (case_option_single_p(flags, enc, str)) {
7244         ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7245         str_enc_copy(ret, str);
7246         upcase_single(ret);
7247     }
7248     else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7249         ret = rb_str_new(0, RSTRING_LEN(str));
7250         rb_str_ascii_casemap(str, ret, &flags, enc);
7251     }
7252     else {
7253         ret = rb_str_casemap(str, &flags, enc);
7254     }
7255
7256     return ret;
7257 }
7258
7259 static bool
7260 downcase_single(VALUE str)
7261 {
7262     char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7263     bool modified = false;
7264
7265     while (s < send) {
7266         unsigned int c = *(unsigned char*)s;
7267
7268         if ('A' <= c && c <= 'Z') {
7269             *s = 'a' + (c - 'A');
7270             modified = true;
7271         }
7272         s++;
7273     }
7274
7275     return modified;
7276 }
7277
7278 /*
7279  *  call-seq:
7280  *    downcase!(*options) -> self or nil
7281  *
7282  *  Downcases the characters in +self+;
7283  *  returns +self+ if any changes were made, +nil+ otherwise:
7284  *
7285  *    s = 'Hello World!' # => "Hello World!"
7286  *    s.downcase!        # => "hello world!"
7287  *    s                  # => "hello world!"
7288  *    s.downcase!        # => nil
7289  *
7290  *  The casing may be affected by the given +options+;
7291  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7292  *
7293  *  Related: String#downcase, String#upcase, String#upcase!.
7294  *
7295  */
7296
7297 static VALUE
7298 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7299 {
7300     rb_encoding *enc;
7301     OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7302
7303     flags = check_case_options(argc, argv, flags);
7304     str_modify_keep_cr(str);
7305     enc = str_true_enc(str);
7306     if (case_option_single_p(flags, enc, str)) {
7307         if (downcase_single(str))
7308             flags |= ONIGENC_CASE_MODIFIED;
7309     }
7310     else if (flags&ONIGENC_CASE_ASCII_ONLY)
7311         rb_str_ascii_casemap(str, str, &flags, enc);
7312     else
7313         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7314
7315     if (ONIGENC_CASE_MODIFIED&flags) return str;
7316     return Qnil;
7317 }
7318
7319
7320 /*
7321  *  call-seq:
7322  *    downcase(*options) -> string
7323  *
7324  *  Returns a string containing the downcased characters in +self+:
7325  *
7326  *     s = 'Hello World!' # => "Hello World!"
7327  *     s.downcase         # => "hello world!"
7328  *
7329  *  The casing may be affected by the given +options+;
7330  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7331  *
7332  *  Related: String#downcase!, String#upcase, String#upcase!.
7333  *
7334  */
7335
7336 static VALUE
7337 rb_str_downcase(int argc, VALUE *argv, VALUE str)
7338 {
7339     rb_encoding *enc;
7340     OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7341     VALUE ret;
7342
7343     flags = check_case_options(argc, argv, flags);
7344     enc = str_true_enc(str);
7345     if (case_option_single_p(flags, enc, str)) {
7346         ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7347         str_enc_copy(ret, str);
7348         downcase_single(ret);
7349     }
7350     else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7351         ret = rb_str_new(0, RSTRING_LEN(str));
7352         rb_str_ascii_casemap(str, ret, &flags, enc);
7353     }
7354     else {
7355         ret = rb_str_casemap(str, &flags, enc);
7356     }
7357
7358     return ret;
7359 }
7360
7361
7362 /*
7363  *  call-seq:
7364  *    capitalize!(*options) -> self or nil
7365  *
7366  *  Upcases the first character in +self+;
7367  *  downcases the remaining characters;
7368  *  returns +self+ if any changes were made, +nil+ otherwise:
7369  *
7370  *    s = 'hello World!' # => "hello World!"
7371  *    s.capitalize!      # => "Hello world!"
7372  *    s                  # => "Hello world!"
7373  *    s.capitalize!      # => nil
7374  *
7375  *  The casing may be affected by the given +options+;
7376  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7377  *
7378  *  Related: String#capitalize.
7379  *
7380  */
7381
7382 static VALUE
7383 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7384 {
7385     rb_encoding *enc;
7386     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7387
7388     flags = check_case_options(argc, argv, flags);
7389     str_modify_keep_cr(str);
7390     enc = str_true_enc(str);
7391     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7392     if (flags&ONIGENC_CASE_ASCII_ONLY)
7393         rb_str_ascii_casemap(str, str, &flags, enc);
7394     else
7395         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7396
7397     if (ONIGENC_CASE_MODIFIED&flags) return str;
7398     return Qnil;
7399 }
7400
7401
7402 /*
7403  *  call-seq:
7404  *    capitalize(*options) -> string
7405  *
7406  *  Returns a string containing the characters in +self+;
7407  *  the first character is upcased;
7408  *  the remaining characters are downcased:
7409  *
7410  *     s = 'hello World!' # => "hello World!"
7411  *     s.capitalize       # => "Hello world!"
7412  *
7413  *  The casing may be affected by the given +options+;
7414  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7415  *
7416  *  Related: String#capitalize!.
7417  *
7418  */
7419
7420 static VALUE
7421 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7422 {
7423     rb_encoding *enc;
7424     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7425     VALUE ret;
7426
7427     flags = check_case_options(argc, argv, flags);
7428     enc = str_true_enc(str);
7429     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7430     if (flags&ONIGENC_CASE_ASCII_ONLY) {
7431         ret = rb_str_new(0, RSTRING_LEN(str));
7432         rb_str_ascii_casemap(str, ret, &flags, enc);
7433     }
7434     else {
7435         ret = rb_str_casemap(str, &flags, enc);
7436     }
7437     return ret;
7438 }
7439
7440
7441 /*
7442  *  call-seq:
7443  *    swapcase!(*options) -> self or nil
7444  *
7445  *  Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
7447  *  returns +self+ if any changes were made, +nil+ otherwise:
7448  *
7449  *    s = 'Hello World!' # => "Hello World!"
7450  *    s.swapcase!        # => "hELLO wORLD!"
 *    s                  # => "hELLO wORLD!"
7452  *    ''.swapcase!       # => nil
7453  *
7454  *  The casing may be affected by the given +options+;
7455  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7456  *
7457  *  Related: String#swapcase.
7458  *
7459  */
7460
7461 static VALUE
7462 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7463 {
7464     rb_encoding *enc;
7465     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7466
7467     flags = check_case_options(argc, argv, flags);
7468     str_modify_keep_cr(str);
7469     enc = str_true_enc(str);
7470     if (flags&ONIGENC_CASE_ASCII_ONLY)
7471         rb_str_ascii_casemap(str, str, &flags, enc);
7472     else
7473         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7474
7475     if (ONIGENC_CASE_MODIFIED&flags) return str;
7476     return Qnil;
7477 }
7478
7479
7480 /*
7481  *  call-seq:
7482  *    swapcase(*options) -> string
7483  *
7484  *  Returns a string containing the characters in +self+, with cases reversed;
7485  *  each uppercase character is downcased;
7486  *  each lowercase character is upcased:
7487  *
7488  *     s = 'Hello World!' # => "Hello World!"
7489  *     s.swapcase         # => "hELLO wORLD!"
7490  *
7491  *  The casing may be affected by the given +options+;
7492  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7493  *
7494  *  Related: String#swapcase!.
7495  *
7496  */
7497
7498 static VALUE
7499 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7500 {
7501     rb_encoding *enc;
7502     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7503     VALUE ret;
7504
7505     flags = check_case_options(argc, argv, flags);
7506     enc = str_true_enc(str);
7507     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7508     if (flags&ONIGENC_CASE_ASCII_ONLY) {
7509         ret = rb_str_new(0, RSTRING_LEN(str));
7510         rb_str_ascii_casemap(str, ret, &flags, enc);
7511     }
7512     else {
7513         ret = rb_str_casemap(str, &flags, enc);
7514     }
7515     return ret;
7516 }
7517
typedef unsigned char *USTR;

/* Read cursor over a tr-style character specification (see trnext). */
struct tr {
    int gen;            /* non-zero while a "a-z" range is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
7525
/*
 * Produces the next code point described by the tr specification that +t+
 * iterates over, expanding "c1-c2" ranges one code point per call.
 *
 * Returns (unsigned int)-1 once the specification is exhausted.
 * Raises ArgumentError when a range has a start greater than its end.
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
      nextpart:
        if (!t->gen) {
            /* not inside a range: read the next item of the specification */
            if (t->p == t->pend) return -1;
            /* a backslash that is not the last character is skipped, so
             * the character after it is the one that gets read */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
                t->p += n;
            }
            t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
            t->p += n;
            /* a '-' followed by at least one more character starts a range */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
                t->p += n;
                if (t->p < t->pend) {
                    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
                    t->p += n;
                    if (t->now > c) {
                        /* only ASCII bounds are echoed in the message */
                        if (t->now < 0x80 && c < 0x80) {
                            rb_raise(rb_eArgError,
                                     "invalid range \"%c-%c\" in string transliteration",
                                     t->now, c);
                        }
                        else {
                            rb_raise(rb_eArgError, "invalid range in string transliteration");
                        }
                        continue; /* not reached */
                    }
                    /* remember the upper bound; later calls walk up to it */
                    t->gen = 1;
                    t->max = c;
                }
            }
            return t->now;
        }
        else {
            /* inside a range: step to the next code point, skipping those
             * for which ONIGENC_CODE_TO_MBCLEN reports no valid length */
            while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
                if (t->now == t->max) {
                    /* the range ended on a skipped code point */
                    t->gen = 0;
                    goto nextpart;
                }
            }
            if (t->now < t->max) {
                return t->now;
            }
            else {
                /* upper bound reached: leave range mode and return it */
                t->gen = 0;
                return t->max;
            }
        }
    }
}
7579
7580 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7581
7582 static VALUE
7583 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7584 {
7585     const unsigned int errc = -1;
7586     unsigned int trans[256];
7587     rb_encoding *enc, *e1, *e2;
7588     struct tr trsrc, trrepl;
7589     int cflag = 0;
7590     unsigned int c, c0, last = 0;
7591     int modify = 0, i, l;
7592     unsigned char *s, *send;
7593     VALUE hash = 0;
7594     int singlebyte = single_byte_optimizable(str);
7595     int termlen;
7596     int cr;
7597
7598 #define CHECK_IF_ASCII(c) \
7599     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7600            (cr = ENC_CODERANGE_VALID) : 0)
7601
7602     StringValue(src);
7603     StringValue(repl);
7604     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7605     if (RSTRING_LEN(repl) == 0) {
7606         return rb_str_delete_bang(1, &src, str);
7607     }
7608
7609     cr = ENC_CODERANGE(str);
7610     e1 = rb_enc_check(str, src);
7611     e2 = rb_enc_check(str, repl);
7612     if (e1 == e2) {
7613         enc = e1;
7614     }
7615     else {
7616         enc = rb_enc_check(src, repl);
7617     }
7618     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7619     if (RSTRING_LEN(src) > 1 &&
7620         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7621         trsrc.p + l < trsrc.pend) {
7622         cflag = 1;
7623         trsrc.p += l;
7624     }
7625     trrepl.p = RSTRING_PTR(repl);
7626     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7627     trsrc.gen = trrepl.gen = 0;
7628     trsrc.now = trrepl.now = 0;
7629     trsrc.max = trrepl.max = 0;
7630
7631     if (cflag) {
7632         for (i=0; i<256; i++) {
7633             trans[i] = 1;
7634         }
7635         while ((c = trnext(&trsrc, enc)) != errc) {
7636             if (c < 256) {
7637                 trans[c] = errc;
7638             }
7639             else {
7640                 if (!hash) hash = rb_hash_new();
7641                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7642             }
7643         }
7644         while ((c = trnext(&trrepl, enc)) != errc)
7645             /* retrieve last replacer */;
7646         last = trrepl.now;
7647         for (i=0; i<256; i++) {
7648             if (trans[i] != errc) {
7649                 trans[i] = last;
7650             }
7651         }
7652     }
7653     else {
7654         unsigned int r;
7655
7656         for (i=0; i<256; i++) {
7657             trans[i] = errc;
7658         }
7659         while ((c = trnext(&trsrc, enc)) != errc) {
7660             r = trnext(&trrepl, enc);
7661             if (r == errc) r = trrepl.now;
7662             if (c < 256) {
7663                 trans[c] = r;
7664                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7665             }
7666             else {
7667                 if (!hash) hash = rb_hash_new();
7668                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7669             }
7670         }
7671     }
7672
7673     if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7674         cr = ENC_CODERANGE_7BIT;
7675     str_modify_keep_cr(str);
7676     s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7677     termlen = rb_enc_mbminlen(enc);
7678     if (sflag) {
7679         int clen, tlen;
7680         long offset, max = RSTRING_LEN(str);
7681         unsigned int save = -1;
7682         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7683
7684         while (s < send) {
7685             int may_modify = 0;
7686
7687             c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7688             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7689
7690             s += clen;
7691             if (c < 256) {
7692                 c = trans[c];
7693             }
7694             else if (hash) {
7695                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7696                 if (NIL_P(tmp)) {
7697                     if (cflag) c = last;
7698                     else c = errc;
7699                 }
7700                 else if (cflag) c = errc;
7701                 else c = NUM2INT(tmp);
7702             }
7703             else {
7704                 c = errc;
7705             }
7706             if (c != (unsigned int)-1) {
7707                 if (save == c) {
7708                     CHECK_IF_ASCII(c);
7709                     continue;
7710                 }
7711                 save = c;
7712                 tlen = rb_enc_codelen(c, enc);
7713                 modify = 1;
7714             }
7715             else {
7716                 save = -1;
7717                 c = c0;
7718                 if (enc != e1) may_modify = 1;
7719             }
7720             if ((offset = t - buf) + tlen > max) {
7721                 size_t MAYBE_UNUSED(old) = max + termlen;
7722                 max = offset + tlen + (send - s);
7723                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7724                 t = buf + offset;
7725             }
7726             rb_enc_mbcput(c, t, enc);
7727             if (may_modify && memcmp(s, t, tlen) != 0) {
7728                 modify = 1;
7729             }
7730             CHECK_IF_ASCII(c);
7731             t += tlen;
7732         }
7733         if (!STR_EMBED_P(str)) {
7734             ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7735         }
7736         TERM_FILL((char *)t, termlen);
7737         RSTRING(str)->as.heap.ptr = (char *)buf;
7738         RSTRING(str)->as.heap.len = t - buf;
7739         STR_SET_NOEMBED(str);
7740         RSTRING(str)->as.heap.aux.capa = max;
7741     }
7742     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7743         while (s < send) {
7744             c = (unsigned char)*s;
7745             if (trans[c] != errc) {
7746                 if (!cflag) {
7747                     c = trans[c];
7748                     *s = c;
7749                     modify = 1;
7750                 }
7751                 else {
7752                     *s = last;
7753                     modify = 1;
7754                 }
7755             }
7756             CHECK_IF_ASCII(c);
7757             s++;
7758         }
7759     }
7760     else {
7761         int clen, tlen;
7762         long offset, max = (long)((send - s) * 1.2);
7763         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7764
7765         while (s < send) {
7766             int may_modify = 0;
7767             c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7768             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7769
7770             if (c < 256) {
7771                 c = trans[c];
7772             }
7773             else if (hash) {
7774                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7775                 if (NIL_P(tmp)) {
7776                     if (cflag) c = last;
7777                     else c = errc;
7778                 }
7779                 else if (cflag) c = errc;
7780                 else c = NUM2INT(tmp);
7781             }
7782             else {
7783                 c = cflag ? last : errc;
7784             }
7785             if (c != errc) {
7786                 tlen = rb_enc_codelen(c, enc);
7787                 modify = 1;
7788             }
7789             else {
7790                 c = c0;
7791                 if (enc != e1) may_modify = 1;
7792             }
7793             if ((offset = t - buf) + tlen > max) {
7794                 size_t MAYBE_UNUSED(old) = max + termlen;
7795                 max = offset + tlen + (long)((send - s) * 1.2);
7796                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7797                 t = buf + offset;
7798             }
7799             if (s != t) {
7800                 rb_enc_mbcput(c, t, enc);
7801                 if (may_modify && memcmp(s, t, tlen) != 0) {
7802                     modify = 1;
7803                 }
7804             }
7805             CHECK_IF_ASCII(c);
7806             s += clen;
7807             t += tlen;
7808         }
7809         if (!STR_EMBED_P(str)) {
7810             ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7811         }
7812         TERM_FILL((char *)t, termlen);
7813         RSTRING(str)->as.heap.ptr = (char *)buf;
7814         RSTRING(str)->as.heap.len = t - buf;
7815         STR_SET_NOEMBED(str);
7816         RSTRING(str)->as.heap.aux.capa = max;
7817     }
7818
7819     if (modify) {
7820         if (cr != ENC_CODERANGE_BROKEN)
7821             ENC_CODERANGE_SET(str, cr);
7822         rb_enc_associate(str, enc);
7823         return str;
7824     }
7825     return Qnil;
7826 }
7827
7828
7829 /*
7830  *  call-seq:
7831  *     str.tr!(from_str, to_str)   -> str or nil
7832  *
7833  *  Translates <i>str</i> in place, using the same rules as
7834  *  String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7835  *  were made.
7836  */
7837
7838 static VALUE
7839 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7840 {
7841     return tr_trans(str, src, repl, 0);
7842 }
7843
7844
7845 /*
7846  *  call-seq:
7847  *     str.tr(from_str, to_str)   => new_str
7848  *
7849  *  Returns a copy of +str+ with the characters in +from_str+ replaced by the
7850  *  corresponding characters in +to_str+.  If +to_str+ is shorter than
7851  *  +from_str+, it is padded with its last character in order to maintain the
7852  *  correspondence.
7853  *
7854  *     "hello".tr('el', 'ip')      #=> "hippo"
7855  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
7856  *     "hello".tr('aeiou', 'AA*')  #=> "hAll*"
7857  *
7858  *  Both strings may use the <code>c1-c2</code> notation to denote ranges of
7859  *  characters, and +from_str+ may start with a <code>^</code>, which denotes
7860  *  all characters except those listed.
7861  *
7862  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
7863  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
7864  *
7865  *  The backslash character <code>\\</code> can be used to escape
7866  *  <code>^</code> or <code>-</code> and is otherwise ignored unless it
7867  *  appears at the end of a range or the end of the +from_str+ or +to_str+:
7868  *
7869  *     "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7870  *     "hello-world".tr("a\\-eo", "*")   #=> "h*ll**w*rld"
7871  *
7872  *     "hello\r\nworld".tr("\r", "")   #=> "hello\nworld"
7873  *     "hello\r\nworld".tr("\\r", "")  #=> "hello\r\nwold"
7874  *     "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7875  *
7876  *     "X['\\b']".tr("X\\", "")   #=> "['b']"
7877  *     "X['\\b']".tr("X-\\]", "") #=> "'b'"
7878  */
7879
7880 static VALUE
7881 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7882 {
7883     str = str_duplicate(rb_cString, str);
7884     tr_trans(str, src, repl, 0);
7885     return str;
7886 }
7887
/* number of byte-indexed entries in a tr table */
#define TR_TABLE_MAX (UCHAR_MAX+1)
/* one extra slot, at index TR_TABLE_MAX, holds a flag instead of an entry */
#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds the character specification +str+ into the selection table
 * +stable+ and the hashes behind +tablep+ / +ctablep+.
 *
 * +stable+ holds one flag per code point below TR_TABLE_MAX; the slot at
 * index TR_TABLE_MAX is set to the negation flag on the first call and
 * cleared by a later non-negated specification.  Each call ANDs the set
 * described by +str+ into the existing entries.  Code points of
 * TR_TABLE_MAX and above are recorded in a hash: the one stored through
 * +ctablep+ for a negated specification, through +tablep+ otherwise.
 *
 * +first+ must be non-zero for the first specification, which initializes
 * +stable+.
 */
static void
tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
               VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
{
    const unsigned int errc = -1;
    char buf[TR_TABLE_MAX];
    struct tr tr;
    unsigned int c;
    VALUE table = 0, ptable = 0;
    int i, l, cflag = 0;

    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
    tr.gen = tr.now = tr.max = 0;

    /* a leading '^' on a specification longer than one byte negates it */
    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
        cflag = 1;
        tr.p += l;
    }
    if (first) {
        /* start from "everything selected" so the AND below narrows it */
        for (i=0; i<TR_TABLE_MAX; i++) {
            stable[i] = 1;
        }
        stable[TR_TABLE_MAX] = cflag;
    }
    else if (stable[TR_TABLE_MAX] && !cflag) {
        stable[TR_TABLE_MAX] = 0;
    }
    /* buf describes this specification alone: listed entries get !cflag */
    for (i=0; i<TR_TABLE_MAX; i++) {
        buf[i] = cflag;
    }

    while ((c = trnext(&tr, enc)) != errc) {
        if (c < TR_TABLE_MAX) {
            buf[(unsigned char)c] = !cflag;
        }
        else {
            VALUE key = UINT2NUM(c);

            /* pick the hash for this specification once, on the first
             * wide code point */
            if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
                if (cflag) {
                    /* negated: keep adding to the existing hash, if any */
                    ptable = *ctablep;
                    table = ptable ? ptable : rb_hash_new();
                    *ctablep = table;
                }
                else {
                    /* plain: start a new hash; the previous one is only
                     * consulted for membership below */
                    table = rb_hash_new();
                    ptable = *tablep;
                    *tablep = table;
                }
            }
            if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
                rb_hash_aset(table, key, Qtrue);
            }
        }
    }
    /* AND this specification into the accumulated byte table */
    for (i=0; i<TR_TABLE_MAX; i++) {
        stable[i] = stable[i] && buf[i];
    }
    /* a plain specification that recorded no wide code point clears the
     * hash behind tablep */
    if (!table && !cflag) {
        *tablep = 0;
    }
}
7952
7953
7954 static int
7955 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7956 {
7957     if (c < TR_TABLE_MAX) {
7958         return table[c] != 0;
7959     }
7960     else {
7961         VALUE v = UINT2NUM(c);
7962
7963         if (del) {
7964             if (!NIL_P(rb_hash_lookup(del, v)) &&
7965                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7966                 return TRUE;
7967             }
7968         }
7969         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7970             return FALSE;
7971         }
7972         return table[TR_TABLE_MAX] ? TRUE : FALSE;
7973     }
7974 }
7975
7976 /*
7977  *  call-seq:
7978  *     str.delete!([other_str]+)   -> str or nil
7979  *
7980  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7981  *  <code>nil</code> if <i>str</i> was not modified.
7982  */
7983
7984 static VALUE
7985 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7986 {
7987     char squeez[TR_TABLE_SIZE];
7988     rb_encoding *enc = 0;
7989     char *s, *send, *t;
7990     VALUE del = 0, nodel = 0;
7991     int modify = 0;
7992     int i, ascompat, cr;
7993
7994     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7995     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
7996     for (i=0; i<argc; i++) {
7997         VALUE s = argv[i];
7998
7999         StringValue(s);
8000         enc = rb_enc_check(str, s);
8001         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8002     }
8003
8004     str_modify_keep_cr(str);
8005     ascompat = rb_enc_asciicompat(enc);
8006     s = t = RSTRING_PTR(str);
8007     send = RSTRING_END(str);
8008     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8009     while (s < send) {
8010         unsigned int c;
8011         int clen;
8012
8013         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8014             if (squeez[c]) {
8015                 modify = 1;
8016             }
8017             else {
8018                 if (t != s) *t = c;
8019                 t++;
8020             }
8021             s++;
8022         }
8023         else {
8024             c = rb_enc_codepoint_len(s, send, &clen, enc);
8025
8026             if (tr_find(c, squeez, del, nodel)) {
8027                 modify = 1;
8028             }
8029             else {
8030                 if (t != s) rb_enc_mbcput(c, t, enc);
8031                 t += clen;
8032                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8033             }
8034             s += clen;
8035         }
8036     }
8037     TERM_FILL(t, TERM_LEN(str));
8038     STR_SET_LEN(str, t - RSTRING_PTR(str));
8039     ENC_CODERANGE_SET(str, cr);
8040
8041     if (modify) return str;
8042     return Qnil;
8043 }
8044
8045
8046 /*
8047  *  call-seq:
8048  *     str.delete([other_str]+)   -> new_str
8049  *
8050  *  Returns a copy of <i>str</i> with all characters in the intersection of its
8051  *  arguments deleted. Uses the same rules for building the set of characters as
8052  *  String#count.
8053  *
8054  *     "hello".delete "l","lo"        #=> "heo"
8055  *     "hello".delete "lo"            #=> "he"
8056  *     "hello".delete "aeiou", "^e"   #=> "hell"
8057  *     "hello".delete "ej-m"          #=> "ho"
8058  */
8059
8060 static VALUE
8061 rb_str_delete(int argc, VALUE *argv, VALUE str)
8062 {
8063     str = str_duplicate(rb_cString, str);
8064     rb_str_delete_bang(argc, argv, str);
8065     return str;
8066 }
8067
8068
8069 /*
8070  *  call-seq:
8071  *     str.squeeze!([other_str]*)   -> str or nil
8072  *
8073  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
8074  *  <code>nil</code> if no changes were made.
8075  */
8076
8077 static VALUE
8078 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8079 {
8080     char squeez[TR_TABLE_SIZE];
8081     rb_encoding *enc = 0;
8082     VALUE del = 0, nodel = 0;
8083     unsigned char *s, *send, *t;
8084     int i, modify = 0;
8085     int ascompat, singlebyte = single_byte_optimizable(str);
8086     unsigned int save;
8087
8088     if (argc == 0) {
8089         enc = STR_ENC_GET(str);
8090     }
8091     else {
8092         for (i=0; i<argc; i++) {
8093             VALUE s = argv[i];
8094
8095             StringValue(s);
8096             enc = rb_enc_check(str, s);
8097             if (singlebyte && !single_byte_optimizable(s))
8098                 singlebyte = 0;
8099             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8100         }
8101     }
8102
8103     str_modify_keep_cr(str);
8104     s = t = (unsigned char *)RSTRING_PTR(str);
8105     if (!s || RSTRING_LEN(str) == 0) return Qnil;
8106     send = (unsigned char *)RSTRING_END(str);
8107     save = -1;
8108     ascompat = rb_enc_asciicompat(enc);
8109
8110     if (singlebyte) {
8111         while (s < send) {
8112             unsigned int c = *s++;
8113             if (c != save || (argc > 0 && !squeez[c])) {
8114                 *t++ = save = c;
8115             }
8116         }
8117     }
8118     else {
8119         while (s < send) {
8120             unsigned int c;
8121             int clen;
8122
8123             if (ascompat && (c = *s) < 0x80) {
8124                 if (c != save || (argc > 0 && !squeez[c])) {
8125                     *t++ = save = c;
8126                 }
8127                 s++;
8128             }
8129             else {
8130                 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8131
8132                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8133                     if (t != s) rb_enc_mbcput(c, t, enc);
8134                     save = c;
8135                     t += clen;
8136                 }
8137                 s += clen;
8138             }
8139         }
8140     }
8141
8142     TERM_FILL((char *)t, TERM_LEN(str));
8143     if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8144         STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8145         modify = 1;
8146     }
8147
8148     if (modify) return str;
8149     return Qnil;
8150 }
8151
8152
8153 /*
8154  *  call-seq:
8155  *     str.squeeze([other_str]*)    -> new_str
8156  *
8157  *  Builds a set of characters from the <i>other_str</i> parameter(s)
8158  *  using the procedure described for String#count. Returns a new
8159  *  string where runs of the same character that occur in this set are
8160  *  replaced by a single character. If no arguments are given, all
8161  *  runs of identical characters are replaced by a single character.
8162  *
8163  *     "yellow moon".squeeze                  #=> "yelow mon"
8164  *     "  now   is  the".squeeze(" ")         #=> " now is the"
8165  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
8166  */
8167
8168 static VALUE
8169 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8170 {
8171     str = str_duplicate(rb_cString, str);
8172     rb_str_squeeze_bang(argc, argv, str);
8173     return str;
8174 }
8175
8176
8177 /*
8178  *  call-seq:
8179  *     str.tr_s!(from_str, to_str)   -> str or nil
8180  *
8181  *  Performs String#tr_s processing on <i>str</i> in place,
8182  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
8183  */
8184
static VALUE
rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
{
    /* Thin wrapper: forwards to tr_trans with its last argument set to 1
     * (presumably the "squeeze" flag -- confirm against tr_trans) and
     * returns whatever tr_trans returns. */
    return tr_trans(str, src, repl, 1);
}
8190
8191
8192 /*
8193  *  call-seq:
8194  *     str.tr_s(from_str, to_str)   -> new_str
8195  *
8196  *  Processes a copy of <i>str</i> as described under String#tr, then
8197  *  removes duplicate characters in regions that were affected by the
8198  *  translation.
8199  *
8200  *     "hello".tr_s('l', 'r')     #=> "hero"
8201  *     "hello".tr_s('el', '*')    #=> "h*o"
8202  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
8203  */
8204
8205 static VALUE
8206 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8207 {
8208     str = str_duplicate(rb_cString, str);
8209     tr_trans(str, src, repl, 1);
8210     return str;
8211 }
8212
8213
8214 /*
8215  *  call-seq:
8216  *     str.count([other_str]+)   -> integer
8217  *
8218  *  Each +other_str+ parameter defines a set of characters to count.  The
8219  *  intersection of these sets defines the characters to count in +str+.  Any
8220  *  +other_str+ that starts with a caret <code>^</code> is negated.  The
8221  *  sequence <code>c1-c2</code> means all characters between c1 and c2.  The
8222  *  backslash character <code>\\</code> can be used to escape <code>^</code> or
8223  *  <code>-</code> and is otherwise ignored unless it appears at the end of a
8224  *  sequence or the end of a +other_str+.
8225  *
8226  *     a = "hello world"
8227  *     a.count "lo"                   #=> 5
8228  *     a.count "lo", "o"              #=> 2
8229  *     a.count "hello", "^l"          #=> 4
8230  *     a.count "ej-m"                 #=> 4
8231  *
8232  *     "hello^world".count "\\^aeiou" #=> 4
8233  *     "hello-world".count "a\\-eo"   #=> 4
8234  *
8235  *     c = "hello world\\r\\n"
8236  *     c.count "\\"                   #=> 2
8237  *     c.count "\\A"                  #=> 0
8238  *     c.count "X-\\w"                #=> 3
8239  */
8240
8241 static VALUE
8242 rb_str_count(int argc, VALUE *argv, VALUE str)
8243 {
8244     char table[TR_TABLE_SIZE];
8245     rb_encoding *enc = 0;
8246     VALUE del = 0, nodel = 0, tstr;
8247     char *s, *send;
8248     int i;
8249     int ascompat;
8250     size_t n = 0;
8251
8252     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8253
8254     tstr = argv[0];
8255     StringValue(tstr);
8256     enc = rb_enc_check(str, tstr);
8257     if (argc == 1) {
8258         const char *ptstr;
8259         if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8260             (ptstr = RSTRING_PTR(tstr),
8261              ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8262             !is_broken_string(str)) {
8263             int clen;
8264             unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8265
8266             s = RSTRING_PTR(str);
8267             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8268             send = RSTRING_END(str);
8269             while (s < send) {
8270                 if (*(unsigned char*)s++ == c) n++;
8271             }
8272             return SIZET2NUM(n);
8273         }
8274     }
8275
8276     tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8277     for (i=1; i<argc; i++) {
8278         tstr = argv[i];
8279         StringValue(tstr);
8280         enc = rb_enc_check(str, tstr);
8281         tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8282     }
8283
8284     s = RSTRING_PTR(str);
8285     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8286     send = RSTRING_END(str);
8287     ascompat = rb_enc_asciicompat(enc);
8288     while (s < send) {
8289         unsigned int c;
8290
8291         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8292             if (table[c]) {
8293                 n++;
8294             }
8295             s++;
8296         }
8297         else {
8298             int clen;
8299             c = rb_enc_codepoint_len(s, send, &clen, enc);
8300             if (tr_find(c, table, del, nodel)) {
8301                 n++;
8302             }
8303             s += clen;
8304         }
8305     }
8306
8307     return SIZET2NUM(n);
8308 }
8309
8310 static VALUE
8311 rb_fs_check(VALUE val)
8312 {
8313     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8314         val = rb_check_string_type(val);
8315         if (NIL_P(val)) return 0;
8316     }
8317     return val;
8318 }
8319
/*
 * Byte-indexed lookup table: 1 for the six bytes treated as whitespace
 * (0x09..0x0d and 0x20), 0 for every other byte value.  Entries that are
 * not listed are zero-initialized.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* \t */
    [0x0a] = 1, /* \n */
    [0x0b] = 1, /* \v */
    [0x0c] = 1, /* \f */
    [0x0d] = 1, /* \r */
    [0x20] = 1, /* space */
};

/* Table lookup; the argument is narrowed to unsigned char before indexing. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8340
8341 static long
8342 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8343 {
8344     if (empty_count >= 0 && len == 0) {
8345         return empty_count + 1;
8346     }
8347     if (empty_count > 0) {
8348         /* make different substrings */
8349         if (result) {
8350             do {
8351                 rb_ary_push(result, str_new_empty_String(str));
8352             } while (--empty_count > 0);
8353         }
8354         else {
8355             do {
8356                 rb_yield(str_new_empty_String(str));
8357             } while (--empty_count > 0);
8358         }
8359     }
8360     str = rb_str_subseq(str, beg, len);
8361     if (result) {
8362         rb_ary_push(result, str);
8363     }
8364     else {
8365         rb_yield(str);
8366     }
8367     return empty_count;
8368 }
8369
/* Splitting strategy selected by rb_str_split_m. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string separator */
    SPLIT_TYPE_REGEXP,  /* split where a regexp matches */
    SPLIT_TYPE_CHARS    /* empty separator: split into single characters */
} split_type_t;
8373
8374 static split_type_t
8375 literal_split_pattern(VALUE spat, split_type_t default_type)
8376 {
8377     rb_encoding *enc = STR_ENC_GET(spat);
8378     const char *ptr;
8379     long len;
8380     RSTRING_GETMEM(spat, ptr, len);
8381     if (len == 0) {
8382         /* Special case - split into chars */
8383         return SPLIT_TYPE_CHARS;
8384     }
8385     else if (rb_enc_asciicompat(enc)) {
8386         if (len == 1 && ptr[0] == ' ') {
8387             return SPLIT_TYPE_AWK;
8388         }
8389     }
8390     else {
8391         int l;
8392         if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8393             return SPLIT_TYPE_AWK;
8394         }
8395     }
8396     return default_type;
8397 }
8398
8399 /*
8400  *  call-seq:
8401  *     str.split(pattern=nil, [limit])                -> an_array
8402  *     str.split(pattern=nil, [limit]) {|sub| block } -> str
8403  *
8404  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
8405  *  of these substrings.
8406  *
8407  *  If <i>pattern</i> is a String, then its contents are used as
8408  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8409  *  space, <i>str</i> is split on whitespace, with leading and trailing
8410  *  whitespace and runs of contiguous whitespace characters ignored.
8411  *
8412  *  If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8413  *  pattern matches. Whenever the pattern matches a zero-length string,
8414  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
8415  *  groups, the respective matches will be returned in the array as well.
8416  *
8417  *  If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8418  *  If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8419  *  split on whitespace as if ' ' were specified.
8420  *
8421  *  If the <i>limit</i> parameter is omitted, trailing null fields are
8422  *  suppressed. If <i>limit</i> is a positive number, at most that number
8423  *  of split substrings will be returned (captured groups will be returned
8424  *  as well, but are not counted towards the limit).
8425  *  If <i>limit</i> is <code>1</code>, the entire
8426  *  string is returned as the only entry in an array. If negative, there is no
8427  *  limit to the number of fields returned, and trailing null fields are not
8428  *  suppressed.
8429  *
8430  *  When the input +str+ is empty an empty Array is returned as the string is
8431  *  considered to have no fields to split.
8432  *
8433  *     " now's  the time ".split       #=> ["now's", "the", "time"]
8434  *     " now's  the time ".split(' ')  #=> ["now's", "the", "time"]
8435  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
8436  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8437  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
8438  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
8439  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
8440  *
8441  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
8442  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
8443  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
8444  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
8445  *
8446  *     "1:2:3".split(/(:)()()/, 2)     #=> ["1", ":", "", "", "2:3"]
8447  *
8448  *     "".split(',', -1)               #=> []
8449  *
8450  *  If a block is given, invoke the block with each split substring.
8451  *
8452  */
8453
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    split_type_t split_type;
    /* i counts fields against lim; empty_count == -1 means empty fields are
     * emitted immediately, >= 0 means they are deferred (see split_string) */
    long beg, end, i = 0, empty_count = -1;
    int lim = 0;
    VALUE result, tmp;

    /* Qfalse: a block is given and fields are yielded; Qnil: placeholder
     * that is replaced by a real array further down */
    result = rb_block_given_p() ? Qfalse : Qnil;
    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        /* a non-positive limit disables the limit checks below; lim keeps
         * its sign for the final "lim < 0" test */
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            /* limit 1: the whole string is the only field */
            if (RSTRING_LEN(str) == 0)
                return result ? rb_ary_new2(0) : str;
            tmp = str_duplicate(rb_cString, str);
            if (!result) {
                rb_yield(tmp);
                return str;
            }
            return rb_ary_new3(1, tmp);
        }
        i = 1;
    }
    /* limit omitted or 0: defer empty fields so trailing ones are dropped */
    if (NIL_P(limit) && !lim) empty_count = 0;

    enc = STR_ENC_GET(str);
    split_type = SPLIT_TYPE_REGEXP;
    if (!NIL_P(spat)) {
        spat = get_pat_quoted(spat, 0);
    }
    else if (NIL_P(spat = rb_fs)) {
        /* no pattern and no $;: whitespace splitting */
        split_type = SPLIT_TYPE_AWK;
    }
    else if (!(spat = rb_fs_check(spat))) {
        rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
    }
    else {
        rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
    }
    if (split_type != SPLIT_TYPE_AWK) {
        switch (BUILTIN_TYPE(spat)) {
          case T_REGEXP:
            rb_reg_options(spat); /* check if uninitialized */
            tmp = RREGEXP_SRC(spat);
            split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
            if (split_type == SPLIT_TYPE_AWK) {
                /* a regexp whose source is a single space is split as the
                 * literal string " ", not in awk mode */
                spat = tmp;
                split_type = SPLIT_TYPE_STRING;
            }
            break;

          case T_STRING:
            mustnot_broken(spat);
            split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
            break;

          default:
            UNREACHABLE_RETURN(Qnil);
        }
    }

/* emit one field and keep the deferred-empty-field counter up to date */
#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))

    if (result) result = rb_ary_new();
    beg = 0;
    char *ptr = RSTRING_PTR(str);
    char *eptr = RSTRING_END(str);
    if (split_type == SPLIT_TYPE_AWK) {
        char *bptr = ptr;
        /* skip != 0 while scanning whitespace between fields */
        int skip = 1;
        unsigned int c;

        end = beg;
        if (is_ascii_string(str)) {
            /* byte-wise scan using the ASCII whitespace table */
            while (ptr < eptr) {
                c = (unsigned char)*ptr++;
                if (skip) {
                    if (ascii_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        /* limit reached: the rest is emitted as one field
                         * after the loop */
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (ascii_isspace(c)) {
                    SPLIT_STR(beg, end-beg);
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
        else {
            /* same state machine, advancing one encoded character at a time */
            while (ptr < eptr) {
                int n;

                c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
                ptr += n;
                if (skip) {
                    if (rb_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (rb_isspace(c)) {
                    SPLIT_STR(beg, end-beg);
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else if (split_type == SPLIT_TYPE_STRING) {
        char *str_start = ptr;
        char *substr_start = ptr;
        char *sptr = RSTRING_PTR(spat);
        long slen = RSTRING_LEN(spat);

        mustnot_broken(str);
        enc = rb_enc_check(str, spat);
        while (ptr < eptr &&
               (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
            /* Check we are at the start of a char */
            char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
            if (t != ptr + end) {
                /* hit inside a multibyte character: resume the search at
                 * the next character head without emitting anything */
                ptr = t;
                continue;
            }
            SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
            ptr += end + slen;
            substr_start = ptr;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - str_start;
    }
    else if (split_type == SPLIT_TYPE_CHARS) {
        char *str_start = ptr;
        int n;

        mustnot_broken(str);
        enc = rb_enc_get(str);
        /* one field per character */
        while (ptr < eptr &&
               (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
            SPLIT_STR(ptr - str_start, n);
            ptr += n;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - str_start;
    }
    else {
        /* regexp splitting */
        long len = RSTRING_LEN(str);
        long start = beg;
        long idx;
        /* set after an empty match at the search position was skipped once */
        int last_null = 0;
        struct re_registers *regs;
        VALUE match = 0;

        /* the third for-clause un-busies the match and restores it as the
         * backref after each completed iteration */
        for (; rb_reg_search(spat, str, start, 0) >= 0;
             (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
            match = rb_backref_get();
            /* only done when yielding; presumably protects the match data
             * from being reused while the block runs -- TODO confirm */
            if (!result) rb_match_busy(match);
            regs = RMATCH_REGS(match);
            end = BEG(0);
            if (start == end && BEG(0) == END(0)) {
                /* empty match exactly at the search position */
                if (!ptr) {
                    SPLIT_STR(0, 0);
                    break;
                }
                else if (last_null == 1) {
                    /* second empty match in a row: emit one character */
                    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
                    beg = start;
                }
                else {
                    /* first empty match: retry one character further on */
                    if (start == len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                SPLIT_STR(beg, end-beg);
                beg = start = END(0);
            }
            last_null = 0;

            /* captured groups are emitted as fields too; groups that did
             * not participate in the match are skipped */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        if (match) rb_match_unbusy(match);
    }
    /* emit the remainder: always when a positive limit is in effect or lim
     * is negative, otherwise only if something is left after beg */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        SPLIT_STR(beg, RSTRING_LEN(str)-beg);
    }

    return result ? result : str;
}
8672
8673 VALUE
8674 rb_str_split(VALUE str, const char *sep0)
8675 {
8676     VALUE sep;
8677
8678     StringValue(str);
8679     sep = rb_str_new_cstr(sep0);
8680     return rb_str_split_m(1, &sep, str);
8681 }
8682
/*
 * Expands to a new array with capacity +size+ when no block is given, and
 * to 0 when a block is given.  The +m+ argument is not used by the
 * expansion.
 */
#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8684
8685 static inline int
8686 enumerator_element(VALUE ary, VALUE e)
8687 {
8688     if (ary) {
8689         rb_ary_push(ary, e);
8690         return 0;
8691     }
8692     else {
8693         rb_yield(e);
8694         return 1;
8695     }
8696 }
8697
8698 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8699
8700 static const char *
8701 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8702 {
8703     const char *prev = rb_enc_prev_char(p, e, e, enc);
8704     if (rb_enc_is_newline(prev, e, enc)) {
8705         e = prev;
8706         prev = rb_enc_prev_char(p, e, e, enc);
8707         if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8708             e = prev;
8709     }
8710     return e;
8711 }
8712
8713 static VALUE
8714 get_rs(void)
8715 {
8716     VALUE rs = rb_rs;
8717     if (!NIL_P(rs) &&
8718         (!RB_TYPE_P(rs, T_STRING) ||
8719          RSTRING_LEN(rs) != 1 ||
8720          RSTRING_PTR(rs)[0] != '\n')) {
8721         rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8722     }
8723     return rs;
8724 }
8725
8726 #define rb_rs get_rs()
8727
/*
 * Shared implementation of line enumeration.  Splits +str+ on the record
 * separator taken from the arguments (or rb_rs when none is given) and
 * delivers each line through ENUM_ELEM: pushed onto +ary+ when +ary+ is
 * set, yielded otherwise.  Returns +ary+ when it is set and the original
 * string otherwise.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    /* set when the separator is a single newline character */
    int rsnewline = 0;

    /* one optional positional separator plus keyword options */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* from here on chomp is used as a plain truth value */
        chomp = (chomp != Qundef && RTEST(chomp));
    }

    if (NIL_P(rs)) {
        /* nil separator: the whole string is delivered as one line */
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* iterate over a frozen version of the string; ptr/len are what
     * str_mod_check is handed after each yield */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        /* end of the most recent newline that followed paragraph text */
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            do {
                /* n is the length of a leading '\r', or 0 if there is none */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* a newline right after a newline ends the paragraph;
                     * rslen still holds this newline's length here */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) eol = subend;
                }
                else {
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? 0 : rslen));
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            /* subptr == NULL: the next paragraph has not started yet */
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* default separator with a non-ASCII-compatible string: re-encode the
     * separator into the string's encoding before searching */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            /* the byte match is not on a character boundary: search again
             * from the next character head */
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* whatever follows the last separator is the final line */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
8862
8863 /*
8864  *  call-seq:
8865  *     str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8866  *     str.each_line(separator=$/, chomp: false)                   -> an_enumerator
8867  *
8868  *  Splits <i>str</i> using the supplied parameter as the record
8869  *  separator (<code>$/</code> by default), passing each substring in
8870  *  turn to the supplied block.  If a zero-length record separator is
8871  *  supplied, the string is split into paragraphs delimited by
8872  *  multiple successive newlines.
8873  *
8874  *  If +chomp+ is +true+, +separator+ will be removed from the end of each
8875  *  line.
8876  *
8877  *  If no block is given, an enumerator is returned instead.
8878  *
8879  *     "hello\nworld".each_line {|s| p s}
8880  *     # prints:
8881  *     #   "hello\n"
8882  *     #   "world"
8883  *
8884  *     "hello\nworld".each_line('l') {|s| p s}
8885  *     # prints:
8886  *     #   "hel"
8887  *     #   "l"
8888  *     #   "o\nworl"
8889  *     #   "d"
8890  *
8891  *     "hello\n\n\nworld".each_line('') {|s| p s}
8892  *     # prints
8893  *     #   "hello\n\n"
8894  *     #   "world"
8895  *
8896  *     "hello\nworld".each_line(chomp: true) {|s| p s}
8897  *     # prints:
8898  *     #   "hello"
8899  *     #   "world"
8900  *
8901  *     "hello\nworld".each_line('l', chomp: true) {|s| p s}
8902  *     # prints:
8903  *     #   "he"
8904  *     #   ""
8905  *     #   "o\nwor"
8906  *     #   "d"
8907  *
8908  */
8909
static VALUE
rb_str_each_line(int argc, VALUE *argv, VALUE str)
{
    /* Without a block an enumerator is returned here; the trailing 0 means
     * no size function is supplied (presumably leaving the enumerator's
     * size unknown -- confirm against RETURN_SIZED_ENUMERATOR). */
    RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
    /* ary == 0: every line is yielded instead of being collected */
    return rb_str_enumerate_lines(argc, argv, str, 0);
}
8916
8917 /*
8918  *  call-seq:
8919  *     str.lines(separator=$/, chomp: false)  -> an_array
8920  *
8921  *  Returns an array of lines in <i>str</i> split using the supplied
8922  *  record separator (<code>$/</code> by default).  This is a
 *  shorthand for <code>str.each_line(separator, chomp: chomp).to_a</code>.
8924  *
8925  *  If +chomp+ is +true+, +separator+ will be removed from the end of each
8926  *  line.
8927  *
8928  *     "hello\nworld\n".lines              #=> ["hello\n", "world\n"]
8929  *     "hello  world".lines(' ')           #=> ["hello ", " ", "world"]
8930  *     "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8931  *
8932  *  If a block is given, which is a deprecated form, works the same as
8933  *  <code>each_line</code>.
8934  */
8935
8936 static VALUE
8937 rb_str_lines(int argc, VALUE *argv, VALUE str)
8938 {
8939     VALUE ary = WANTARRAY("lines", 0);
8940     return rb_str_enumerate_lines(argc, argv, str, ary);
8941 }
8942
8943 static VALUE
8944 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8945 {
8946     return LONG2FIX(RSTRING_LEN(str));
8947 }
8948
8949 static VALUE
8950 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8951 {
8952     long i;
8953
8954     for (i=0; i<RSTRING_LEN(str); i++) {
8955         ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
8956     }
8957     if (ary)
8958         return ary;
8959     else
8960         return str;
8961 }
8962
8963 /*
8964  *  call-seq:
8965  *     str.each_byte {|integer| block }    -> str
8966  *     str.each_byte                      -> an_enumerator
8967  *
8968  *  Passes each byte in <i>str</i> to the given block, or returns an
8969  *  enumerator if no block is given.
8970  *
8971  *     "hello".each_byte {|c| print c, ' ' }
8972  *
8973  *  <em>produces:</em>
8974  *
8975  *     104 101 108 108 111
8976  */
8977
8978 static VALUE
8979 rb_str_each_byte(VALUE str)
8980 {
8981     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8982     return rb_str_enumerate_bytes(str, 0);
8983 }
8984
8985 /*
8986  *  call-seq:
8987  *     str.bytes    -> an_array
8988  *
8989  *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
8990  *  <code>str.each_byte.to_a</code>.
8991  *
8992  *  If a block is given, which is a deprecated form, works the same as
8993  *  <code>each_byte</code>.
8994  */
8995
8996 static VALUE
8997 rb_str_bytes(VALUE str)
8998 {
8999     VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9000     return rb_str_enumerate_bytes(str, ary);
9001 }
9002
9003 static VALUE
9004 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9005 {
9006     return rb_str_length(str);
9007 }
9008
9009 static VALUE
9010 rb_str_enumerate_chars(VALUE str, VALUE ary)
9011 {
9012     VALUE orig = str;
9013     long i, len, n;
9014     const char *ptr;
9015     rb_encoding *enc;
9016
9017     str = rb_str_new_frozen(str);
9018     ptr = RSTRING_PTR(str);
9019     len = RSTRING_LEN(str);
9020     enc = rb_enc_get(str);
9021
9022     if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
9023         for (i = 0; i < len; i += n) {
9024             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9025             ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9026         }
9027     }
9028     else {
9029         for (i = 0; i < len; i += n) {
9030             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9031             ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9032         }
9033     }
9034     RB_GC_GUARD(str);
9035     if (ary)
9036         return ary;
9037     else
9038         return orig;
9039 }
9040
9041 /*
9042  *  call-seq:
9043  *     str.each_char {|cstr| block }    -> str
9044  *     str.each_char                    -> an_enumerator
9045  *
9046  *  Passes each character in <i>str</i> to the given block, or returns
9047  *  an enumerator if no block is given.
9048  *
9049  *     "hello".each_char {|c| print c, ' ' }
9050  *
9051  *  <em>produces:</em>
9052  *
9053  *     h e l l o
9054  */
9055
9056 static VALUE
9057 rb_str_each_char(VALUE str)
9058 {
9059     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9060     return rb_str_enumerate_chars(str, 0);
9061 }
9062
9063 /*
9064  *  call-seq:
9065  *     str.chars    -> an_array
9066  *
9067  *  Returns an array of characters in <i>str</i>.  This is a shorthand
9068  *  for <code>str.each_char.to_a</code>.
9069  *
9070  *  If a block is given, which is a deprecated form, works the same as
9071  *  <code>each_char</code>.
9072  */
9073
9074 static VALUE
9075 rb_str_chars(VALUE str)
9076 {
9077     VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9078     return rb_str_enumerate_chars(str, ary);
9079 }
9080
9081 static VALUE
9082 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9083 {
9084     VALUE orig = str;
9085     int n;
9086     unsigned int c;
9087     const char *ptr, *end;
9088     rb_encoding *enc;
9089
9090     if (single_byte_optimizable(str))
9091         return rb_str_enumerate_bytes(str, ary);
9092
9093     str = rb_str_new_frozen(str);
9094     ptr = RSTRING_PTR(str);
9095     end = RSTRING_END(str);
9096     enc = STR_ENC_GET(str);
9097
9098     while (ptr < end) {
9099         c = rb_enc_codepoint_len(ptr, end, &n, enc);
9100         ENUM_ELEM(ary, UINT2NUM(c));
9101         ptr += n;
9102     }
9103     RB_GC_GUARD(str);
9104     if (ary)
9105         return ary;
9106     else
9107         return orig;
9108 }
9109
9110 /*
9111  *  call-seq:
9112  *     str.each_codepoint {|integer| block }    -> str
9113  *     str.each_codepoint                       -> an_enumerator
9114  *
 *  Passes the Integer ordinal of each character in <i>str</i>
 *  (also known as a <i>codepoint</i> when applied to Unicode strings) to the
9117  *  given block.  For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9118  *  values are directly derived from the binary representation
9119  *  of each character.
9120  *
9121  *  If no block is given, an enumerator is returned instead.
9122  *
9123  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
9124  *
9125  *  <em>produces:</em>
9126  *
9127  *     104 101 108 108 111 1593
9128  */
9129
9130 static VALUE
9131 rb_str_each_codepoint(VALUE str)
9132 {
9133     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9134     return rb_str_enumerate_codepoints(str, 0);
9135 }
9136
9137 /*
9138  *  call-seq:
9139  *     str.codepoints   -> an_array
9140  *
9141  *  Returns an array of the Integer ordinals of the
9142  *  characters in <i>str</i>.  This is a shorthand for
9143  *  <code>str.each_codepoint.to_a</code>.
9144  *
9145  *  If a block is given, which is a deprecated form, works the same as
9146  *  <code>each_codepoint</code>.
9147  */
9148
9149 static VALUE
9150 rb_str_codepoints(VALUE str)
9151 {
9152     VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9153     return rb_str_enumerate_codepoints(str, ary);
9154 }
9155
9156 static regex_t *
9157 get_reg_grapheme_cluster(rb_encoding *enc)
9158 {
9159     int encidx = rb_enc_to_index(enc);
9160     regex_t *reg_grapheme_cluster = NULL;
9161     static regex_t *reg_grapheme_cluster_utf8 = NULL;
9162
9163     /* synchronize */
9164     if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9165         reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9166     }
9167     if (!reg_grapheme_cluster) {
9168         const OnigUChar source_ascii[] = "\\X";
9169         OnigErrorInfo einfo;
9170         const OnigUChar *source = source_ascii;
9171         size_t source_len = sizeof(source_ascii) - 1;
9172         switch (encidx) {
9173 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9174 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9175 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9176 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9177 #define CASE_UTF(e) \
9178           case ENCINDEX_UTF_##e: { \
9179             static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9180             source = source_UTF_##e; \
9181             source_len = sizeof(source_UTF_##e); \
9182             break; \
9183           }
9184             CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9185 #undef CASE_UTF
9186 #undef CHARS_16BE
9187 #undef CHARS_16LE
9188 #undef CHARS_32BE
9189 #undef CHARS_32LE
9190         }
9191         int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9192                          ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9193         if (r) {
9194             UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9195             onig_error_code_to_str(message, r, &einfo);
9196             rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9197         }
9198         if (encidx == rb_utf8_encindex()) {
9199             reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9200         }
9201     }
9202     return reg_grapheme_cluster;
9203 }
9204
9205 static VALUE
9206 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9207 {
9208     size_t grapheme_cluster_count = 0;
9209     regex_t *reg_grapheme_cluster = NULL;
9210     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9211     const char *ptr, *end;
9212
9213     if (!rb_enc_unicode_p(enc)) {
9214         return rb_str_length(str);
9215     }
9216
9217     reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9218     ptr = RSTRING_PTR(str);
9219     end = RSTRING_END(str);
9220
9221     while (ptr < end) {
9222         OnigPosition len = onig_match(reg_grapheme_cluster,
9223                                       (const OnigUChar *)ptr, (const OnigUChar *)end,
9224                                       (const OnigUChar *)ptr, NULL, 0);
9225         if (len <= 0) break;
9226         grapheme_cluster_count++;
9227         ptr += len;
9228     }
9229
9230     return SIZET2NUM(grapheme_cluster_count);
9231 }
9232
9233 static VALUE
9234 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9235 {
9236     VALUE orig = str;
9237     regex_t *reg_grapheme_cluster = NULL;
9238     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9239     const char *ptr0, *ptr, *end;
9240
9241     if (!rb_enc_unicode_p(enc)) {
9242         return rb_str_enumerate_chars(str, ary);
9243     }
9244
9245     if (!ary) str = rb_str_new_frozen(str);
9246     reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9247     ptr0 = ptr = RSTRING_PTR(str);
9248     end = RSTRING_END(str);
9249
9250     while (ptr < end) {
9251         OnigPosition len = onig_match(reg_grapheme_cluster,
9252                                       (const OnigUChar *)ptr, (const OnigUChar *)end,
9253                                       (const OnigUChar *)ptr, NULL, 0);
9254         if (len <= 0) break;
9255         ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9256         ptr += len;
9257     }
9258     RB_GC_GUARD(str);
9259     if (ary)
9260         return ary;
9261     else
9262         return orig;
9263 }
9264
9265 /*
9266  *  call-seq:
9267  *     str.each_grapheme_cluster {|cstr| block }    -> str
9268  *     str.each_grapheme_cluster                    -> an_enumerator
9269  *
9270  *  Passes each grapheme cluster in <i>str</i> to the given block, or returns
9271  *  an enumerator if no block is given.
9272  *  Unlike String#each_char, this enumerates by grapheme clusters defined by
9273  *  Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9274  *
9275  *     "a\u0300".each_char.to_a.size #=> 2
9276  *     "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9277  *
9278  */
9279
9280 static VALUE
9281 rb_str_each_grapheme_cluster(VALUE str)
9282 {
9283     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9284     return rb_str_enumerate_grapheme_clusters(str, 0);
9285 }
9286
9287 /*
9288  *  call-seq:
9289  *     str.grapheme_clusters   -> an_array
9290  *
9291  *  Returns an array of grapheme clusters in <i>str</i>.  This is a shorthand
9292  *  for <code>str.each_grapheme_cluster.to_a</code>.
9293  *
9294  *  If a block is given, which is a deprecated form, works the same as
9295  *  <code>each_grapheme_cluster</code>.
9296  */
9297
9298 static VALUE
9299 rb_str_grapheme_clusters(VALUE str)
9300 {
9301     VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9302     return rb_str_enumerate_grapheme_clusters(str, ary);
9303 }
9304
9305 static long
9306 chopped_length(VALUE str)
9307 {
9308     rb_encoding *enc = STR_ENC_GET(str);
9309     const char *p, *p2, *beg, *end;
9310
9311     beg = RSTRING_PTR(str);
9312     end = beg + RSTRING_LEN(str);
9313     if (beg >= end) return 0;
9314     p = rb_enc_prev_char(beg, end, end, enc);
9315     if (!p) return 0;
9316     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9317         p2 = rb_enc_prev_char(beg, p, end, enc);
9318         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9319     }
9320     return p - beg;
9321 }
9322
9323 /*
9324  *  call-seq:
9325  *     str.chop!   -> str or nil
9326  *
9327  *  Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9328  *  <code>nil</code> if <i>str</i> is the empty string.  See also
9329  *  String#chomp!.
9330  */
9331
9332 static VALUE
9333 rb_str_chop_bang(VALUE str)
9334 {
9335     str_modify_keep_cr(str);
9336     if (RSTRING_LEN(str) > 0) {
9337         long len;
9338         len = chopped_length(str);
9339         STR_SET_LEN(str, len);
9340         TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9341         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9342             ENC_CODERANGE_CLEAR(str);
9343         }
9344         return str;
9345     }
9346     return Qnil;
9347 }
9348
9349
9350 /*
9351  *  call-seq:
9352  *     str.chop   -> new_str
9353  *
9354  *  Returns a new String with the last character removed.  If the
9355  *  string ends with <code>\r\n</code>, both characters are
9356  *  removed. Applying <code>chop</code> to an empty string returns an
9357  *  empty string. String#chomp is often a safer alternative, as it
9358  *  leaves the string unchanged if it doesn't end in a record
9359  *  separator.
9360  *
9361  *     "string\r\n".chop   #=> "string"
9362  *     "string\n\r".chop   #=> "string\n"
9363  *     "string\n".chop     #=> "string"
9364  *     "string".chop       #=> "strin"
9365  *     "x".chop.chop       #=> ""
9366  */
9367
9368 static VALUE
9369 rb_str_chop(VALUE str)
9370 {
9371     return rb_str_subseq(str, 0, chopped_length(str));
9372 }
9373
/*
 * Computes the byte length that remains after removing one trailing line
 * terminator from the buffer [p, e): a newline, a newline preceded by
 * "\r", or a lone "\r".
 *
 * str is only used to look up the encoding; p points at the first byte and
 * e one past the last byte.  The byte-wise branch reads *(e-1)
 * unconditionally, so e > p is expected (chompped_length() returns early
 * for empty strings before calling this).
 */
static long
smart_chomp(VALUE str, const char *e, const char *p)
{
    rb_encoding *enc = rb_enc_get(str);
    if (rb_enc_mbminlen(enc) > 1) {
        /* encodings whose smallest character is wider than one byte:
         * locate the last character and drop it if it is a newline */
        const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
        if (rb_enc_is_newline(pp, e, enc)) {
            e = pp;
        }
        /* then drop a "\r" that is (now) the last character, if any */
        pp = e - rb_enc_mbminlen(enc);
        if (pp >= p) {
            pp = rb_enc_left_char_head(p, pp, e, enc);
            if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                e = pp;
            }
        }
    }
    else {
        switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
          case '\n':
            /* "\n", and the "\r" in front of it when present */
            if (--e > p && *(e-1) == '\r') {
                --e;
            }
            break;
          case '\r':
            /* a lone trailing "\r" */
            --e;
            break;
        }
    }
    return e - p;
}
9405
/*
 * Returns the byte length str would have after chomping the record
 * separator rs from its end.  str itself is not modified here.
 *
 *  - empty str: 0.
 *  - rs is rb_default_rs, or a single minimum-width character that is a
 *    newline: delegate to smart_chomp().
 *  - rs is empty: strip every trailing newline, each together with a "\r"
 *    directly in front of it.
 *  - otherwise: strip rs once when str ends with exactly those bytes and
 *    the match starts on a character boundary.
 *
 * When nothing is to be removed the original length is returned.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* empty separator: remove all trailing newlines */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                /* also drop a "\r" directly before the removed newline */
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    /* a separator longer than the string cannot be its suffix */
    if (rslen > len) return len;

    /* from here on enc is the separator's encoding */
    enc = rb_enc_get(rs);
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        /* separator is one minimum-width character: if it is a newline,
         * behave like the default separator */
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    /* presumably raises when str and rs have incompatible encodings --
     * rb_enc_check is defined elsewhere */
    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    /* cheap last-byte comparison first, full comparison only for longer rs */
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        /* only accept the match when pp is the start of a character */
        if (rb_enc_left_char_head(p, pp, e, enc) == pp)
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
9477
9478 /*!
9479  * Returns the separator for arguments of rb_str_chomp.
9480  *
9481  * @return returns rb_ps ($/) as default, the default value of rb_ps ($/) is "\n".
9482  */
9483 static VALUE
9484 chomp_rs(int argc, const VALUE *argv)
9485 {
9486     rb_check_arity(argc, 0, 1);
9487     if (argc > 0) {
9488         VALUE rs = argv[0];
9489         if (!NIL_P(rs)) StringValue(rs);
9490         return rs;
9491     }
9492     else {
9493         return rb_rs;
9494     }
9495 }
9496
9497 VALUE
9498 rb_str_chomp_string(VALUE str, VALUE rs)
9499 {
9500     long olen = RSTRING_LEN(str);
9501     long len = chompped_length(str, rs);
9502     if (len >= olen) return Qnil;
9503     str_modify_keep_cr(str);
9504     STR_SET_LEN(str, len);
9505     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9506     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9507         ENC_CODERANGE_CLEAR(str);
9508     }
9509     return str;
9510 }
9511
9512 /*
9513  *  call-seq:
9514  *     str.chomp!(separator=$/)   -> str or nil
9515  *
9516  *  Modifies <i>str</i> in place as described for String#chomp,
9517  *  returning <i>str</i>, or <code>nil</code> if no modifications were
9518  *  made.
9519  */
9520
9521 static VALUE
9522 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9523 {
9524     VALUE rs;
9525     str_modifiable(str);
9526     if (RSTRING_LEN(str) == 0) return Qnil;
9527     rs = chomp_rs(argc, argv);
9528     if (NIL_P(rs)) return Qnil;
9529     return rb_str_chomp_string(str, rs);
9530 }
9531
9532
9533 /*
9534  *  call-seq:
9535  *     str.chomp(separator=$/)   -> new_str
9536  *
9537  *  Returns a new String with the given record separator removed
9538  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
9539  *  changed from the default Ruby record separator, then <code>chomp</code> also
9540  *  removes carriage return characters (that is, it will remove <code>\n</code>,
9541  *  <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9542  *  it will remove all trailing newlines from the string.
9543  *
9544  *     "hello".chomp                #=> "hello"
9545  *     "hello\n".chomp              #=> "hello"
9546  *     "hello\r\n".chomp            #=> "hello"
9547  *     "hello\n\r".chomp            #=> "hello\n"
9548  *     "hello\r".chomp              #=> "hello"
9549  *     "hello \n there".chomp       #=> "hello \n there"
9550  *     "hello".chomp("llo")         #=> "he"
9551  *     "hello\r\n\r\n".chomp('')    #=> "hello"
9552  *     "hello\r\n\r\r\n".chomp('')  #=> "hello\r\n\r"
9553  */
9554
9555 static VALUE
9556 rb_str_chomp(int argc, VALUE *argv, VALUE str)
9557 {
9558     VALUE rs = chomp_rs(argc, argv);
9559     if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9560     return rb_str_subseq(str, 0, chompped_length(str, rs));
9561 }
9562
9563 static long
9564 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9565 {
9566     const char *const start = s;
9567
9568     if (!s || s >= e) return 0;
9569
9570     /* remove spaces at head */
9571     if (single_byte_optimizable(str)) {
9572         while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9573     }
9574     else {
9575         while (s < e) {
9576             int n;
9577             unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9578
9579             if (cc && !rb_isspace(cc)) break;
9580             s += n;
9581         }
9582     }
9583     return s - start;
9584 }
9585
9586 /*
9587  *  call-seq:
9588  *     str.lstrip!   -> self or nil
9589  *
9590  *  Removes leading whitespace from the receiver.
9591  *  Returns the altered receiver, or +nil+ if no change was made.
9592  *  See also String#rstrip! and String#strip!.
9593  *
9594  *  Refer to String#strip for the definition of whitespace.
9595  *
9596  *     "  hello  ".lstrip!  #=> "hello  "
9597  *     "hello  ".lstrip!    #=> nil
9598  *     "hello".lstrip!      #=> nil
9599  */
9600
9601 static VALUE
9602 rb_str_lstrip_bang(VALUE str)
9603 {
9604     rb_encoding *enc;
9605     char *start, *s;
9606     long olen, loffset;
9607
9608     str_modify_keep_cr(str);
9609     enc = STR_ENC_GET(str);
9610     RSTRING_GETMEM(str, start, olen);
9611     loffset = lstrip_offset(str, start, start+olen, enc);
9612     if (loffset > 0) {
9613         long len = olen-loffset;
9614         s = start + loffset;
9615         memmove(start, s, len);
9616         STR_SET_LEN(str, len);
9617         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9618         return str;
9619     }
9620     return Qnil;
9621 }
9622
9623
9624 /*
9625  *  call-seq:
9626  *     str.lstrip   -> new_str
9627  *
9628  *  Returns a copy of the receiver with leading whitespace removed.
9629  *  See also String#rstrip and String#strip.
9630  *
9631  *  Refer to String#strip for the definition of whitespace.
9632  *
9633  *     "  hello  ".lstrip   #=> "hello  "
9634  *     "hello".lstrip       #=> "hello"
9635  */
9636
9637 static VALUE
9638 rb_str_lstrip(VALUE str)
9639 {
9640     char *start;
9641     long len, loffset;
9642     RSTRING_GETMEM(str, start, len);
9643     loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9644     if (loffset <= 0) return str_duplicate(rb_cString, str);
9645     return rb_str_subseq(str, loffset, len - loffset);
9646 }
9647
9648 static long
9649 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9650 {
9651     const char *t;
9652
9653     rb_str_check_dummy_enc(enc);
9654     if (!s || s >= e) return 0;
9655     t = e;
9656
9657     /* remove trailing spaces or '\0's */
9658     if (single_byte_optimizable(str)) {
9659         unsigned char c;
9660         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9661     }
9662     else {
9663         char *tp;
9664
9665         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9666             unsigned int c = rb_enc_codepoint(tp, e, enc);
9667             if (c && !rb_isspace(c)) break;
9668             t = tp;
9669         }
9670     }
9671     return e - t;
9672 }
9673
9674 /*
9675  *  call-seq:
9676  *     str.rstrip!   -> self or nil
9677  *
9678  *  Removes trailing whitespace from the receiver.
9679  *  Returns the altered receiver, or +nil+ if no change was made.
9680  *  See also String#lstrip! and String#strip!.
9681  *
9682  *  Refer to String#strip for the definition of whitespace.
9683  *
9684  *     "  hello  ".rstrip!  #=> "  hello"
9685  *     "  hello".rstrip!    #=> nil
9686  *     "hello".rstrip!      #=> nil
9687  */
9688
9689 static VALUE
9690 rb_str_rstrip_bang(VALUE str)
9691 {
9692     rb_encoding *enc;
9693     char *start;
9694     long olen, roffset;
9695
9696     str_modify_keep_cr(str);
9697     enc = STR_ENC_GET(str);
9698     RSTRING_GETMEM(str, start, olen);
9699     roffset = rstrip_offset(str, start, start+olen, enc);
9700     if (roffset > 0) {
9701         long len = olen - roffset;
9702
9703         STR_SET_LEN(str, len);
9704         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9705         return str;
9706     }
9707     return Qnil;
9708 }
9709
9710
9711 /*
9712  *  call-seq:
9713  *     str.rstrip   -> new_str
9714  *
9715  *  Returns a copy of the receiver with trailing whitespace removed.
9716  *  See also String#lstrip and String#strip.
9717  *
9718  *  Refer to String#strip for the definition of whitespace.
9719  *
9720  *     "  hello  ".rstrip   #=> "  hello"
9721  *     "hello".rstrip       #=> "hello"
9722  */
9723
9724 static VALUE
9725 rb_str_rstrip(VALUE str)
9726 {
9727     rb_encoding *enc;
9728     char *start;
9729     long olen, roffset;
9730
9731     enc = STR_ENC_GET(str);
9732     RSTRING_GETMEM(str, start, olen);
9733     roffset = rstrip_offset(str, start, start+olen, enc);
9734
9735     if (roffset <= 0) return str_duplicate(rb_cString, str);
9736     return rb_str_subseq(str, 0, olen-roffset);
9737 }
9738
9739
9740 /*
9741  *  call-seq:
9742  *     str.strip!   -> self or nil
9743  *
9744  *  Removes leading and trailing whitespace from the receiver.
9745  *  Returns the altered receiver, or +nil+ if there was no change.
9746  *
9747  *  Refer to String#strip for the definition of whitespace.
9748  *
9749  *     "  hello  ".strip!  #=> "hello"
9750  *     "hello".strip!      #=> nil
9751  */
9752
9753 static VALUE
9754 rb_str_strip_bang(VALUE str)
9755 {
9756     char *start;
9757     long olen, loffset, roffset;
9758     rb_encoding *enc;
9759
9760     str_modify_keep_cr(str);
9761     enc = STR_ENC_GET(str);
9762     RSTRING_GETMEM(str, start, olen);
9763     loffset = lstrip_offset(str, start, start+olen, enc);
9764     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9765
9766     if (loffset > 0 || roffset > 0) {
9767         long len = olen-roffset;
9768         if (loffset > 0) {
9769             len -= loffset;
9770             memmove(start, start + loffset, len);
9771         }
9772         STR_SET_LEN(str, len);
9773         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9774         return str;
9775     }
9776     return Qnil;
9777 }
9778
9779
9780 /*
9781  *  call-seq:
9782  *     str.strip   -> new_str
9783  *
9784  *  Returns a copy of the receiver with leading and trailing whitespace removed.
9785  *
9786  *  Whitespace is defined as any of the following characters:
9787  *  null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9788  *
9789  *     "    hello    ".strip   #=> "hello"
9790  *     "\tgoodbye\r\n".strip   #=> "goodbye"
9791  *     "\x00\t\n\v\f\r ".strip #=> ""
9792  *     "hello".strip           #=> "hello"
9793  */
9794
9795 static VALUE
9796 rb_str_strip(VALUE str)
9797 {
9798     char *start;
9799     long olen, loffset, roffset;
9800     rb_encoding *enc = STR_ENC_GET(str);
9801
9802     RSTRING_GETMEM(str, start, olen);
9803     loffset = lstrip_offset(str, start, start+olen, enc);
9804     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9805
9806     if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9807     return rb_str_subseq(str, loffset, olen-loffset-roffset);
9808 }
9809
/*
 * Performs one step of String#scan: searches str for pat beginning at byte
 * offset *start and, on success, advances *start past the match.
 *
 * Returns Qnil when rb_pat_search() reports no match (negative position).
 * Otherwise returns the matched substring when pat is a String or the
 * match has a single register, or else an array with one entry per
 * register 1..num_regs-1 (nil where the register's BEG is negative).
 *
 * set_backref_str is handed to rb_pat_search() unchanged.
 */
static VALUE
scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
{
    VALUE result, match;
    struct re_registers *regs;
    int i;
    long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
    if (pos >= 0) {
        if (BUILTIN_TYPE(pat) == T_STRING) {
            /* plain string pattern: no registers, the match is pat itself */
            regs = NULL;
            end = pos + RSTRING_LEN(pat);
        }
        else {
            /* otherwise take the match boundaries from the backref's
             * registers (BEG/END read from regs) */
            match = rb_backref_get();
            regs = RMATCH_REGS(match);
            pos = BEG(0);
            end = END(0);
        }
        if (pos == end) {
            rb_encoding *enc = STR_ENC_GET(str);
            /*
             * Always consume at least one character of the input string
             */
            if (RSTRING_LEN(str) > end)
                *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
                                                  RSTRING_END(str), enc);
            else
                *start = end + 1;
        }
        else {
            *start = end;
        }
        if (!regs || regs->num_regs == 1) {
            /* only the whole match is available: return it as a string */
            result = rb_str_subseq(str, pos, end - pos);
            return result;
        }
        /* several registers: collect 1..num_regs-1, skipping register 0 */
        result = rb_ary_new2(regs->num_regs);
        for (i=1; i < regs->num_regs; i++) {
            VALUE s = Qnil;
            if (BEG(i) >= 0) {
                s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
            }
            rb_ary_push(result, s);
        }

        return result;
    }
    return Qnil;
}
9859
9860
9861 /*
9862  *  call-seq:
9863  *     str.scan(pattern)                         -> array
9864  *     str.scan(pattern) {|match, ...| block }   -> str
9865  *
9866  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
9867  *  Regexp or a String). For each match, a result is
9868  *  generated and either added to the result array or passed to the block. If
9869  *  the pattern contains no groups, each individual result consists of the
9870  *  matched string, <code>$&</code>.  If the pattern contains groups, each
9871  *  individual result is itself an array containing one entry per group.
9872  *
9873  *     a = "cruel world"
9874  *     a.scan(/\w+/)        #=> ["cruel", "world"]
9875  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
9876  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
9877  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
9878  *
9879  *  And the block form:
9880  *
9881  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
9882  *     print "\n"
9883  *     a.scan(/(.)(.)/) {|x,y| print y, x }
9884  *     print "\n"
9885  *
9886  *  <em>produces:</em>
9887  *
9888  *     <<cruel>> <<world>>
9889  *     rceu lowlr
9890  */
9891
static VALUE
rb_str_scan(VALUE str, VALUE pat)
{
    VALUE result;
    long start = 0;
    /* prev: search offset to be used for the next scan_once call;
     * last: search offset from which the most recent match was found
     * (-1 until there has been a match) */
    long last = -1, prev = 0;
    /* buffer pointer and length captured up front for str_mod_check below */
    char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);

    pat = get_pat_quoted(pat, 1);
    /* NOTE(review): presumably raises for a string with a broken byte
     * sequence -- mustnot_broken is defined elsewhere */
    mustnot_broken(str);
    if (!rb_block_given_p()) {
        /* array form: collect every result */
        VALUE ary = rb_ary_new();

        while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
            last = prev;
            prev = start;
            rb_ary_push(ary, result);
        }
        /* search once more from the offset of the final match, this time
         * with set_backref_str = 1; with no match at all the backref is
         * reset to nil */
        if (last >= 0) rb_pat_search(pat, str, last, 1);
        else rb_backref_set(Qnil);
        return ary;
    }

    /* block form: yield every result */
    while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
        last = prev;
        prev = start;
        rb_yield(result);
        /* NOTE(review): presumably raises if the block changed str's buffer
         * or length -- str_mod_check is defined elsewhere */
        str_mod_check(str, p, len);
    }
    /* search once more from the offset of the final match */
    if (last >= 0) rb_pat_search(pat, str, last, 1);
    return str;
}
9924
9925
9926 /*
9927  *  call-seq:
9928  *     str.hex   -> integer
9929  *
9930  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
9931  *  (with an optional sign and an optional <code>0x</code>) and returns the
9932  *  corresponding number. Zero is returned on error.
9933  *
9934  *     "0x0a".hex     #=> 10
9935  *     "-1234".hex    #=> -4660
9936  *     "0".hex        #=> 0
9937  *     "wombat".hex   #=> 0
9938  */
9939
9940 static VALUE
9941 rb_str_hex(VALUE str)
9942 {
9943     return rb_str_to_inum(str, 16, FALSE);
9944 }
9945
9946
9947 /*
9948  *  call-seq:
9949  *     str.oct   -> integer
9950  *
9951  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
9952  *  optional sign) and returns the corresponding number.  Returns 0 if the
9953  *  conversion fails.
9954  *
9955  *     "123".oct       #=> 83
9956  *     "-377".oct      #=> -255
9957  *     "bad".oct       #=> 0
9958  *     "0377bad".oct   #=> 255
9959  *
9960  *  If +str+ starts with <code>0</code>, radix indicators are honored.
9961  *  See Kernel#Integer.
9962  */
9963
9964 static VALUE
9965 rb_str_oct(VALUE str)
9966 {
9967     return rb_str_to_inum(str, -8, FALSE);
9968 }
9969
9970 #ifndef HAVE_CRYPT_R
9971 # include "ruby/thread_native.h"
9972 # include "ruby/atomic.h"
9973
/*
 * File-local lock taken around the call to crypt(3) in rb_str_crypt().
 * This section is compiled only when HAVE_CRYPT_R is not defined.
 *
 * `initialized` is a small state value driven by crypt_mutex_initialize()
 * and crypt_mutex_destroy():
 *   0 - lock not set up,
 *   2 - set-up in progress,
 *   1 - lock ready for use.
 */
static struct {
    rb_atomic_t initialized;
    rb_nativethread_lock_t lock;
} crypt_mutex;
9978
/*
 * Destroys crypt_mutex.lock and resets the state to 0 (not set up).
 * Registered with atexit() by crypt_mutex_initialize(); asserts that the
 * lock had reached the ready state (1) before tearing it down.
 */
static void
crypt_mutex_destroy(void)
{
    RUBY_ASSERT_ALWAYS(crypt_mutex.initialized == 1);
    rb_nativethread_lock_destroy(&crypt_mutex.lock);
    crypt_mutex.initialized = 0;
}
9986
/*
 * Sets up crypt_mutex.lock on first use and returns once the lock is ready.
 * The caller that moves the state 0 -> 2 performs the set-up and then
 * publishes state 1; callers that observe 2 spin until it changes.
 * Any other observed state is reported through rb_bug().
 */
static void
crypt_mutex_initialize(void)
{
    rb_atomic_t i;
    /* Try to claim the set-up (0 -> 2); `i` receives the value seen by the
     * compare-and-swap.  Busy-wait while another caller is in state 2. */
    while ((i = RUBY_ATOMIC_CAS(crypt_mutex.initialized, 0, 2)) == 2);
    switch (i) {
      case 0:
        /* this caller claimed the set-up */
        rb_nativethread_lock_initialize(&crypt_mutex.lock);
        atexit(crypt_mutex_destroy);
        RUBY_ASSERT(crypt_mutex.initialized == 2);
        /* publish "ready" (2 -> 1) */
        RUBY_ATOMIC_CAS(crypt_mutex.initialized, 2, 1);
        break;
      case 1:
        /* already set up; nothing to do */
        break;
      default:
        rb_bug("crypt_mutex.initialized: %d->%d", i, crypt_mutex.initialized);
    }
}
10005 #endif
10006
10007 /*
10008  *  call-seq:
10009  *     str.crypt(salt_str)   -> new_str
10010  *
10011  *  Returns the string generated by calling <code>crypt(3)</code>
10012  *  standard library function with <code>str</code> and
10013  *  <code>salt_str</code>, in this order, as its arguments.  Please do
10014  *  not use this method any longer.  It is legacy; provided only for
10015  *  backward compatibility with ruby scripts in earlier days.  It is
10016  *  bad to use in contemporary programs for several reasons:
10017  *
 *  * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
 *    run on.  The generated string lacks data portability.
10020  *
10021  *  * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10022  *    (i.e. silently ends up in unexpected results).
10023  *
10024  *  * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10025  *    thread safe.
10026  *
10027  *  * So-called "traditional" usage of <code>crypt(3)</code> is very
10028  *    very very weak.  According to its manpage, Linux's traditional
10029  *    <code>crypt(3)</code> output has only 2**56 variations; too
10030  *    easy to brute force today.  And this is the default behaviour.
10031  *
10032  *  * In order to make things robust some OSes implement so-called
10033  *    "modular" usage. To go through, you have to do a complex
10034  *    build-up of the <code>salt_str</code> parameter, by hand.
10035  *    Failure in generation of a proper salt string tends not to
10036  *    yield any errors; typos in parameters are normally not
10037  *    detectable.
10038  *
10039  *    * For instance, in the following example, the second invocation
10040  *      of String#crypt is wrong; it has a typo in "round=" (lacks
10041  *      "s").  However the call does not fail and something unexpected
10042  *      is generated.
10043  *
10044  *         "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10045  *         "foo".crypt("$5$round=1000$salt$")  # Typo not detected
10046  *
10047  *  * Even in the "modular" mode, some hash functions are considered
10048  *    archaic and no longer recommended at all; for instance module
10049  *    <code>$1$</code> is officially abandoned by its author: see
10050  *    http://phk.freebsd.dk/sagas/md5crypt_eol/ .  For another
10051  *    instance module <code>$3$</code> is considered completely
10052  *    broken: see the manpage of FreeBSD.
10053  *
10054  *  * On some OS such as Mac OS, there is no modular mode. Yet, as
10055  *    written above, <code>crypt(3)</code> on Mac OS never fails.
10056  *    This means even if you build up a proper salt string it
10057  *    generates a traditional DES hash anyways, and there is no way
10058  *    for you to be aware of.
10059  *
10060  *        "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10061  *
10062  *  If for some reason you cannot migrate to other secure contemporary
10063  *  password hashing algorithms, install the string-crypt gem and
10064  *  <code>require 'string/crypt'</code> to continue using it.
10065  */
10066
static VALUE
rb_str_crypt(VALUE str, VALUE salt)
{
    /*
     * Two build variants share this body:
     *  - HAVE_CRYPT_R: crypt_r() is called with a per-call `struct crypt_data`
     *    obtained through ALLOCV; CRYPT_END() releases that buffer.
     *  - otherwise: crypt() is called while holding crypt_mutex.lock;
     *    CRYPT_END() releases the lock.
     */
#ifdef HAVE_CRYPT_R
    VALUE databuf;
    struct crypt_data *data;
#   define CRYPT_END() ALLOCV_END(databuf)
#else
    extern char *crypt(const char *, const char *);
#   define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
#endif
    VALUE result;
    const char *s, *saltp;
    char *res;
#ifdef BROKEN_CRYPT
    char salt_8bit_clean[3];
#endif

    StringValue(salt);
    /* NOTE(review): presumably rejects wide-character encodings, which
     * cannot be handed to crypt(3) as C strings -- confirm in mustnot_wchar */
    mustnot_wchar(str);
    mustnot_wchar(salt);
    s = StringValueCStr(str);
    saltp = RSTRING_PTR(salt);
    /* the salt must provide at least two non-NUL leading bytes */
    if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
        rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
    }

#ifdef BROKEN_CRYPT
    /* When either of the first two salt bytes is non-ASCII, use a private
     * two-byte salt with the high bit of each byte cleared instead. */
    if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
        salt_8bit_clean[0] = saltp[0] & 0x7f;
        salt_8bit_clean[1] = saltp[1] & 0x7f;
        salt_8bit_clean[2] = '\0';
        saltp = salt_8bit_clean;
    }
#endif
#ifdef HAVE_CRYPT_R
    data = ALLOCV(databuf, sizeof(struct crypt_data));
# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
    data->initialized = 0;
# endif
    res = crypt_r(s, saltp, data);
#else
    crypt_mutex_initialize();
    rb_nativethread_lock_lock(&crypt_mutex.lock);
    res = crypt(s, saltp);
#endif
    if (!res) {
        /* capture errno before the clean-up call has a chance to change it */
        int err = errno;
        CRYPT_END();
        rb_syserr_fail(err, "crypt");
    }
    /* copy the result into a Ruby string before releasing the buffer/lock */
    result = rb_str_new_cstr(res);
    CRYPT_END();
    return result;
}
10122
10123
10124 /*
10125  *  call-seq:
10126  *     str.ord   -> integer
10127  *
10128  *  Returns the Integer ordinal of a one-character string.
10129  *
10130  *     "a".ord         #=> 97
10131  */
10132
10133 static VALUE
10134 rb_str_ord(VALUE s)
10135 {
10136     unsigned int c;
10137
10138     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10139     return UINT2NUM(c);
10140 }
10141 /*
10142  *  call-seq:
10143  *     str.sum(n=16)   -> integer
10144  *
10145  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10146  *  where <em>n</em> is the optional Integer parameter, defaulting
10147  *  to 16. The result is simply the sum of the binary value of each byte in
 *  <i>str</i> modulo <code>2**n</code>. This is not a particularly good
10149  *  checksum.
10150  */
10151
10152 static VALUE
10153 rb_str_sum(int argc, VALUE *argv, VALUE str)
10154 {
10155     int bits = 16;
10156     char *ptr, *p, *pend;
10157     long len;
10158     VALUE sum = INT2FIX(0);
10159     unsigned long sum0 = 0;
10160
10161     if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10162         bits = 0;
10163     }
10164     ptr = p = RSTRING_PTR(str);
10165     len = RSTRING_LEN(str);
10166     pend = p + len;
10167
10168     while (p < pend) {
10169         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10170             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10171             str_mod_check(str, ptr, len);
10172             sum0 = 0;
10173         }
10174         sum0 += (unsigned char)*p;
10175         p++;
10176     }
10177
10178     if (bits == 0) {
10179         if (sum0) {
10180             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10181         }
10182     }
10183     else {
10184         if (sum == INT2FIX(0)) {
10185             if (bits < (int)sizeof(long)*CHAR_BIT) {
10186                 sum0 &= (((unsigned long)1)<<bits)-1;
10187             }
10188             sum = LONG2FIX(sum0);
10189         }
10190         else {
10191             VALUE mod;
10192
10193             if (sum0) {
10194                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10195             }
10196
10197             mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10198             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10199             sum = rb_funcall(sum, '&', 1, mod);
10200         }
10201     }
10202     return sum;
10203 }
10204
10205 static VALUE
10206 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10207 {
10208     rb_encoding *enc;
10209     VALUE w;
10210     long width, len, flen = 1, fclen = 1;
10211     VALUE res;
10212     char *p;
10213     const char *f = " ";
10214     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10215     VALUE pad;
10216     int singlebyte = 1, cr;
10217     int termlen;
10218
10219     rb_scan_args(argc, argv, "11", &w, &pad);
10220     enc = STR_ENC_GET(str);
10221     termlen = rb_enc_mbminlen(enc);
10222     width = NUM2LONG(w);
10223     if (argc == 2) {
10224         StringValue(pad);
10225         enc = rb_enc_check(str, pad);
10226         f = RSTRING_PTR(pad);
10227         flen = RSTRING_LEN(pad);
10228         fclen = str_strlen(pad, enc); /* rb_enc_check */
10229         singlebyte = single_byte_optimizable(pad);
10230         if (flen == 0 || fclen == 0) {
10231             rb_raise(rb_eArgError, "zero width padding");
10232         }
10233     }
10234     len = str_strlen(str, enc); /* rb_enc_check */
10235     if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10236     n = width - len;
10237     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10238     rlen = n - llen;
10239     cr = ENC_CODERANGE(str);
10240     if (flen > 1) {
10241        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10242        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10243     }
10244     size = RSTRING_LEN(str);
10245     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10246        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10247        (len += llen2 + rlen2) >= LONG_MAX - size) {
10248        rb_raise(rb_eArgError, "argument too big");
10249     }
10250     len += size;
10251     res = str_new0(rb_cString, 0, len, termlen);
10252     p = RSTRING_PTR(res);
10253     if (flen <= 1) {
10254        memset(p, *f, llen);
10255        p += llen;
10256     }
10257     else {
10258        while (llen >= fclen) {
10259             memcpy(p,f,flen);
10260             p += flen;
10261             llen -= fclen;
10262         }
10263        if (llen > 0) {
10264            memcpy(p, f, llen2);
10265            p += llen2;
10266         }
10267     }
10268     memcpy(p, RSTRING_PTR(str), size);
10269     p += size;
10270     if (flen <= 1) {
10271        memset(p, *f, rlen);
10272        p += rlen;
10273     }
10274     else {
10275        while (rlen >= fclen) {
10276             memcpy(p,f,flen);
10277             p += flen;
10278             rlen -= fclen;
10279         }
10280        if (rlen > 0) {
10281            memcpy(p, f, rlen2);
10282            p += rlen2;
10283         }
10284     }
10285     TERM_FILL(p, termlen);
10286     STR_SET_LEN(res, p-RSTRING_PTR(res));
10287     rb_enc_associate(res, enc);
10288     if (argc == 2)
10289         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10290     if (cr != ENC_CODERANGE_BROKEN)
10291         ENC_CODERANGE_SET(res, cr);
10292
10293     RB_GC_GUARD(pad);
10294     return res;
10295 }
10296
10297
10298 /*
10299  *  call-seq:
10300  *     str.ljust(integer, padstr=' ')   -> new_str
10301  *
10302  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10303  *  String of length <i>integer</i> with <i>str</i> left justified
10304  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10305  *
10306  *     "hello".ljust(4)            #=> "hello"
10307  *     "hello".ljust(20)           #=> "hello               "
10308  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
10309  */
10310
10311 static VALUE
10312 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10313 {
10314     return rb_str_justify(argc, argv, str, 'l');
10315 }
10316
10317
10318 /*
10319  *  call-seq:
10320  *     str.rjust(integer, padstr=' ')   -> new_str
10321  *
10322  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10323  *  String of length <i>integer</i> with <i>str</i> right justified
10324  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10325  *
10326  *     "hello".rjust(4)            #=> "hello"
10327  *     "hello".rjust(20)           #=> "               hello"
10328  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
10329  */
10330
10331 static VALUE
10332 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10333 {
10334     return rb_str_justify(argc, argv, str, 'r');
10335 }
10336
10337
10338 /*
10339  *  call-seq:
10340  *     str.center(width, padstr=' ')   -> new_str
10341  *
10342  *  Centers +str+ in +width+.  If +width+ is greater than the length of +str+,
10343  *  returns a new String of length +width+ with +str+ centered and padded with
10344  *  +padstr+; otherwise, returns +str+.
10345  *
10346  *     "hello".center(4)         #=> "hello"
10347  *     "hello".center(20)        #=> "       hello        "
10348  *     "hello".center(20, '123') #=> "1231231hello12312312"
10349  */
10350
10351 static VALUE
10352 rb_str_center(int argc, VALUE *argv, VALUE str)
10353 {
10354     return rb_str_justify(argc, argv, str, 'c');
10355 }
10356
10357 /*
10358  *  call-seq:
10359  *     str.partition(sep)              -> [head, sep, tail]
10360  *     str.partition(regexp)           -> [head, match, tail]
10361  *
10362  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10363  *  and returns the part before it, the match, and the part
10364  *  after it.
 *  If it is not found, returns <i>str</i> followed by two empty strings.
10366  *
10367  *     "hello".partition("l")         #=> ["he", "l", "lo"]
10368  *     "hello".partition("x")         #=> ["hello", "", ""]
10369  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
10370  */
10371
10372 static VALUE
10373 rb_str_partition(VALUE str, VALUE sep)
10374 {
10375     long pos;
10376
10377     sep = get_pat_quoted(sep, 0);
10378     if (RB_TYPE_P(sep, T_REGEXP)) {
10379         if (rb_reg_search(sep, str, 0, 0) < 0) {
10380             goto failed;
10381         }
10382         VALUE match = rb_backref_get();
10383         struct re_registers *regs = RMATCH_REGS(match);
10384
10385         pos = BEG(0);
10386         sep = rb_str_subseq(str, pos, END(0) - pos);
10387     }
10388     else {
10389         pos = rb_str_index(str, sep, 0);
10390         if (pos < 0) goto failed;
10391     }
10392     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10393                           sep,
10394                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
10395                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10396
10397   failed:
10398     return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10399 }
10400
10401 /*
10402  *  call-seq:
10403  *     str.rpartition(sep)             -> [head, sep, tail]
10404  *     str.rpartition(regexp)          -> [head, match, tail]
10405  *
10406  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10407  *  of the string, and returns the part before it, the match, and the part
10408  *  after it.
10409  *  If it is not found, returns two empty strings and <i>str</i>.
10410  *
10411  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
10412  *     "hello".rpartition("x")         #=> ["", "", "hello"]
10413  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
10414  *
10415  *  The match from the end means starting at the possible last position, not
10416  *  the last of longest matches.
10417  *
10418  *     "hello".rpartition(/l+/)        #=> ["hel", "l", "o"]
10419  *
 *  To partition at the last longest match, combine the pattern with a
 *  negative lookbehind.
10422  *
10423  *     "hello".rpartition(/(?<!l)l+/)  #=> ["he", "ll", "o"]
10424  *
 *  Or use String#partition with a negative lookahead.
10426  *
10427  *     "hello".partition(/l+(?!.*l)/)  #=> ["he", "ll", "o"]
10428  */
10429
10430 static VALUE
10431 rb_str_rpartition(VALUE str, VALUE sep)
10432 {
10433     long pos = RSTRING_LEN(str);
10434
10435     sep = get_pat_quoted(sep, 0);
10436     if (RB_TYPE_P(sep, T_REGEXP)) {
10437         if (rb_reg_search(sep, str, pos, 1) < 0) {
10438             goto failed;
10439         }
10440         VALUE match = rb_backref_get();
10441         struct re_registers *regs = RMATCH_REGS(match);
10442
10443         pos = BEG(0);
10444         sep = rb_str_subseq(str, pos, END(0) - pos);
10445     }
10446     else {
10447         pos = rb_str_sublen(str, pos);
10448         pos = rb_str_rindex(str, sep, pos);
10449         if (pos < 0) {
10450             goto failed;
10451         }
10452         pos = rb_str_offset(str, pos);
10453     }
10454
10455     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10456                           sep,
10457                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
10458                                         RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10459   failed:
10460     return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10461 }
10462
10463 /*
10464  *  call-seq:
10465  *     str.start_with?([prefixes]+)   -> true or false
10466  *
10467  *  Returns true if +str+ starts with one of the +prefixes+ given.
10468  *  Each of the +prefixes+ should be a String or a Regexp.
10469  *
10470  *    "hello".start_with?("hell")               #=> true
10471  *    "hello".start_with?(/H/i)                 #=> true
10472  *
10473  *    # returns true if one of the prefixes matches.
10474  *    "hello".start_with?("heaven", "hell")     #=> true
10475  *    "hello".start_with?("heaven", "paradise") #=> false
10476  */
10477
10478 static VALUE
10479 rb_str_start_with(int argc, VALUE *argv, VALUE str)
10480 {
10481     int i;
10482
10483     for (i=0; i<argc; i++) {
10484         VALUE tmp = argv[i];
10485         if (RB_TYPE_P(tmp, T_REGEXP)) {
10486             if (rb_reg_start_with_p(tmp, str))
10487                 return Qtrue;
10488         }
10489         else {
10490             StringValue(tmp);
10491             rb_enc_check(str, tmp);
10492             if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10493             if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10494                 return Qtrue;
10495         }
10496     }
10497     return Qfalse;
10498 }
10499
10500 /*
10501  *  call-seq:
10502  *     str.end_with?([suffixes]+)   -> true or false
10503  *
10504  *  Returns true if +str+ ends with one of the +suffixes+ given.
10505  *
10506  *    "hello".end_with?("ello")               #=> true
10507  *
10508  *    # returns true if one of the +suffixes+ matches.
10509  *    "hello".end_with?("heaven", "ello")     #=> true
10510  *    "hello".end_with?("heaven", "paradise") #=> false
10511  */
10512
10513 static VALUE
10514 rb_str_end_with(int argc, VALUE *argv, VALUE str)
10515 {
10516     int i;
10517     char *p, *s, *e;
10518     rb_encoding *enc;
10519
10520     for (i=0; i<argc; i++) {
10521         VALUE tmp = argv[i];
10522         long slen, tlen;
10523         StringValue(tmp);
10524         enc = rb_enc_check(str, tmp);
10525         if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10526         if ((slen = RSTRING_LEN(str)) < tlen) continue;
10527         p = RSTRING_PTR(str);
10528         e = p + slen;
10529         s = e - tlen;
10530         if (rb_enc_left_char_head(p, s, e, enc) != s)
10531             continue;
10532         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10533             return Qtrue;
10534     }
10535     return Qfalse;
10536 }
10537
10538 /*!
10539  * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10540  * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10541  *
10542  * @param str the target
10543  * @param prefix the prefix
10544  * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10545  * @retval Positive-Integer otherwise
10546  */
10547 static long
10548 deleted_prefix_length(VALUE str, VALUE prefix)
10549 {
10550     char *strptr, *prefixptr;
10551     long olen, prefixlen;
10552
10553     StringValue(prefix);
10554     if (is_broken_string(prefix)) return 0;
10555     rb_enc_check(str, prefix);
10556
10557     /* return 0 if not start with prefix */
10558     prefixlen = RSTRING_LEN(prefix);
10559     if (prefixlen <= 0) return 0;
10560     olen = RSTRING_LEN(str);
10561     if (olen < prefixlen) return 0;
10562     strptr = RSTRING_PTR(str);
10563     prefixptr = RSTRING_PTR(prefix);
10564     if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10565
10566     return prefixlen;
10567 }
10568
10569 /*
10570  *  call-seq:
10571  *     str.delete_prefix!(prefix) -> self or nil
10572  *
10573  *  Deletes leading <code>prefix</code> from <i>str</i>, returning
10574  *  <code>nil</code> if no change was made.
10575  *
10576  *     "hello".delete_prefix!("hel") #=> "lo"
10577  *     "hello".delete_prefix!("llo") #=> nil
10578  */
10579
10580 static VALUE
10581 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10582 {
10583     long prefixlen;
10584     str_modify_keep_cr(str);
10585
10586     prefixlen = deleted_prefix_length(str, prefix);
10587     if (prefixlen <= 0) return Qnil;
10588
10589     return rb_str_drop_bytes(str, prefixlen);
10590 }
10591
10592 /*
10593  *  call-seq:
10594  *     str.delete_prefix(prefix) -> new_str
10595  *
10596  *  Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10597  *
10598  *     "hello".delete_prefix("hel") #=> "lo"
10599  *     "hello".delete_prefix("llo") #=> "hello"
10600  */
10601
10602 static VALUE
10603 rb_str_delete_prefix(VALUE str, VALUE prefix)
10604 {
10605     long prefixlen;
10606
10607     prefixlen = deleted_prefix_length(str, prefix);
10608     if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10609
10610     return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10611 }
10612
10613 /*!
10614  * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10615  * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10616  *
10617  * @param str the target
10618  * @param suffix the suffix
10619  * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10620  * @retval Positive-Integer otherwise
10621  */
10622 static long
10623 deleted_suffix_length(VALUE str, VALUE suffix)
10624 {
10625     char *strptr, *suffixptr, *s;
10626     long olen, suffixlen;
10627     rb_encoding *enc;
10628
10629     StringValue(suffix);
10630     if (is_broken_string(suffix)) return 0;
10631     enc = rb_enc_check(str, suffix);
10632
10633     /* return 0 if not start with suffix */
10634     suffixlen = RSTRING_LEN(suffix);
10635     if (suffixlen <= 0) return 0;
10636     olen = RSTRING_LEN(str);
10637     if (olen < suffixlen) return 0;
10638     strptr = RSTRING_PTR(str);
10639     suffixptr = RSTRING_PTR(suffix);
10640     s = strptr + olen - suffixlen;
10641     if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10642     if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10643
10644     return suffixlen;
10645 }
10646
10647 /*
10648  *  call-seq:
10649  *     str.delete_suffix!(suffix) -> self or nil
10650  *
10651  *  Deletes trailing <code>suffix</code> from <i>str</i>, returning
10652  *  <code>nil</code> if no change was made.
10653  *
10654  *     "hello".delete_suffix!("llo") #=> "he"
10655  *     "hello".delete_suffix!("hel") #=> nil
10656  */
10657
10658 static VALUE
10659 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10660 {
10661     long olen, suffixlen, len;
10662     str_modifiable(str);
10663
10664     suffixlen = deleted_suffix_length(str, suffix);
10665     if (suffixlen <= 0) return Qnil;
10666
10667     olen = RSTRING_LEN(str);
10668     str_modify_keep_cr(str);
10669     len = olen - suffixlen;
10670     STR_SET_LEN(str, len);
10671     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10672     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10673         ENC_CODERANGE_CLEAR(str);
10674     }
10675     return str;
10676 }
10677
10678 /*
10679  *  call-seq:
10680  *     str.delete_suffix(suffix) -> new_str
10681  *
10682  *  Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10683  *
10684  *     "hello".delete_suffix("llo") #=> "he"
10685  *     "hello".delete_suffix("hel") #=> "hello"
10686  */
10687
10688 static VALUE
10689 rb_str_delete_suffix(VALUE str, VALUE suffix)
10690 {
10691     long suffixlen;
10692
10693     suffixlen = deleted_suffix_length(str, suffix);
10694     if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10695
10696     return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10697 }
10698
10699 void
10700 rb_str_setter(VALUE val, ID id, VALUE *var)
10701 {
10702     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10703         rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10704     }
10705     *var = val;
10706 }
10707
10708 static void
10709 rb_fs_setter(VALUE val, ID id, VALUE *var)
10710 {
10711     val = rb_fs_check(val);
10712     if (!val) {
10713         rb_raise(rb_eTypeError,
10714                  "value of %"PRIsVALUE" must be String or Regexp",
10715                  rb_id2str(id));
10716     }
10717     if (!NIL_P(val)) {
10718         rb_warn_deprecated("`$;'", NULL);
10719     }
10720     *var = val;
10721 }
10722
10723
10724 /*
10725  *  call-seq:
10726  *     str.force_encoding(encoding)   -> str
10727  *
10728  *  Changes the encoding to +encoding+ and returns self.
10729  */
10730
10731 static VALUE
10732 rb_str_force_encoding(VALUE str, VALUE enc)
10733 {
10734     str_modifiable(str);
10735     rb_enc_associate(str, rb_to_encoding(enc));
10736     ENC_CODERANGE_CLEAR(str);
10737     return str;
10738 }
10739
10740 /*
10741  *  call-seq:
10742  *     str.b   -> str
10743  *
10744  *  Returns a copied string whose encoding is ASCII-8BIT.
10745  */
10746
10747 static VALUE
10748 rb_str_b(VALUE str)
10749 {
10750     VALUE str2;
10751     if (FL_TEST(str, STR_NOEMBED)) {
10752         str2 = str_alloc_heap(rb_cString);
10753     }
10754     else {
10755         str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10756     }
10757     str_replace_shared_without_enc(str2, str);
10758     ENC_CODERANGE_CLEAR(str2);
10759     return str2;
10760 }
10761
10762 /*
10763  *  call-seq:
10764  *     str.valid_encoding?  -> true or false
10765  *
10766  *  Returns true for a string which is encoded correctly.
10767  *
10768  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
10769  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
10770  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
10771  */
10772
10773 static VALUE
10774 rb_str_valid_encoding_p(VALUE str)
10775 {
10776     int cr = rb_enc_str_coderange(str);
10777
10778     return RBOOL(cr != ENC_CODERANGE_BROKEN);
10779 }
10780
10781 /*
10782  *  call-seq:
10783  *     str.ascii_only?  -> true or false
10784  *
10785  *  Returns true for a string which has only ASCII characters.
10786  *
10787  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
10788  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
10789  */
10790
10791 static VALUE
10792 rb_str_is_ascii_only_p(VALUE str)
10793 {
10794     int cr = rb_enc_str_coderange(str);
10795
10796     return RBOOL(cr == ENC_CODERANGE_7BIT);
10797 }
10798
10799 VALUE
10800 rb_str_ellipsize(VALUE str, long len)
10801 {
10802     static const char ellipsis[] = "...";
10803     const long ellipsislen = sizeof(ellipsis) - 1;
10804     rb_encoding *const enc = rb_enc_get(str);
10805     const long blen = RSTRING_LEN(str);
10806     const char *const p = RSTRING_PTR(str), *e = p + blen;
10807     VALUE estr, ret = 0;
10808
10809     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10810     if (len * rb_enc_mbminlen(enc) >= blen ||
10811         (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10812         ret = str;
10813     }
10814     else if (len <= ellipsislen ||
10815              !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10816         if (rb_enc_asciicompat(enc)) {
10817             ret = rb_str_new(ellipsis, len);
10818             rb_enc_associate(ret, enc);
10819         }
10820         else {
10821             estr = rb_usascii_str_new(ellipsis, len);
10822             ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10823         }
10824     }
10825     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10826         rb_str_cat(ret, ellipsis, ellipsislen);
10827     }
10828     else {
10829         estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10830                              rb_enc_from_encoding(enc), 0, Qnil);
10831         rb_str_append(ret, estr);
10832     }
10833     return ret;
10834 }
10835
10836 static VALUE
10837 str_compat_and_valid(VALUE str, rb_encoding *enc)
10838 {
10839     int cr;
10840     str = StringValue(str);
10841     cr = rb_enc_str_coderange(str);
10842     if (cr == ENC_CODERANGE_BROKEN) {
10843         rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10844     }
10845     else {
10846         rb_encoding *e = STR_ENC_GET(str);
10847         if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10848             rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10849                      rb_enc_name(enc), rb_enc_name(e));
10850         }
10851     }
10852     return str;
10853 }
10854
/* Forward declaration: the definition follows the two public wrappers
 * (rb_str_scrub and rb_enc_str_scrub) that call it. */
static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10856
10857 VALUE
10858 rb_str_scrub(VALUE str, VALUE repl)
10859 {
10860     rb_encoding *enc = STR_ENC_GET(str);
10861     return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10862 }
10863
10864 VALUE
10865 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10866 {
10867     int cr = ENC_CODERANGE_UNKNOWN;
10868     if (enc == STR_ENC_GET(str)) {
10869         /* cached coderange makes sense only when enc equals the
10870          * actual encoding of str */
10871         cr = ENC_CODERANGE(str);
10872     }
10873     return enc_str_scrub(enc, str, repl, cr);
10874 }
10875
10876 static VALUE
10877 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10878 {
10879     int encidx;
10880     VALUE buf = Qnil;
10881     const char *rep, *p, *e, *p1, *sp;
10882     long replen = -1;
10883     long slen;
10884
10885     if (rb_block_given_p()) {
10886         if (!NIL_P(repl))
10887             rb_raise(rb_eArgError, "both of block and replacement given");
10888         replen = 0;
10889     }
10890
10891     if (ENC_CODERANGE_CLEAN_P(cr))
10892         return Qnil;
10893
10894     if (!NIL_P(repl)) {
10895         repl = str_compat_and_valid(repl, enc);
10896     }
10897
10898     if (rb_enc_dummy_p(enc)) {
10899         return Qnil;
10900     }
10901     encidx = rb_enc_to_index(enc);
10902
10903 #define DEFAULT_REPLACE_CHAR(str) do { \
10904         static const char replace[sizeof(str)-1] = str; \
10905         rep = replace; replen = (int)sizeof(replace); \
10906     } while (0)
10907
10908     slen = RSTRING_LEN(str);
10909     p = RSTRING_PTR(str);
10910     e = RSTRING_END(str);
10911     p1 = p;
10912     sp = p;
10913
10914     if (rb_enc_asciicompat(enc)) {
10915         int rep7bit_p;
10916         if (!replen) {
10917             rep = NULL;
10918             rep7bit_p = FALSE;
10919         }
10920         else if (!NIL_P(repl)) {
10921             rep = RSTRING_PTR(repl);
10922             replen = RSTRING_LEN(repl);
10923             rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10924         }
10925         else if (encidx == rb_utf8_encindex()) {
10926             DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10927             rep7bit_p = FALSE;
10928         }
10929         else {
10930             DEFAULT_REPLACE_CHAR("?");
10931             rep7bit_p = TRUE;
10932         }
10933         cr = ENC_CODERANGE_7BIT;
10934
10935         p = search_nonascii(p, e);
10936         if (!p) {
10937             p = e;
10938         }
10939         while (p < e) {
10940             int ret = rb_enc_precise_mbclen(p, e, enc);
10941             if (MBCLEN_NEEDMORE_P(ret)) {
10942                 break;
10943             }
10944             else if (MBCLEN_CHARFOUND_P(ret)) {
10945                 cr = ENC_CODERANGE_VALID;
10946                 p += MBCLEN_CHARFOUND_LEN(ret);
10947             }
10948             else if (MBCLEN_INVALID_P(ret)) {
10949                 /*
10950                  * p1~p: valid ascii/multibyte chars
10951                  * p ~e: invalid bytes + unknown bytes
10952                  */
10953                 long clen = rb_enc_mbmaxlen(enc);
10954                 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10955                 if (p > p1) {
10956                     rb_str_buf_cat(buf, p1, p - p1);
10957                 }
10958
10959                 if (e - p < clen) clen = e - p;
10960                 if (clen <= 2) {
10961                     clen = 1;
10962                 }
10963                 else {
10964                     const char *q = p;
10965                     clen--;
10966                     for (; clen > 1; clen--) {
10967                         ret = rb_enc_precise_mbclen(q, q + clen, enc);
10968                         if (MBCLEN_NEEDMORE_P(ret)) break;
10969                         if (MBCLEN_INVALID_P(ret)) continue;
10970                         UNREACHABLE;
10971                     }
10972                 }
10973                 if (rep) {
10974                     rb_str_buf_cat(buf, rep, replen);
10975                     if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10976                 }
10977                 else {
10978                     repl = rb_yield(rb_enc_str_new(p, clen, enc));
10979                     str_mod_check(str, sp, slen);
10980                     repl = str_compat_and_valid(repl, enc);
10981                     rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10982                     if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10983                         cr = ENC_CODERANGE_VALID;
10984                 }
10985                 p += clen;
10986                 p1 = p;
10987                 p = search_nonascii(p, e);
10988                 if (!p) {
10989                     p = e;
10990                     break;
10991                 }
10992             }
10993             else {
10994                 UNREACHABLE;
10995             }
10996         }
10997         if (NIL_P(buf)) {
10998             if (p == e) {
10999                 ENC_CODERANGE_SET(str, cr);
11000                 return Qnil;
11001             }
11002             buf = rb_str_buf_new(RSTRING_LEN(str));
11003         }
11004         if (p1 < p) {
11005             rb_str_buf_cat(buf, p1, p - p1);
11006         }
11007         if (p < e) {
11008             if (rep) {
11009                 rb_str_buf_cat(buf, rep, replen);
11010                 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11011             }
11012             else {
11013                 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11014                 str_mod_check(str, sp, slen);
11015                 repl = str_compat_and_valid(repl, enc);
11016                 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11017                 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11018                     cr = ENC_CODERANGE_VALID;
11019             }
11020         }
11021     }
11022     else {
11023         /* ASCII incompatible */
11024         long mbminlen = rb_enc_mbminlen(enc);
11025         if (!replen) {
11026             rep = NULL;
11027         }
11028         else if (!NIL_P(repl)) {
11029             rep = RSTRING_PTR(repl);
11030             replen = RSTRING_LEN(repl);
11031         }
11032         else if (encidx == ENCINDEX_UTF_16BE) {
11033             DEFAULT_REPLACE_CHAR("\xFF\xFD");
11034         }
11035         else if (encidx == ENCINDEX_UTF_16LE) {
11036             DEFAULT_REPLACE_CHAR("\xFD\xFF");
11037         }
11038         else if (encidx == ENCINDEX_UTF_32BE) {
11039             DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11040         }
11041         else if (encidx == ENCINDEX_UTF_32LE) {
11042             DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11043         }
11044         else {
11045             DEFAULT_REPLACE_CHAR("?");
11046         }
11047
11048         while (p < e) {
11049             int ret = rb_enc_precise_mbclen(p, e, enc);
11050             if (MBCLEN_NEEDMORE_P(ret)) {
11051                 break;
11052             }
11053             else if (MBCLEN_CHARFOUND_P(ret)) {
11054                 p += MBCLEN_CHARFOUND_LEN(ret);
11055             }
11056             else if (MBCLEN_INVALID_P(ret)) {
11057                 const char *q = p;
11058                 long clen = rb_enc_mbmaxlen(enc);
11059                 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11060                 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11061
11062                 if (e - p < clen) clen = e - p;
11063                 if (clen <= mbminlen * 2) {
11064                     clen = mbminlen;
11065                 }
11066                 else {
11067                     clen -= mbminlen;
11068                     for (; clen > mbminlen; clen-=mbminlen) {
11069                         ret = rb_enc_precise_mbclen(q, q + clen, enc);
11070                         if (MBCLEN_NEEDMORE_P(ret)) break;
11071                         if (MBCLEN_INVALID_P(ret)) continue;
11072                         UNREACHABLE;
11073                     }
11074                 }
11075                 if (rep) {
11076                     rb_str_buf_cat(buf, rep, replen);
11077                 }
11078                 else {
11079                     repl = rb_yield(rb_enc_str_new(p, clen, enc));
11080                     str_mod_check(str, sp, slen);
11081                     repl = str_compat_and_valid(repl, enc);
11082                     rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11083                 }
11084                 p += clen;
11085                 p1 = p;
11086             }
11087             else {
11088                 UNREACHABLE;
11089             }
11090         }
11091         if (NIL_P(buf)) {
11092             if (p == e) {
11093                 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
11094                 return Qnil;
11095             }
11096             buf = rb_str_buf_new(RSTRING_LEN(str));
11097         }
11098         if (p1 < p) {
11099             rb_str_buf_cat(buf, p1, p - p1);
11100         }
11101         if (p < e) {
11102             if (rep) {
11103                 rb_str_buf_cat(buf, rep, replen);
11104             }
11105             else {
11106                 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11107                 str_mod_check(str, sp, slen);
11108                 repl = str_compat_and_valid(repl, enc);
11109                 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11110             }
11111         }
11112         cr = ENC_CODERANGE_VALID;
11113     }
11114     ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11115     return buf;
11116 }
11117
11118 /*
11119  *  call-seq:
11120  *    str.scrub -> new_str
11121  *    str.scrub(repl) -> new_str
11122  *    str.scrub{|bytes|} -> new_str
11123  *
 *  If the string contains invalid byte sequences, replaces the invalid bytes with the given
 *  replacement string; otherwise returns a copy of self.
11126  *  If block is given, replace invalid bytes with returned value of the block.
11127  *
11128  *     "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11129  *     "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11130  *     "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11131  */
11132 static VALUE
11133 str_scrub(int argc, VALUE *argv, VALUE str)
11134 {
11135     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11136     VALUE new = rb_str_scrub(str, repl);
11137     return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11138 }
11139
11140 /*
11141  *  call-seq:
11142  *    str.scrub! -> str
11143  *    str.scrub!(repl) -> str
11144  *    str.scrub!{|bytes|} -> str
11145  *
 *  If the string contains invalid byte sequences, replaces the invalid bytes in place with the
 *  given replacement string; otherwise leaves the string unchanged. Returns self in both cases.
11148  *  If block is given, replace invalid bytes with returned value of the block.
11149  *
11150  *     "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11151  *     "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11152  *     "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11153  */
11154 static VALUE
11155 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11156 {
11157     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11158     VALUE new = rb_str_scrub(str, repl);
11159     if (!NIL_P(new)) rb_str_replace(str, new);
11160     return str;
11161 }
11162
11163 static ID id_normalize;
11164 static ID id_normalized_p;
11165 static VALUE mUnicodeNormalize;
11166
11167 static VALUE
11168 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11169 {
11170     static int UnicodeNormalizeRequired = 0;
11171     VALUE argv2[2];
11172
11173     if (!UnicodeNormalizeRequired) {
11174         rb_require("unicode_normalize/normalize.rb");
11175         UnicodeNormalizeRequired = 1;
11176     }
11177     argv2[0] = str;
11178     if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11179     return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11180 }
11181
11182 /*
11183  *  call-seq:
11184  *    str.unicode_normalize(form=:nfc)
11185  *
11186  *  Unicode Normalization---Returns a normalized form of +str+,
11187  *  using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11188  *  The normalization form used is determined by +form+, which can
11189  *  be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11190  *  The default is +:nfc+.
11191  *
11192  *  If the string is not in a Unicode Encoding, then an Exception is raised.
11193  *  In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11194  *  and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11195  *  Anything other than UTF-8 is implemented by converting to UTF-8,
11196  *  which makes it slower than UTF-8.
11197  *
11198  *    "a\u0300".unicode_normalize        #=> "\u00E0"
11199  *    "a\u0300".unicode_normalize(:nfc)  #=> "\u00E0"
11200  *    "\u00E0".unicode_normalize(:nfd)   #=> "a\u0300"
11201  *    "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11202  *                                       #=> Encoding::CompatibilityError raised
11203  */
11204 static VALUE
11205 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11206 {
11207     return unicode_normalize_common(argc, argv, str, id_normalize);
11208 }
11209
11210 /*
11211  *  call-seq:
11212  *    str.unicode_normalize!(form=:nfc)
11213  *
11214  *  Destructive version of String#unicode_normalize, doing Unicode
11215  *  normalization in place.
11216  */
11217 static VALUE
11218 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11219 {
11220     return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11221 }
11222
11223 /*  call-seq:
11224  *    str.unicode_normalized?(form=:nfc)
11225  *
11226  *  Checks whether +str+ is in Unicode normalization form +form+,
11227  *  which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11228  *  The default is +:nfc+.
11229  *
11230  *  If the string is not in a Unicode Encoding, then an Exception is raised.
11231  *  For details, see String#unicode_normalize.
11232  *
11233  *    "a\u0300".unicode_normalized?        #=> false
11234  *    "a\u0300".unicode_normalized?(:nfd)  #=> true
11235  *    "\u00E0".unicode_normalized?         #=> true
11236  *    "\u00E0".unicode_normalized?(:nfd)   #=> false
11237  *    "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11238  *                                         #=> Encoding::CompatibilityError raised
11239  */
11240 static VALUE
11241 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11242 {
11243     return unicode_normalize_common(argc, argv, str, id_normalized_p);
11244 }
11245
11246 /**********************************************************************
11247  * Document-class: Symbol
11248  *
11249  * Symbol objects represent named identifiers inside the Ruby interpreter.
11250  *
11251  * You can create a \Symbol object explicitly with:
11252  *
11253  * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11254  *
11255  * The same Symbol object will be
11256  * created for a given name or string for the duration of a program's
11257  * execution, regardless of the context or meaning of that name. Thus
11258  * if <code>Fred</code> is a constant in one context, a method in
11259  * another, and a class in a third, the Symbol <code>:Fred</code>
11260  * will be the same object in all three contexts.
11261  *
11262  *     module One
11263  *       class Fred
11264  *       end
11265  *       $f1 = :Fred
11266  *     end
11267  *     module Two
11268  *       Fred = 1
11269  *       $f2 = :Fred
11270  *     end
11271  *     def Fred()
11272  *     end
11273  *     $f3 = :Fred
11274  *     $f1.object_id   #=> 2514190
11275  *     $f2.object_id   #=> 2514190
11276  *     $f3.object_id   #=> 2514190
11277  *
11278  * Constant, method, and variable names are returned as symbols:
11279  *
11280  *     module One
11281  *       Two = 2
11282  *       def three; 3 end
11283  *       @four = 4
11284  *       @@five = 5
11285  *       $six = 6
11286  *     end
11287  *     seven = 7
11288  *
11289  *     One.constants
11290  *     # => [:Two]
11291  *     One.instance_methods(true)
11292  *     # => [:three]
11293  *     One.instance_variables
11294  *     # => [:@four]
11295  *     One.class_variables
11296  *     # => [:@@five]
11297  *     global_variables.grep(/six/)
11298  *     # => [:$six]
11299  *     local_variables
11300  *     # => [:seven]
11301  *
11302  * Symbol objects are different from String objects in that
11303  * Symbol objects represent identifiers, while String objects
11304  * represent text or data.
11305  *
11306  * == What's Here
11307  *
11308  * First, what's elsewhere. \Class \Symbol:
11309  *
11310  * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11311  * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11312  *
11313  * Here, class \Symbol provides methods that are useful for:
11314  *
11315  * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11316  * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11317  * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11318  *
11319  * === Methods for Querying
11320  *
11321  * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11322  * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11323  *                          in symbol that matches a given Regexp
11324  *                          or other object; returns +nil+ if no match is found.
11325  * - #[], #slice :: Returns a substring of symbol
11326  *                  determined by a given index, start/length, or range, or string.
11327  * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11328  * - #encoding:: Returns the Encoding object that represents the encoding
11329  *               of symbol.
11330  * - #end_with?:: Returns +true+ if symbol ends with
11331  *                any of the given strings.
11332  * - #match:: Returns a MatchData object if symbol
11333  *            matches a given Regexp; +nil+ otherwise.
11334  * - #match?:: Returns +true+ if symbol
11335  *             matches a given Regexp; +false+ otherwise.
11336  * - #length, #size:: Returns the number of characters in symbol.
11337  * - #start_with?:: Returns +true+ if symbol starts with
11338  *                  any of the given strings.
11339  *
11340  * === Methods for Comparing
11341  *
11342  * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given symbol is smaller than, equal to, or larger than symbol.
11343  * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11344  *                                  has the same content and encoding.
11345  * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
11346  *              symbol is smaller than, equal to, or larger than symbol.
11347  * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11348  *               after Unicode case folding; +false+ otherwise.
11349  *
11350  * === Methods for Converting
11351  *
11352  * - #capitalize:: Returns symbol with the first character upcased
11353  *                 and all other characters downcased.
11354  * - #downcase:: Returns symbol with all characters downcased.
11355  * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11356  * - #name:: Returns the frozen string corresponding to symbol.
11357  * - #succ, #next:: Returns the symbol that is the successor to symbol.
11358  * - #swapcase:: Returns symbol with all upcase characters downcased
11359  *               and all downcase characters upcased.
11360  * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11361  * - #to_s, #id2name:: Returns the string corresponding to +self+.
11362  * - #to_sym, #intern:: Returns +self+.
11363  * - #upcase:: Returns symbol with all characters upcased.
11364  *
11365  */
11366
11367
11368 /*
11369  *  call-seq:
11370  *     sym == obj   -> true or false
11371  *
11372  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11373  *  symbol, returns <code>true</code>.
11374  */
11375
11376 #define sym_equal rb_obj_equal
11377
11378 static int
11379 sym_printable(const char *s, const char *send, rb_encoding *enc)
11380 {
11381     while (s < send) {
11382         int n;
11383         int c = rb_enc_precise_mbclen(s, send, enc);
11384
11385         if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11386         n = MBCLEN_CHARFOUND_LEN(c);
11387         c = rb_enc_mbc_to_codepoint(s, send, enc);
11388         if (!rb_enc_isprint(c, enc)) return FALSE;
11389         s += n;
11390     }
11391     return TRUE;
11392 }
11393
11394 int
11395 rb_str_symname_p(VALUE sym)
11396 {
11397     rb_encoding *enc;
11398     const char *ptr;
11399     long len;
11400     rb_encoding *resenc = rb_default_internal_encoding();
11401
11402     if (resenc == NULL) resenc = rb_default_external_encoding();
11403     enc = STR_ENC_GET(sym);
11404     ptr = RSTRING_PTR(sym);
11405     len = RSTRING_LEN(sym);
11406     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11407         !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11408         return FALSE;
11409     }
11410     return TRUE;
11411 }
11412
11413 VALUE
11414 rb_str_quote_unprintable(VALUE str)
11415 {
11416     rb_encoding *enc;
11417     const char *ptr;
11418     long len;
11419     rb_encoding *resenc;
11420
11421     Check_Type(str, T_STRING);
11422     resenc = rb_default_internal_encoding();
11423     if (resenc == NULL) resenc = rb_default_external_encoding();
11424     enc = STR_ENC_GET(str);
11425     ptr = RSTRING_PTR(str);
11426     len = RSTRING_LEN(str);
11427     if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11428         !sym_printable(ptr, ptr + len, enc)) {
11429         return rb_str_escape(str);
11430     }
11431     return str;
11432 }
11433
11434 MJIT_FUNC_EXPORTED VALUE
11435 rb_id_quote_unprintable(ID id)
11436 {
11437     VALUE str = rb_id2str(id);
11438     if (!rb_str_symname_p(str)) {
11439         return rb_str_escape(str);
11440     }
11441     return str;
11442 }
11443
11444 /*
11445  *  call-seq:
11446  *     sym.inspect    -> string
11447  *
11448  *  Returns the representation of <i>sym</i> as a symbol literal.
11449  *
11450  *     :fred.inspect   #=> ":fred"
11451  */
11452
11453 static VALUE
11454 sym_inspect(VALUE sym)
11455 {
11456     VALUE str = rb_sym2str(sym);
11457     const char *ptr;
11458     long len;
11459     char *dest;
11460
11461     if (!rb_str_symname_p(str)) {
11462         str = rb_str_inspect(str);
11463         len = RSTRING_LEN(str);
11464         rb_str_resize(str, len + 1);
11465         dest = RSTRING_PTR(str);
11466         memmove(dest + 1, dest, len);
11467     }
11468     else {
11469         rb_encoding *enc = STR_ENC_GET(str);
11470         RSTRING_GETMEM(str, ptr, len);
11471         str = rb_enc_str_new(0, len + 1, enc);
11472         dest = RSTRING_PTR(str);
11473         memcpy(dest + 1, ptr, len);
11474     }
11475     dest[0] = ':';
11476     return str;
11477 }
11478
11479 #if 0 /* for RDoc */
11480 /*
11481  *  call-seq:
11482  *     sym.name   -> string
11483  *
11484  *  Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11485  *  returned string is frozen.
11486  *
11487  *     :fred.name         #=> "fred"
11488  *     :fred.name.frozen? #=> true
11489  *     :fred.to_s         #=> "fred"
11490  *     :fred.to_s.frozen? #=> false
11491  */
11492 VALUE
11493 rb_sym2str(VALUE sym)
11494 {
11495
11496 }
11497 #endif
11498
11499
11500 /*
11501  *  call-seq:
11502  *     sym.id2name   -> string
11503  *     sym.to_s      -> string
11504  *
11505  *  Returns the name or string corresponding to <i>sym</i>.
11506  *
11507  *     :fred.id2name   #=> "fred"
11508  *     :ginger.to_s    #=> "ginger"
11509  *
11510  *  Note that this string is not frozen (unlike the symbol itself).
11511  *  To get a frozen string, use #name.
11512  */
11513
11514
11515 VALUE
11516 rb_sym_to_s(VALUE sym)
11517 {
11518     return str_new_shared(rb_cString, rb_sym2str(sym));
11519 }
11520
11521
11522 /*
11523  * call-seq:
11524  *   sym.to_sym   -> sym
11525  *   sym.intern   -> sym
11526  *
11527  * In general, <code>to_sym</code> returns the Symbol corresponding
11528  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
11529  * in this case.
11530  */
11531
11532 static VALUE
11533 sym_to_sym(VALUE sym)
11534 {
11535     return sym;
11536 }
11537
11538 MJIT_FUNC_EXPORTED VALUE
11539 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11540 {
11541     VALUE obj;
11542
11543     if (argc < 1) {
11544         rb_raise(rb_eArgError, "no receiver given");
11545     }
11546     obj = argv[0];
11547     return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11548 }
11549
11550 #if 0
11551 /*
11552  * call-seq:
11553  *   sym.to_proc
11554  *
11555  * Returns a _Proc_ object which responds to the given method by _sym_.
11556  *
11557  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
11558  */
11559
11560 VALUE
11561 rb_sym_to_proc(VALUE sym)
11562 {
11563 }
11564 #endif
11565
11566 /*
11567  * call-seq:
11568  *
11569  *   sym.succ
11570  *
11571  * Same as <code>sym.to_s.succ.intern</code>.
11572  */
11573
11574 static VALUE
11575 sym_succ(VALUE sym)
11576 {
11577     return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11578 }
11579
11580 /*
11581  * call-seq:
11582  *
11583  *   symbol <=> other_symbol       -> -1, 0, +1, or nil
11584  *
11585  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11586  * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11587  * less than, equal to, or greater than +other_symbol+.
11588  *
11589  * +nil+ is returned if the two values are incomparable.
11590  *
11591  * See String#<=> for more information.
11592  */
11593
11594 static VALUE
11595 sym_cmp(VALUE sym, VALUE other)
11596 {
11597     if (!SYMBOL_P(other)) {
11598         return Qnil;
11599     }
11600     return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11601 }
11602
11603 /*
11604  *  call-seq:
11605  *    casecmp(other_symbol) -> -1, 0, 1, or nil
11606  *
11607  *  Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11608  *
11609  *    :aBcDeF.casecmp(:abcde)   # => 1
11610  *    :aBcDeF.casecmp(:abcdef)  # => 0
11611  *    :aBcDeF.casecmp(:abcdefg) # => -1
11612  *    :abcdef.casecmp(:ABCDEF)  # => 0
11613  *
11614  *  Returns +nil+ if the two symbols have incompatible encodings,
11615  *  or if +other_symbol+ is not a symbol:
11616  *
11617  *    sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11618  *    other_sym = :"\u{c4 d6 dc}"
11619  *    sym.casecmp(other_sym) # => nil
11620  *    :foo.casecmp(2)        # => nil
11621  *
11622  *  Currently, case-insensitivity only works on characters A-Z/a-z,
11623  *  not all of Unicode. This is different from Symbol#casecmp?.
11624  *
11625  *  Related: Symbol#casecmp?.
11626  *
11627  */
11628
11629 static VALUE
11630 sym_casecmp(VALUE sym, VALUE other)
11631 {
11632     if (!SYMBOL_P(other)) {
11633         return Qnil;
11634     }
11635     return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11636 }
11637
11638 /*
11639  *  call-seq:
11640  *    casecmp?(other_symbol) -> true, false, or nil
11641  *
11642  *  Returns +true+ if +sym+ and +other_symbol+ are equal after
11643  *  Unicode case folding, +false+ if they are not equal:
11644  *
11645  *    :aBcDeF.casecmp?(:abcde)                  # => false
11646  *    :aBcDeF.casecmp?(:abcdef)                 # => true
11647  *    :aBcDeF.casecmp?(:abcdefg)                # => false
11648  *    :abcdef.casecmp?(:ABCDEF)                 # => true
11649  *    :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11650  *
11651  *  Returns +nil+ if the two symbols have incompatible encodings,
11652  *  or if +other_symbol+ is not a symbol:
11653  *
11654  *    sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11655  *    other_sym = :"\u{c4 d6 dc}"
11656  *    sym.casecmp?(other_sym) # => nil
11657  *    :foo.casecmp?(2)        # => nil
11658  *
11659  *  See {Case Mapping}[doc/case_mapping_rdoc.html].
11660  *
11661  *  Related: Symbol#casecmp.
11662  *
11663  */
11664
11665 static VALUE
11666 sym_casecmp_p(VALUE sym, VALUE other)
11667 {
11668     if (!SYMBOL_P(other)) {
11669         return Qnil;
11670     }
11671     return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11672 }
11673
11674 /*
11675  * call-seq:
11676  *   sym =~ obj   -> integer or nil
11677  *
11678  * Returns <code>sym.to_s =~ obj</code>.
11679  */
11680
11681 static VALUE
11682 sym_match(VALUE sym, VALUE other)
11683 {
11684     return rb_str_match(rb_sym2str(sym), other);
11685 }
11686
11687 /*
11688  * call-seq:
11689  *   sym.match(pattern)        -> matchdata or nil
11690  *   sym.match(pattern, pos)   -> matchdata or nil
11691  *
11692  * Returns <code>sym.to_s.match</code>.
11693  */
11694
11695 static VALUE
11696 sym_match_m(int argc, VALUE *argv, VALUE sym)
11697 {
11698     return rb_str_match_m(argc, argv, rb_sym2str(sym));
11699 }
11700
11701 /*
11702  * call-seq:
11703  *   sym.match?(pattern)        -> true or false
11704  *   sym.match?(pattern, pos)   -> true or false
11705  *
11706  * Returns <code>sym.to_s.match?</code>.
11707  */
11708
11709 static VALUE
11710 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11711 {
11712     return rb_str_match_m_p(argc, argv, sym);
11713 }
11714
11715 /*
11716  * call-seq:
11717  *   sym[idx]      -> char
11718  *   sym[b, n]     -> string
11719  *   sym.slice(idx)      -> char
11720  *   sym.slice(b, n)     -> string
11721  *
11722  * Returns <code>sym.to_s[]</code>.
11723  */
11724
11725 static VALUE
11726 sym_aref(int argc, VALUE *argv, VALUE sym)
11727 {
11728     return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11729 }
11730
11731 /*
11732  * call-seq:
11733  *   sym.length   -> integer
11734  *   sym.size     -> integer
11735  *
11736  * Same as <code>sym.to_s.length</code>.
11737  */
11738
11739 static VALUE
11740 sym_length(VALUE sym)
11741 {
11742     return rb_str_length(rb_sym2str(sym));
11743 }
11744
11745 /*
11746  * call-seq:
11747  *   sym.empty?   -> true or false
11748  *
11749  * Returns whether _sym_ is :"" or not.
11750  */
11751
11752 static VALUE
11753 sym_empty(VALUE sym)
11754 {
11755     return rb_str_empty(rb_sym2str(sym));
11756 }
11757
11758 /*
11759  *  call-seq:
11760  *    upcase(*options) -> symbol
11761  *
11762  *  Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11763  *
11764  *  See String#upcase.
11765  *
11766  */
11767
11768 static VALUE
11769 sym_upcase(int argc, VALUE *argv, VALUE sym)
11770 {
11771     return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11772 }
11773
11774 /*
11775  *  call-seq:
11776  *    downcase(*options) -> symbol
11777  *
11778  *  Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11779  *
11780  *  See String#downcase.
11781  *
11782  *  Related: Symbol#upcase.
11783  *
11784  */
11785
11786 static VALUE
11787 sym_downcase(int argc, VALUE *argv, VALUE sym)
11788 {
11789     return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11790 }
11791
11792 /*
11793  *  call-seq:
11794  *    capitalize(*options) -> symbol
11795  *
11796  *  Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11797  *
11798  *  See String#capitalize.
11799  *
11800  */
11801
11802 static VALUE
11803 sym_capitalize(int argc, VALUE *argv, VALUE sym)
11804 {
11805     return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11806 }
11807
11808 /*
11809  *  call-seq:
11810  *    swapcase(*options) -> symbol
11811  *
11812  *  Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11813  *
11814  *  See String#swapcase.
11815  *
11816  */
11817
11818 static VALUE
11819 sym_swapcase(int argc, VALUE *argv, VALUE sym)
11820 {
11821     return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11822 }
11823
11824 /*
11825  *  call-seq:
11826  *     sym.start_with?([prefixes]+)   -> true or false
11827  *
11828  *  Returns true if +sym+ starts with one of the +prefixes+ given.
11829  *  Each of the +prefixes+ should be a String or a Regexp.
11830  *
11831  *    :hello.start_with?("hell")               #=> true
11832  *    :hello.start_with?(/H/i)                 #=> true
11833  *
11834  *    # returns true if one of the prefixes matches.
11835  *    :hello.start_with?("heaven", "hell")     #=> true
11836  *    :hello.start_with?("heaven", "paradise") #=> false
11837  */
11838
11839 static VALUE
11840 sym_start_with(int argc, VALUE *argv, VALUE sym)
11841 {
11842     return rb_str_start_with(argc, argv, rb_sym2str(sym));
11843 }
11844
11845 /*
11846  *  call-seq:
11847  *     sym.end_with?([suffixes]+)   -> true or false
11848  *
11849  *  Returns true if +sym+ ends with one of the +suffixes+ given.
11850  *
11851  *    :hello.end_with?("ello")               #=> true
11852  *
11853  *    # returns true if one of the +suffixes+ matches.
11854  *    :hello.end_with?("heaven", "ello")     #=> true
11855  *    :hello.end_with?("heaven", "paradise") #=> false
11856  */
11857
11858 static VALUE
11859 sym_end_with(int argc, VALUE *argv, VALUE sym)
11860 {
11861     return rb_str_end_with(argc, argv, rb_sym2str(sym));
11862 }
11863
11864 /*
11865  * call-seq:
11866  *   sym.encoding   -> encoding
11867  *
11868  * Returns the Encoding object that represents the encoding of _sym_.
11869  */
11870
11871 static VALUE
11872 sym_encoding(VALUE sym)
11873 {
11874     return rb_obj_encoding(rb_sym2str(sym));
11875 }
11876
11877 static VALUE
11878 string_for_symbol(VALUE name)
11879 {
11880     if (!RB_TYPE_P(name, T_STRING)) {
11881         VALUE tmp = rb_check_string_type(name);
11882         if (NIL_P(tmp)) {
11883             rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11884                      name);
11885         }
11886         name = tmp;
11887     }
11888     return name;
11889 }
11890
11891 ID
11892 rb_to_id(VALUE name)
11893 {
11894     if (SYMBOL_P(name)) {
11895         return SYM2ID(name);
11896     }
11897     name = string_for_symbol(name);
11898     return rb_intern_str(name);
11899 }
11900
11901 VALUE
11902 rb_to_symbol(VALUE name)
11903 {
11904     if (SYMBOL_P(name)) {
11905         return name;
11906     }
11907     name = string_for_symbol(name);
11908     return rb_str_intern(name);
11909 }
11910
11911 /*
11912  *  call-seq:
11913  *     Symbol.all_symbols    => array
11914  *
11915  *  Returns an array of all the symbols currently in Ruby's symbol
11916  *  table.
11917  *
11918  *     Symbol.all_symbols.size    #=> 903
11919  *     Symbol.all_symbols[1,20]   #=> [:floor, :ARGV, :Binding, :symlink,
11920  *                                     :chown, :EOFError, :$;, :String,
11921  *                                     :LOCK_SH, :"setuid?", :$<,
11922  *                                     :default_proc, :compact, :extend,
11923  *                                     :Tms, :getwd, :$=, :ThreadGroup,
11924  *                                     :wait2, :$>]
11925  */
11926
11927 static VALUE
11928 sym_all_symbols(VALUE _)
11929 {
11930     return rb_sym_all_symbols();
11931 }
11932
11933 VALUE
11934 rb_str_to_interned_str(VALUE str)
11935 {
11936     return rb_fstring(str);
11937 }
11938
11939 VALUE
11940 rb_interned_str(const char *ptr, long len)
11941 {
11942     struct RString fake_str;
11943     return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11944 }
11945
11946 VALUE
11947 rb_interned_str_cstr(const char *ptr)
11948 {
11949     return rb_interned_str(ptr, strlen(ptr));
11950 }
11951
11952 VALUE
11953 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11954 {
11955     if (UNLIKELY(rb_enc_autoload_p(enc))) {
11956         rb_enc_autoload(enc);
11957     }
11958
11959     struct RString fake_str;
11960     return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11961 }
11962
11963 VALUE
11964 rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
11965 {
11966     return rb_enc_interned_str(ptr, strlen(ptr), enc);
11967 }
11968
11969 /*
11970  *  A \String object has an arbitrary sequence of bytes,
11971  *  typically representing text or binary data.
11972  *  A \String object may be created using String::new or as literals.
11973  *
11974  *  String objects differ from Symbol objects in that Symbol objects are
11975  *  designed to be used as identifiers, instead of text or data.
11976  *
11977  *  You can create a \String object explicitly with:
11978  *
11979  *  - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11980  *  - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11981  *
11982  *  You can convert certain objects to Strings with:
11983  *
11984  *  - \Method {String}[Kernel.html#method-i-String].
11985  *
11986  *  Some \String methods modify +self+.
11987  *  Typically, a method whose name ends with <tt>!</tt> modifies +self+
11988  *  and returns +self+;
11989  *  often a similarly named method (without the <tt>!</tt>)
11990  *  returns a new string.
11991  *
11992  *  In general, if both bang and non-bang versions of a method exist,
11993  *  the bang version mutates +self+ and the non-bang version does not.
11994  *  However, a method without a bang can also mutate, such as String#replace.
11995  *
11996  *  == Substitution Methods
11997  *
11998  *  These methods perform substitutions:
11999  *
12000  *  - String#sub: One substitution (or none); returns a new string.
12001  *  - String#sub!: One substitution (or none); returns +self+.
12002  *  - String#gsub: Zero or more substitutions; returns a new string.
12003  *  - String#gsub!: Zero or more substitutions; returns +self+.
12004  *
12005  *  Each of these methods takes:
12006  *
12007  *  - A first argument, +pattern+ (string or regexp),
12008  *    that specifies the substring(s) to be replaced.
12009  *
12010  *  - Either of these:
12011  *
12012  *    - A second argument, +replacement+ (string or hash),
12013  *      that determines the replacing string.
12014  *    - A block that will determine the replacing string.
12015  *
12016  *  The examples in this section mostly use methods String#sub and String#gsub;
12017  *  the principles illustrated apply to all four substitution methods.
12018  *
12019  *  <b>Argument +pattern+</b>
12020  *
12021  *  Argument +pattern+ is commonly a regular expression:
12022  *
12023  *    s = 'hello'
12024  *    s.sub(/[aeiou]/, '*')  # => "h*llo"
12025  *    s.gsub(/[aeiou]/, '*') # => "h*ll*"
12026  *    s.gsub(/[aeiou]/, '')  # => "hll"
12027  *    s.sub(/ell/, 'al')     # => "halo"
12028  *    s.gsub(/xyzzy/, '*')   # => "hello"
12029  *    'THX1138'.gsub(/\d+/, '00') # => "THX00"
12030  *
12031  *  When +pattern+ is a string, all its characters are treated
12032  *  as ordinary characters (not as regexp special characters):
12033  *
12034  *    'THX1138'.gsub('\d+', '00') # => "THX1138"
12035  *
12036  *  <b>\String +replacement+</b>
12037  *
12038  *  If +replacement+ is a string, that string will determine
12039  *  the replacing string that is to be substituted for the matched text.
12040  *
12041  *  Each of the examples above uses a simple string as the replacing string.
12042  *
12043  *  \String +replacement+ may contain back-references to the pattern's captures:
12044  *
12045  *  - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12046  *  - <tt>\k<name></tt> refers to the named capture +name+.
12047  *
12048  *  See rdoc-ref:regexp.rdoc for details.
12049  *
12050  *  Note that within the string +replacement+, a character combination
12051  *  such as <tt>$&</tt> is treated as ordinary text, and not as
12052  *  a special match variable.
12053  *  However, you may refer to some special match variables using these
12054  *  combinations:
12055  *
12056  *  - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12057  *    which contains the complete matched text.
12058  *  - <tt>\'</tt> corresponds to <tt>$'</tt>,
12059  *    which contains string after match.
12060  *  - <tt>\`</tt> corresponds to <tt>$`</tt>,
12061  *    which contains string before match.
12062  *  - <tt>\+</tt> corresponds to <tt>$+</tt>,
12063  *    which contains last capture group.
12064  *
12065  *  See rdoc-ref:regexp.rdoc for details.
12066  *
12067  *  Note that <tt>\\\\</tt> is interpreted as an escape, i.e., a single backslash.
12068  *
12069  *  Note also that a string literal consumes backslashes.
12070  *  See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12071  *
12072  *  A back-reference is typically preceded by an additional backslash.
12073  *  For example, if you want to write a back-reference <tt>\&</tt> in
12074  *  +replacement+ with a double-quoted string literal, you need to write
12075  *  <tt>"..\\\\&.."</tt>.
12076  *
12077  *  If you want to write a non-back-reference string <tt>\&</tt> in
12078  *  +replacement+, you need first to escape the backslash to prevent
12079  *  this method from interpreting it as a back-reference, and then you
12080  *  need to escape the backslashes again to prevent a string literal from
12081  *  consuming them: <tt>"..\\\\\\\\&.."</tt>.
12082  *
12083  *  You may want to use the block form to avoid a lot of backslashes.
12084  *
12085  *  <b>\Hash +replacement+</b>
12086  *
12087  *  If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12088  *  the replacing string is the value for that key:
12089  *
12090  *    h = {'foo' => 'bar', 'baz' => 'bat'}
12091  *    'food'.sub('foo', h) # => "bard"
12092  *
12093  *  Note that a symbol key does not match:
12094  *
12095  *    h = {foo: 'bar', baz: 'bat'}
12096  *    'food'.sub('foo', h) # => "d"
12097  *
12098  *  <b>Block</b>
12099  *
12100  *  In the block form, the current match string is passed to the block;
12101  *  the block's return value becomes the replacing string:
12102  *
12103  *    s = '@'
12104  *    '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12105  *
12106  *  Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12107  *  <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12108  *
12109  *
12110  *  == What's Here
12111  *
12112  *  First, what's elsewhere. \Class \String:
12113  *
12114  *  - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12115  *  - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12116  *
12117  *  Here, class \String provides methods that are useful for:
12118  *
12119  *  - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12120  *  - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12121  *  - {Querying}[#class-String-label-Methods+for+Querying]
12122  *  - {Comparing}[#class-String-label-Methods+for+Comparing]
12123  *  - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12124  *  - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12125  *  - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12126  *  - {Iterating}[#class-String-label-Methods+for+Iterating]
12127  *
12128  *  === Methods for Creating a \String
12129  *
12130  *  - ::new:: Returns a new string.
12131  *  - ::try_convert:: Returns a new string created from a given object.
12132  *
12133  *  === Methods for a Frozen/Unfrozen String
12134  *
12135  *  - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12136  *                                  +self+, if not frozen; +self.dup+ otherwise.
12137  *  - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12138  *                                  +self+, if already frozen; +self.freeze+ otherwise.
12139  *  - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12140  *
12141  *  === Methods for Querying
12142  *
12143  *  _Counts_
12144  *
12145  *  - #length, #size:: Returns the count of characters (not bytes).
12146  *  - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12147  *  - #bytesize:: Returns the count of bytes.
12148  *  - #count:: Returns the count of substrings matching given strings.
12149  *
12150  *  _Substrings_
12151  *
12152  *  - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12153  *                           returns +nil+ if no match is found.
12154  *  - #index:: Returns the index of the _first_ occurrence of a given substring;
12155  *             returns +nil+ if none found.
12156  *  - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12157  *              returns +nil+ if none found.
12158  *  - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12159  *  - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12160  *  - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12161  *  - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12162  *  - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12163  *
12164  *  _Encodings_
12165  *
12166  *  - #encoding:: Returns the Encoding object that represents the encoding of the string.
12167  *  - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12168  *  - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12169  *                       for its encoding.
12170  *  - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12171  *
12172  *  _Other_
12173  *
12174  *  - #sum:: Returns a basic checksum for the string: the sum of each byte.
12175  *  - #hash:: Returns the integer hash code.
12176  *
12177  *  === Methods for Comparing
12178  *
12179  *  - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12180  *  - #eql?:: Returns +true+ if the content is the same as the given other string.
12181  *  - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as +self+ is smaller than, equal to, or larger than a given other string.
12182  *  - #casecmp:: Ignoring case, returns -1, 0, or 1 as +self+
12183  *               is smaller than, equal to, or larger than a given other string.
12184  *  - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12185  *                +false+ otherwise.
12186  *
12187  *  === Methods for Modifying a \String
12188  *
12189  *  Each of these methods modifies +self+.
12190  *
12191  *  _Insertion_
12192  *
12193  *  - #insert:: Returns +self+ with a given string inserted at a given offset.
12194  *  - #<<:: Returns +self+ concatenated with a given string or integer.
12195  *
12196  *  _Substitution_
12197  *
12198  *  - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12199  *            returns +self+ if any changes, +nil+ otherwise.
12200  *  - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12201  *             returns +self+ if any changes, +nil+ otherwise.
12202  *  - #succ!, #next!:: Returns +self+ modified to become its own successor.
12203  *  - #replace:: Returns +self+ with its entire content replaced by a given string.
12204  *  - #reverse!:: Returns +self+ with its characters in reverse order.
12205  *  - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12206  *  - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12207  *           returns +self+ if any changes, +nil+ otherwise.
12208  *  - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12209  *             removing duplicates from the substrings that were modified;
12210  *             returns +self+ if any changes, +nil+ otherwise.
12211  *
12212  *  _Casing_
12213  *
12214  *  - #capitalize!:: Upcases the initial character and downcases all others;
12215  *                   returns +self+ if any changes, +nil+ otherwise.
12216  *  - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12217  *  - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12218  *  - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12219  *                 returns +self+ if any changes, +nil+ otherwise.
12220  *
12221  *  _Encoding_
12222  *
12223  *  - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12224  *  - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12225  *  - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12226  *  - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12227  *
12228  *  _Deletion_
12229  *
12230  *  - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12231  *  - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12232  *  - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12233  *  - #delete!:: Removes characters as determined by the intersection of substring arguments.
12234  *  - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12235  *  - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12236  *  - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12237  *  - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12238  *  - #chop!:: Removes trailing whitespace if found, otherwise removes the last character;
12239  *             returns +self+ if any changes, +nil+ otherwise.
12240  *
12241  *  === Methods for Converting to New \String
12242  *
12243  *  Each of these methods returns a new \String based on +self+,
12244  *  often just a modified copy of +self+.
12245  *
12246  *  _Extension_
12247  *
12248  *  - #*:: Returns the concatenation of multiple copies of +self+.
12249  *  - #+:: Returns the concatenation of +self+ and a given other string.
12250  *  - #center:: Returns a copy of +self+ centered between pad substrings.
12251  *  - #concat:: Returns the concatenation of +self+ with given other strings.
12252  *  - #prepend:: Returns the concatenation of a given other string with +self+.
12253  *  - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12254  *  - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12255  *
12256  *  _Encoding_
12257  *
12258  *  - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12259  *  - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12260  *  - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12261  *  - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12262  *
12263  *  _Substitution_
12264  *
12265  *  - #dump:: Returns a copy of +self+ with all non-printing characters replaced by <tt>\xHH</tt> notation
12266  *            and all special characters escaped.
12267  *  - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12268  *              and all escaped characters unescaped.
12269  *  - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12270  *           replaced with a given replacement string.
12271  *  - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12272  *            replaced with a given replacement string.
12273  *  - #succ, #next:: Returns the string that is the successor to +self+.
12274  *  - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12275  *  - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12276  *  - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12277  *            removing duplicates from the substrings that were modified.
12278  *  - #%:: Returns the string resulting from formatting a given object into +self+.
12279  *
12280  *  _Casing_
12281  *
12282  *  - #capitalize:: Returns a copy of +self+ with the first character upcased
12283  *                  and all other characters downcased.
12284  *  - #downcase:: Returns a copy of +self+ with all characters downcased.
12285  *  - #upcase:: Returns a copy of +self+ with all characters upcased.
12286  *  - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12287  *                and all downcase characters upcased.
12288  *
12289  *  _Deletion_
12290  *
12291  *  - #delete:: Returns a copy of +self+ with characters removed.
12292  *  - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12293  *  - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12294  *  - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12295  *  - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12296  *  - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12297  *  - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12298  *  - #chop:: Returns a copy of +self+ with trailing whitespace or the last character removed.
12299  *  - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12300  *  - #[], #slice:: Returns a substring determined by a given index, start/length, or range, or string.
12301  *  - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12302  *  - #chr:: Returns the first character.
12303  *
12304  *  _Duplication_
12305  *
12306  *  - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12307  *                     otherwise, returns +self+.
12308  *
12309  *  === Methods for Converting to Non-\String
12310  *
12311  *  Each of these methods converts the contents of +self+ to a non-\String.
12312  *
12313  *  <em>Characters, Bytes, and Clusters</em>
12314  *
12315  *  - #bytes:: Returns an array of the bytes in +self+.
12316  *  - #chars:: Returns an array of the characters in +self+.
12317  *  - #codepoints:: Returns an array of the integer ordinals in +self+.
12318  *  - #getbyte:: Returns an integer byte as determined by a given index.
12319  *  - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12320  *
12321  *  _Splitting_
12322  *
12323  *  - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12324  *  - #partition:: Returns a 3-element array determined by the first substring that matches
12325  *                 a given substring or regexp.
12326  *  - #rpartition:: Returns a 3-element array determined by the last substring that matches
12327  *                  a given substring or regexp.
12328  *  - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12329  *             or, if a block is given, passes those substrings to the block.
12330  *
12331  *  _Matching_
12332  *
12333  *  - #scan:: Returns an array of substrings matching a given regexp or string, or,
12334  *            if a block is given, passes each matching substring to the block.
12335  *  - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12336  *  - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12337  *
12338  *  _Numerics_
12339  *
12340  *  - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12341  *  - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12342  *  - #ord:: Returns the integer ordinal of the first character in +self+.
12343  *  - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12344  *  - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12345  *
12346  *  <em>Strings and Symbols</em>
12347  *
12348  *  - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12349  *  - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12350  *
12351  *  === Methods for Iterating
12352  *
12353  *  - #each_byte:: Calls the given block with each successive byte in +self+.
12354  *  - #each_char:: Calls the given block with each successive character in +self+.
12355  *  - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12356  *  - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12357  *  - #each_line:: Calls the given block with each successive line in +self+,
12358  *                 as determined by a given record separator.
12359  *  - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12360  */
12361
12362 void
12363 Init_String(void)
12364 {
12365     rb_cString  = rb_define_class("String", rb_cObject);
12366     assert(rb_vm_fstring_table());
12367     st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12368     rb_include_module(rb_cString, rb_mComparable);
12369     rb_define_alloc_func(rb_cString, empty_str_alloc);
12370     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12371     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12372     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12373     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12374     rb_define_method(rb_cString, "==", rb_str_equal, 1);
12375     rb_define_method(rb_cString, "===", rb_str_equal, 1);
12376     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12377     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12378     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12379     rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12380     rb_define_method(rb_cString, "+", rb_str_plus, 1);
12381     rb_define_method(rb_cString, "*", rb_str_times, 1);
12382     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12383     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12384     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12385     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12386     rb_define_method(rb_cString, "length", rb_str_length, 0);
12387     rb_define_method(rb_cString, "size", rb_str_length, 0);
12388     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12389     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12390     rb_define_method(rb_cString, "=~", rb_str_match, 1);
12391     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12392     rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12393     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
12394     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12395     rb_define_method(rb_cString, "next", rb_str_succ, 0);
12396     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12397     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12398     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12399     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12400     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12401     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12402     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12403     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12404     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12405     rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12406     rb_define_method(rb_cString, "scrub", str_scrub, -1);
12407     rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12408     rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12409     rb_define_method(rb_cString, "+@", str_uplus, 0);
12410     rb_define_method(rb_cString, "-@", str_uminus, 0);
12411
12412     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12413     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12414     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12415     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12416     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12417     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
12418     rb_define_method(rb_cString, "undump", str_undump, 0);
12419
12420     sym_ascii      = ID2SYM(rb_intern_const("ascii"));
12421     sym_turkic     = ID2SYM(rb_intern_const("turkic"));
12422     sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12423     sym_fold       = ID2SYM(rb_intern_const("fold"));
12424
12425     rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12426     rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12427     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12428     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12429
12430     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12431     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12432     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12433     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12434
12435     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12436     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12437     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12438     rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12439     rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12440     rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12441     rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12442     rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12443     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12444     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12445     rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12446     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12447     rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12448     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12449     rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12450     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12451     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12452
12453     rb_define_method(rb_cString, "include?", rb_str_include, 1);
12454     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12455     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12456
12457     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12458
12459     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12460     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12461     rb_define_method(rb_cString, "center", rb_str_center, -1);
12462
12463     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12464     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12465     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12466     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12467     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12468     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12469     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12470     rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12471     rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12472
12473     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12474     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12475     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12476     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12477     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12478     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12479     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12480     rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12481     rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12482
12483     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12484     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12485     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12486     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12487     rb_define_method(rb_cString, "count", rb_str_count, -1);
12488
12489     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12490     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12491     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12492     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12493
12494     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12495     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12496     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12497     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12498     rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12499
12500     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12501
12502     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12503     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12504
12505     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12506     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12507
12508     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12509     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12510     rb_define_method(rb_cString, "b", rb_str_b, 0);
12511     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12512     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12513
12514     /* define UnicodeNormalize module here so that we don't have to look it up */
12515     mUnicodeNormalize          = rb_define_module("UnicodeNormalize");
12516     id_normalize               = rb_intern_const("normalize");
12517     id_normalized_p            = rb_intern_const("normalized?");
12518
12519     rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12520     rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12521     rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12522
12523     rb_fs = Qnil;
12524     rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12525     rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12526     rb_gc_register_address(&rb_fs);
12527
12528     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12529     rb_include_module(rb_cSymbol, rb_mComparable);
12530     rb_undef_alloc_func(rb_cSymbol);
12531     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
12532     rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12533
12534     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12535     rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12536     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12537     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
12538     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12539     rb_define_method(rb_cSymbol, "name", rb_sym2str, 0);
12540     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
12541     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
12542     rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
12543     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12544     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12545
12546     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12547     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12548     rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12549     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12550
12551     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12552     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12553     rb_define_method(rb_cSymbol, "length", sym_length, 0);
12554     rb_define_method(rb_cSymbol, "size", sym_length, 0);
12555     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12556     rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12557     rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12558
12559     rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12560     rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12561     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12562     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12563
12564     rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12565     rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12566
12567     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12568 }