string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/internal/config.h"
  15
  16 #include <ctype.h>
  17 #include <errno.h>
  18 #include <math.h>
  19
  20 #ifdef HAVE_UNISTD_H
  21 # include <unistd.h>
  22 #endif
  23
  24 #include "debug_counter.h"
  25 #include "encindex.h"
  26 #include "gc.h"
  27 #include "id.h"
  28 #include "internal.h"
  29 #include "internal/array.h"
  30 #include "internal/compar.h"
  31 #include "internal/compilers.h"
  32 #include "internal/encoding.h"
  33 #include "internal/error.h"
  34 #include "internal/gc.h"
  35 #include "internal/numeric.h"
  36 #include "internal/object.h"
  37 #include "internal/proc.h"
  38 #include "internal/re.h"
  39 #include "internal/sanitizers.h"
  40 #include "internal/string.h"
  41 #include "internal/transcode.h"
  42 #include "probes.h"
  43 #include "ruby/encoding.h"
  44 #include "ruby/re.h"
  45 #include "ruby/util.h"
  46 #include "ruby_assert.h"
  47 #include "vm_sync.h"
  48
  49 #if defined HAVE_CRYPT_R
  50 # if defined HAVE_CRYPT_H
  51 #  include <crypt.h>
  52 # endif
  53 #elif !defined HAVE_CRYPT
  54 # include "missing/crypt.h"
  55 # define HAVE_CRYPT_R 1
  56 #endif
  57
/* Shorthand for the `no`-th entry of `regs->beg` / `regs->end`.  Both
 * macros expand to a member access on an identifier named `regs`, so a
 * variable of that name must be in scope at every use site. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
  60
  61 #undef rb_str_new
  62 #undef rb_usascii_str_new
  63 #undef rb_utf8_str_new
  64 #undef rb_enc_str_new
  65 #undef rb_str_new_cstr
  66 #undef rb_usascii_str_new_cstr
  67 #undef rb_utf8_str_new_cstr
  68 #undef rb_enc_str_new_cstr
  69 #undef rb_external_str_new_cstr
  70 #undef rb_locale_str_new_cstr
  71 #undef rb_str_dup_frozen
  72 #undef rb_str_buf_new_cstr
  73 #undef rb_str_buf_cat
  74 #undef rb_str_buf_cat2
  75 #undef rb_str_cat2
  76 #undef rb_str_cat_cstr
  77 #undef rb_fstring_cstr
  78
/* Global VALUE handles.  rb_cString is used throughout this file as the
 * class of newly created strings; both variables are presumably assigned
 * during initialization elsewhere (not visible in this part of the file). */
VALUE rb_cString;
VALUE rb_cSymbol;
  81
  82 /* FLAGS of RString
  83  *
  84  * 1:     RSTRING_NOEMBED
  85  * 2:     STR_SHARED (== ELTS_SHARED)
  86  * 2-6:   RSTRING_EMBED_LEN (5 bits == 32)
  87  * 5:     STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
  88  *                         other strings that rely on this string's buffer)
  89  * 6:     STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
  90  *                      early, specific to rb_str_tmp_frozen_{acquire,release})
  91  * 7:     STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
  92  *                     such as read(2). Any modification and realloc is prohibited)
  93  *
  94  * 8-9:   ENC_CODERANGE (2 bits)
  95  * 10-16: ENCODING (7 bits == 128)
  96  * 17:    RSTRING_FSTR
  97  * 18:    STR_NOFREE (do not free this string's buffer when a String is freed.
  98  *                    used for a string object based on C string literal)
  99  * 19:    STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
 100  *                     object header is temporarily allocated on C stack)
 101  */
 102
/* RUBY_MAX_CHAR_LEN: presumably the largest byte length of a single
 * character (no use is visible in this part of the file -- confirm).
 * The STR_* names alias the generic FL_USERn bits listed in the
 * "FLAGS of RString" table. */
#define RUBY_MAX_CHAR_LEN 16
#define STR_SHARED_ROOT FL_USER5
#define STR_BORROWED FL_USER6
#define STR_TMPLOCK FL_USER7
#define STR_NOFREE FL_USER18
#define STR_FAKESTR FL_USER19
 109
/* Flag `str` as not embedded by setting STR_NOEMBED.  When USE_RVARGC is
 * true the STR_SHARED, STR_SHARED_ROOT and STR_BORROWED bits are cleared
 * as well; otherwise the embedded-length field is reset to 0.
 * Note: `str` is expanded more than once. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    if (USE_RVARGC) {\
        FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), 0);\
    }\
} while (0)
/* Clear both STR_NOEMBED and STR_NOFREE on `str`. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* Record `n` as the embedded length of `str`.
 * - USE_RVARGC: the length is stored in as.embed.len, after asserting
 *   that `n` is strictly smaller than str_embed_capa(str).
 * - otherwise: the length is packed into the flags word using
 *   RSTRING_EMBED_LEN_MASK / RSTRING_EMBED_LEN_SHIFT; `n` is captured in
 *   a local first, so it is evaluated once in that variant. */
#if USE_RVARGC
# define STR_SET_EMBED_LEN(str, n) do { \
    assert(str_embed_capa(str) > (n));\
    RSTRING(str)->as.embed.len = (n);\
} while (0)
#else
# define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
#endif
 132
/* Set the length of `str` to `n`, writing to the embedded length when
 * STR_EMBED_P(str) holds and to as.heap.len otherwise. */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)
 141
/* Decrease the length of `str` by one.  For an embedded string the
 * current length is read, decremented and stored back through
 * STR_SET_EMBED_LEN; for a heap string as.heap.len is decremented in
 * place.  No check for a zero length is made here. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
 152
/* Terminator width used for `str`: the minimum character length
 * (rb_enc_mbminlen) of the encoding returned by rb_enc_get(str). */
#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/* Write a terminator at `ptr`: the first byte is always set to '\0';
 * when `termlen` is greater than 1, `termlen` bytes starting at `ptr`
 * are zeroed with memset.  Both arguments are captured in locals, so
 * each is evaluated exactly once. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
 161
/* Resize the buffer of `str` to `capacity` bytes plus a terminator whose
 * width is taken from TERM_LEN(str); delegates to RESIZE_CAPA_TERM. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
 166 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
 167     if (STR_EMBED_P(str)) {\
 168         if (str_embed_capa(str) < capacity + termlen) {\
 169             char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
 170             const long tlen = RSTRING_LEN(str);\
 171             memcpy(tmp, RSTRING_PTR(str), tlen);\
 172             RSTRING(str)->as.heap.ptr = tmp;\
 173             RSTRING(str)->as.heap.len = tlen;\
 174             STR_SET_NOEMBED(str);\
 175             RSTRING(str)->as.heap.aux.capa = (capacity);\
 176         }\
 177     }\
 178     else {\
 179         assert(!FL_TEST((str), STR_SHARED)); \
 180         SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
 181                         (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
 182         RSTRING(str)->as.heap.aux.capa = (capacity);\
 183     }\
 184 } while (0)
 185
/* Make `str` refer to `shared_str` as the owner of its buffer.
 * Nothing is done when `str` carries STR_FAKESTR.  Otherwise:
 *  - two asserts check that str's data pointer lies between the start of
 *    shared_str's data and start + RSTRING_LEN(shared_str), inclusive;
 *  - shared_str is stored into str's as.heap.aux.shared via RB_OBJ_WRITE;
 *  - STR_SHARED is set on str and STR_SHARED_ROOT on shared_str;
 *  - if shared_str has a zero class, STR_BORROWED is set on it too.
 * Both arguments are expanded multiple times. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
 197
/* STR_HEAP_PTR: the heap buffer pointer of `str` (as.heap.ptr).
 * STR_HEAP_SIZE: aux.capa plus TERM_LEN(str), i.e. the capacity field
 * with the terminator width added on top. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */
 201
/* Encoding of `str` as resolved by the file-local get_encoding(). */
#define STR_ENC_GET(str) get_encoding(str)
 203
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless defined beforehand.
 * With the default, SHARABLE_SUBSTRING_P is true only when
 * beg + len == end (the substring reaches `end`); when the option is
 * enabled the predicate is always 1. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if !SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#endif
 212
 213
 214 static inline long
 215 str_embed_capa(VALUE str)
 216 {
 217 #if USE_RVARGC
 218     return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
 219 #else
 220     return RSTRING_EMBED_LEN_MAX + 1;
 221 #endif
 222 }
 223
 224 static inline size_t
 225 str_embed_size(long capa)
 226 {
 227     return offsetof(struct RString, as.embed.ary) + capa;
 228 }
 229
 230 static inline bool
 231 STR_EMBEDDABLE_P(long len, long termlen)
 232 {
 233 #if USE_RVARGC
 234     return rb_gc_size_allocatable_p(str_embed_size(len + termlen));
 235 #else
 236     return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
 237 #endif
 238 }
 239
 240 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
 241 static VALUE str_new_frozen(VALUE klass, VALUE orig);
 242 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
 243 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
 244 static VALUE str_new(VALUE klass, const char *ptr, long len);
 245 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
 246 static inline void str_modifiable(VALUE str);
 247 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
 248
 249 static inline void
 250 str_make_independent(VALUE str)
 251 {
 252     long len = RSTRING_LEN(str);
 253     int termlen = TERM_LEN(str);
 254     str_make_independent_expand((str), len, 0L, termlen);
 255 }
 256
 257 static inline int str_dependent_p(VALUE str);
 258
 259 void
 260 rb_str_make_independent(VALUE str)
 261 {
 262     if (str_dependent_p(str)) {
 263         str_make_independent(str);
 264     }
 265 }
 266
 267 void
 268 rb_debug_rstring_null_ptr(const char *func)
 269 {
 270     fprintf(stderr, "%s is returning NULL!! "
 271             "SIGSEGV is highly expected to follow immediately. "
 272             "If you could reproduce, attach your debugger here, "
 273             "and look at the passed string.",
 274             func);
 275 }
 276
 277 /* symbols for [up|down|swap]case/capitalize options */
 278 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
 279
 280 static rb_encoding *
 281 get_actual_encoding(const int encidx, VALUE str)
 282 {
 283     const unsigned char *q;
 284
 285     switch (encidx) {
 286       case ENCINDEX_UTF_16:
 287         if (RSTRING_LEN(str) < 2) break;
 288         q = (const unsigned char *)RSTRING_PTR(str);
 289         if (q[0] == 0xFE && q[1] == 0xFF) {
 290             return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
 291         }
 292         if (q[0] == 0xFF && q[1] == 0xFE) {
 293             return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
 294         }
 295         return rb_ascii8bit_encoding();
 296       case ENCINDEX_UTF_32:
 297         if (RSTRING_LEN(str) < 4) break;
 298         q = (const unsigned char *)RSTRING_PTR(str);
 299         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
 300             return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
 301         }
 302         if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
 303             return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
 304         }
 305         return rb_ascii8bit_encoding();
 306     }
 307     return rb_enc_from_index(encidx);
 308 }
 309
 310 static rb_encoding *
 311 get_encoding(VALUE str)
 312 {
 313     return get_actual_encoding(ENCODING_GET(str), str);
 314 }
 315
 316 static void
 317 mustnot_broken(VALUE str)
 318 {
 319     if (is_broken_string(str)) {
 320         rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
 321     }
 322 }
 323
 324 static void
 325 mustnot_wchar(VALUE str)
 326 {
 327     rb_encoding *enc = STR_ENC_GET(str);
 328     if (rb_enc_mbminlen(enc) > 1) {
 329         rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
 330     }
 331 }
 332
 333 static int fstring_cmp(VALUE a, VALUE b);
 334
 335 static VALUE register_fstring(VALUE str, bool copy);
 336
/* st hash-type descriptor whose first member is fstring_cmp and whose
 * second is rb_str_hash.  Presumably the type of the table returned by
 * rb_vm_fstring_table() -- that wiring is not visible here. */
const struct st_hash_type rb_fstring_hash_type = {
    fstring_cmp,
    rb_str_hash,
};
 341
/* True when `str` does not have FL_EXIVAR set and its class is exactly
 * rb_cString. */
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
 343
/* Argument block handed to fstr_update_callback through st_update(). */
struct fstr_update_arg {
    VALUE fstr; /* out: the table's string, or Qundef when a retry is needed */
    bool copy;  /* in: for a fake string, copy the bytes instead of referencing them */
};
 348
/* st_update() callback used by register_fstring().
 *
 * `data` points to a struct fstr_update_arg.  When the key already
 * exists, the stored string is reported through arg->fstr and ST_STOP is
 * returned -- unless the stored object is garbage, in which case
 * arg->fstr is set to Qundef and the entry is deleted (ST_DELETE) so the
 * caller can retry.  When the key is new, a suitable frozen string is
 * prepared, flagged RSTRING_FSTR, written to both *key and *value and
 * reported through arg->fstr (ST_CONTINUE). */
static int
fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
{

    struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
    VALUE str = (VALUE)*key;

    if (existing) {
        /* because of lazy sweep, str may be unmarked already and swept
         * at next time */

        if (rb_objspace_garbage_object_p(str)) {
            arg->fstr = Qundef;
            return ST_DELETE;
        }

        arg->fstr = str;
        return ST_STOP;
    }
    else {
        if (FL_TEST_RAW(str, STR_FAKESTR)) {
            /* A fake string cannot be stored itself: build a real one,
             * either by copying the bytes or by referencing them. */
            if (arg->copy) {
                VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
                rb_enc_copy(new_str, str);
                str = new_str;
            }
            else {
                str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
                                     RSTRING(str)->as.heap.len,
                                     ENCODING_GET(str));
            }
            OBJ_FREEZE_RAW(str);
        }
        else {
            if (!OBJ_FROZEN(str))
                str = str_new_frozen(rb_cString, str);
            if (STR_SHARED_P(str)) { /* str should not be shared */
                /* shared substring  */
                str_make_independent(str);
                assert(OBJ_FROZEN(str));
            }
            /* The stored string must satisfy BARE_STRING_P. */
            if (!BARE_STRING_P(str)) {
                str = str_new_frozen(rb_cString, str);
            }
        }
        RBASIC(str)->flags |= RSTRING_FSTR;

        *key = *value = arg->fstr = str;
        return ST_CONTINUE;
    }
}
 400
/* Return the registered frozen string for `str`.
 *
 * `str` must be a T_STRING (Check_Type).  A string already flagged
 * RSTRING_FSTR is returned unchanged.  A string that fails
 * BARE_STRING_P is never replaced by the table entry: it is frozen and
 * returned itself, either directly (embedded, or a non-shared shared
 * root) or after its contents have been replaced with a reference to
 * the registered string.  A bare string yields the table entry. */
RUBY_FUNC_EXPORTED
VALUE
rb_fstring(VALUE str)
{
    VALUE fstr;
    int bare;

    Check_Type(str, T_STRING);

    if (FL_TEST(str, RSTRING_FSTR))
        return str;

    bare = BARE_STRING_P(str);
    if (!bare) {
        if (STR_EMBED_P(str)) {
            OBJ_FREEZE_RAW(str);
            return str;
        }
        /* heap string that is a shared root but not itself shared */
        if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
            assert(OBJ_FROZEN(str));
            return str;
        }
    }

    /* resize an unfrozen string to its own current length before
     * registering it */
    if (!OBJ_FROZEN(str))
        rb_str_resize(str, RSTRING_LEN(str));

    fstr = register_fstring(str, FALSE);

    if (!bare) {
        str_replace_shared_without_enc(str, fstr);
        OBJ_FREEZE_RAW(str);
        return str;
    }
    return fstr;
}
 437
/* Look up or insert `str` in the table returned by
 * rb_vm_fstring_table() and return the string the table holds.
 *
 * The update runs between RB_VM_LOCK_ENTER() and RB_VM_LOCK_LEAVE().
 * fstr_update_callback sets args.fstr to Qundef after deleting a garbage
 * entry, so the update is repeated until a real string comes back.
 * `copy` is forwarded to the callback.  The asserts record what the
 * result must look like: frozen, not a fake string, no FL_EXIVAR, and of
 * class rb_cString. */
static VALUE
register_fstring(VALUE str, bool copy)
{
    struct fstr_update_arg args;
    args.copy = copy;

    RB_VM_LOCK_ENTER();
    {
        st_table *frozen_strings = rb_vm_fstring_table();
        do {
            args.fstr = str;
            st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
        } while (args.fstr == Qundef);
    }
    RB_VM_LOCK_LEAVE();

    assert(OBJ_FROZEN(args.fstr));
    assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
    assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
    assert(RBASIC_CLASS(args.fstr) == rb_cString);
    return args.fstr;
}
 460
/* Initialize the caller-provided `fake_str` header so that it refers to
 * the `len` bytes at `name` without copying them, and return it as a
 * VALUE.
 *
 * The header is flagged T_STRING, RSTRING_NOEMBED, STR_NOFREE and
 * STR_FAKESTR, gets encoding index `encidx` and class rb_cString, and
 * has both its length and capacity set to `len`.  A NULL `name` is
 * accepted only together with len == 0 (checked by RUBY_ASSERT_ALWAYS)
 * and is replaced by "". */
static VALUE
setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
{
    fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
    /* SHARED to be allocated by the callback */

    if (!name) {
        RUBY_ASSERT_ALWAYS(len == 0);
        name = "";
    }

    /* must follow the flags assignment above, which overwrote the whole
     * flags word */
    ENCODING_SET_INLINED((VALUE)fake_str, encidx);

    RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
    fake_str->as.heap.len = len;
    fake_str->as.heap.ptr = (char *)name;
    fake_str->as.heap.aux.capa = len;
    return (VALUE)fake_str;
}
 480
 481 /*
 482  * set up a fake string which refers a static string literal.
 483  */
 484 VALUE
 485 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
 486 {
 487     return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
 488 }
 489
 490 /*
 491  * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
 492  * shared string which refers a static string literal.  `ptr` must
 493  * point a constant string.
 494  */
 495 MJIT_FUNC_EXPORTED VALUE
 496 rb_fstring_new(const char *ptr, long len)
 497 {
 498     struct RString fake_str;
 499     return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
 500 }
 501
 502 VALUE
 503 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
 504 {
 505     struct RString fake_str;
 506     return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
 507 }
 508
 509 VALUE
 510 rb_fstring_cstr(const char *ptr)
 511 {
 512     return rb_fstring_new(ptr, strlen(ptr));
 513 }
 514
 515 static int
 516 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
 517 {
 518     RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
 519     return ST_CONTINUE;
 520 }
 521
 522 static int
 523 fstring_cmp(VALUE a, VALUE b)
 524 {
 525     long alen, blen;
 526     const char *aptr, *bptr;
 527     RSTRING_GETMEM(a, aptr, alen);
 528     RSTRING_GETMEM(b, bptr, blen);
 529     return (alen != blen ||
 530             ENCODING_GET(a) != ENCODING_GET(b) ||
 531             memcmp(aptr, bptr, alen) != 0);
 532 }
 533
 534 static inline int
 535 single_byte_optimizable(VALUE str)
 536 {
 537     rb_encoding *enc;
 538
 539     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
 540     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
 541         return 1;
 542
 543     enc = STR_ENC_GET(str);
 544     if (rb_enc_mbmaxlen(enc) == 1)
 545         return 1;
 546
 547     /* Conservative.  Possibly single byte.
 548      * "\xa1" in Shift_JIS for example. */
 549     return 0;
 550 }
 551
/* Global VALUE; presumably the default field separator ($;) -- its
 * users are not visible in this part of the file, confirm. */
VALUE rb_fs;
 553
/* Find the first byte in [p, e) whose high bit (0x80) is set.
 * Returns a pointer to that byte, or NULL when there is none.
 *
 * The scan has up to three phases:
 *  1. unless UNALIGNED_WORD_ACCESS, the bytes before the next
 *     pointer-size boundary are checked one at a time;
 *  2. whole uintptr_t words are tested against NONASCII_MASK (0x80 in
 *     every byte); on a hit the offending byte inside the word is
 *     located by counting leading (big-endian) or trailing
 *     (little-endian) zero bits and dividing by 8;
 *  3. the remaining tail, shorter than one word, is checked byte by byte.
 * Phases 1 and 2 are only entered when UNALIGNED_WORD_ACCESS is set or at
 * least SIZEOF_VOIDP bytes are available. */
static inline const char *
search_nonascii(const char *p, const char *e)
{
    const uintptr_t *s, *t;

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
# if SIZEOF_UINTPTR_T == 8
#  define NONASCII_MASK UINT64_C(0x8080808080808080)
# elif SIZEOF_UINTPTR_T == 4
#  define NONASCII_MASK UINT32_C(0x80808080)
# else
#  error "don't know what to do."
# endif
#else
# if SIZEOF_UINTPTR_T == 8
#  define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
# elif SIZEOF_UINTPTR_T == 4
#  define NONASCII_MASK 0x80808080UL /* or...? */
# else
#  error "don't know what to do."
# endif
#endif

    if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
#if !UNALIGNED_WORD_ACCESS
        if ((uintptr_t)p % SIZEOF_VOIDP) {
            /* l = number of bytes up to the next aligned address; the
             * cases below deliberately fall through so that the bytes are
             * tested in increasing address order. */
            int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
            p += l;
            switch (l) {
              default: UNREACHABLE;
#if SIZEOF_VOIDP > 4
              case 7: if (p[-7]&0x80) return p-7;
              case 6: if (p[-6]&0x80) return p-6;
              case 5: if (p[-5]&0x80) return p-5;
              case 4: if (p[-4]&0x80) return p-4;
#endif
              case 3: if (p[-3]&0x80) return p-3;
              case 2: if (p[-2]&0x80) return p-2;
              case 1: if (p[-1]&0x80) return p-1;
              case 0: break;
            }
        }
#endif
#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
#define aligned_ptr(value) \
        __builtin_assume_aligned((value), sizeof(uintptr_t))
#else
#define aligned_ptr(value) (uintptr_t *)(value)
#endif
        s = aligned_ptr(p);
        /* t marks the last position from which a full word still fits
         * before e */
        t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
#undef aligned_ptr
        for (;s < t; s++) {
            if (*s & NONASCII_MASK) {
#ifdef WORDS_BIGENDIAN
                return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
#else
                return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
#endif
            }
        }
        p = (const char *)s;
    }

    /* tail: fewer than SIZEOF_VOIDP bytes remain; fall-through again
     * checks them in increasing address order */
    switch (e - p) {
      default: UNREACHABLE;
#if SIZEOF_VOIDP > 4
      case 7: if (e[-7]&0x80) return e-7;
      case 6: if (e[-6]&0x80) return e-6;
      case 5: if (e[-5]&0x80) return e-5;
      case 4: if (e[-4]&0x80) return e-4;
#endif
      case 3: if (e[-3]&0x80) return e-3;
      case 2: if (e[-2]&0x80) return e-2;
      case 1: if (e[-1]&0x80) return e-1;
      case 0: return NULL;
    }
}
 632
 633 static int
 634 coderange_scan(const char *p, long len, rb_encoding *enc)
 635 {
 636     const char *e = p + len;
 637
 638     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
 639         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 640         p = search_nonascii(p, e);
 641         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 642     }
 643
 644     if (rb_enc_asciicompat(enc)) {
 645         p = search_nonascii(p, e);
 646         if (!p) return ENC_CODERANGE_7BIT;
 647         for (;;) {
 648             int ret = rb_enc_precise_mbclen(p, e, enc);
 649             if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
 650             p += MBCLEN_CHARFOUND_LEN(ret);
 651             if (p == e) break;
 652             p = search_nonascii(p, e);
 653             if (!p) break;
 654         }
 655     }
 656     else {
 657         while (p < e) {
 658             int ret = rb_enc_precise_mbclen(p, e, enc);
 659             if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
 660             p += MBCLEN_CHARFOUND_LEN(ret);
 661         }
 662     }
 663     return ENC_CODERANGE_VALID;
 664 }
 665
 666 long
 667 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
 668 {
 669     const char *p = s;
 670
 671     if (*cr == ENC_CODERANGE_BROKEN)
 672         return e - s;
 673
 674     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
 675         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
 676         if (*cr == ENC_CODERANGE_VALID) return e - s;
 677         p = search_nonascii(p, e);
 678         *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 679         return e - s;
 680     }
 681     else if (rb_enc_asciicompat(enc)) {
 682         p = search_nonascii(p, e);
 683         if (!p) {
 684             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
 685             return e - s;
 686         }
 687         for (;;) {
 688             int ret = rb_enc_precise_mbclen(p, e, enc);
 689             if (!MBCLEN_CHARFOUND_P(ret)) {
 690                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 691                 return p - s;
 692             }
 693             p += MBCLEN_CHARFOUND_LEN(ret);
 694             if (p == e) break;
 695             p = search_nonascii(p, e);
 696             if (!p) break;
 697         }
 698     }
 699     else {
 700         while (p < e) {
 701             int ret = rb_enc_precise_mbclen(p, e, enc);
 702             if (!MBCLEN_CHARFOUND_P(ret)) {
 703                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 704                 return p - s;
 705             }
 706             p += MBCLEN_CHARFOUND_LEN(ret);
 707         }
 708     }
 709     *cr = ENC_CODERANGE_VALID;
 710     return e - s;
 711 }
 712
 713 static inline void
 714 str_enc_copy(VALUE str1, VALUE str2)
 715 {
 716     rb_enc_set_index(str1, ENCODING_GET(str2));
 717 }
 718
 719 static void
 720 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 721 {
 722     /* this function is designed for copying encoding and coderange
 723      * from src to new string "dest" which is made from the part of src.
 724      */
 725     str_enc_copy(dest, src);
 726     if (RSTRING_LEN(dest) == 0) {
 727         if (!rb_enc_asciicompat(STR_ENC_GET(src)))
 728             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 729         else
 730             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 731         return;
 732     }
 733     switch (ENC_CODERANGE(src)) {
 734       case ENC_CODERANGE_7BIT:
 735         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 736         break;
 737       case ENC_CODERANGE_VALID:
 738         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
 739             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
 740             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 741         else
 742             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 743         break;
 744       default:
 745         break;
 746     }
 747 }
 748
 749 static void
 750 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 751 {
 752     str_enc_copy(dest, src);
 753     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 754 }
 755
 756 static int
 757 enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
 758 {
 759     if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
 760         rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
 761         return ENC_CODERANGE_BROKEN;
 762     }
 763     else {
 764         return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
 765     }
 766 }
 767
 768 int
 769 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
 770 {
 771     return enc_coderange_scan(str, enc, rb_enc_to_index(enc));
 772 }
 773
 774 int
 775 rb_enc_str_coderange(VALUE str)
 776 {
 777     int cr = ENC_CODERANGE(str);
 778
 779     if (cr == ENC_CODERANGE_UNKNOWN) {
 780         int encidx = ENCODING_GET(str);
 781         rb_encoding *enc = rb_enc_from_index(encidx);
 782         cr = enc_coderange_scan(str, enc, encidx);
 783         ENC_CODERANGE_SET(str, cr);
 784     }
 785     return cr;
 786 }
 787
 788 int
 789 rb_enc_str_asciionly_p(VALUE str)
 790 {
 791     rb_encoding *enc = STR_ENC_GET(str);
 792
 793     if (!rb_enc_asciicompat(enc))
 794         return FALSE;
 795     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
 796         return TRUE;
 797     return FALSE;
 798 }
 799
 800 static inline void
 801 str_mod_check(VALUE s, const char *p, long len)
 802 {
 803     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
 804         rb_raise(rb_eRuntimeError, "string modified");
 805     }
 806 }
 807
 808 static size_t
 809 str_capacity(VALUE str, const int termlen)
 810 {
 811     if (STR_EMBED_P(str)) {
 812 #if USE_RVARGC
 813         return str_embed_capa(str) - termlen;
 814 #else
 815         return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
 816 #endif
 817     }
 818     else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
 819         return RSTRING(str)->as.heap.len;
 820     }
 821     else {
 822         return RSTRING(str)->as.heap.aux.capa;
 823     }
 824 }
 825
 826 size_t
 827 rb_str_capacity(VALUE str)
 828 {
 829     return str_capacity(str, TERM_LEN(str));
 830 }
 831
 832 static inline void
 833 must_not_null(const char *ptr)
 834 {
 835     if (!ptr) {
 836         rb_raise(rb_eArgError, "NULL pointer given");
 837     }
 838 }
 839
/* Allocate a T_STRING object of class `klass` occupying `size` bytes
 * (must be non-zero, asserted).  FL_WB_PROTECTED is added to the flags
 * when RGENGC_WB_PROTECTED_STRING is true. */
static inline VALUE
str_alloc(VALUE klass, size_t size)
{
    assert(size > 0);
    /* `str` is not declared in this function, so the macro is expected
     * to declare it */
    RVARGC_NEWOBJ_OF(str, struct RString, klass,
                     T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
    return (VALUE)str;
}
 848
 849 static inline VALUE
 850 str_alloc_embed(VALUE klass, size_t capa)
 851 {
 852     size_t size = str_embed_size(capa);
 853     assert(rb_gc_size_allocatable_p(size));
 854 #if !USE_RVARGC
 855     assert(size <= sizeof(struct RString));
 856 #endif
 857     return str_alloc(klass, size);
 858 }
 859
 860 static inline VALUE
 861 str_alloc_heap(VALUE klass)
 862 {
 863     return str_alloc(klass, sizeof(struct RString));
 864 }
 865
/* Allocate an embedded string of class `klass` with a requested
 * embedded capacity of 0 and zero its entire embedded area
 * (str_embed_capa(str) bytes).  The DTrace creation hook is fired with
 * length 0 before the allocation. */
static inline VALUE
empty_str_alloc(VALUE klass)
{
    RUBY_DTRACE_CREATE_HOOK(STRING, 0);
    VALUE str = str_alloc_embed(klass, 0);
    memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
    return str;
}
 874
/* Create a string of class `klass` holding `len` bytes followed by a
 * terminator of `termlen` zero bytes.
 *
 * ptr     - bytes to copy; may be NULL, in which case nothing is copied
 *           and the content bytes are left as allocated
 * len     - content length; a negative value raises ArgumentError
 * termlen - terminator width in bytes
 *
 * The string is embedded when STR_EMBEDDABLE_P(len, termlen) holds and
 * otherwise gets a heap buffer of len + termlen bytes with capa == len.
 * Only an empty embedded string has its code range set (to 7BIT) here. */
static VALUE
str_new0(VALUE klass, const char *ptr, long len, int termlen)
{
    VALUE str;

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }

    RUBY_DTRACE_CREATE_HOOK(STRING, len);

    if (STR_EMBEDDABLE_P(len, termlen)) {
        str = str_alloc_embed(klass, len + termlen);
        if (len == 0) {
            ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
        }
    }
    else {
        str = str_alloc_heap(klass);
        RSTRING(str)->as.heap.aux.capa = len;
        /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
         * integer overflow.  If we can STATIC_ASSERT that, the following
         * mul_add_mul can be reverted to a simple ALLOC_N. */
        RSTRING(str)->as.heap.ptr =
            rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
        STR_SET_NOEMBED(str);
    }
    if (ptr) {
        memcpy(RSTRING_PTR(str), ptr, len);
    }
    /* the length is set after the representation (embed/heap) has been
     * decided, because STR_SET_LEN dispatches on it */
    STR_SET_LEN(str, len);
    TERM_FILL(RSTRING_PTR(str) + len, termlen);
    return str;
}
 909
 910 static VALUE
 911 str_new(VALUE klass, const char *ptr, long len)
 912 {
 913     return str_new0(klass, ptr, len, 1);
 914 }
 915
 916 VALUE
 917 rb_str_new(const char *ptr, long len)
 918 {
 919     return str_new(rb_cString, ptr, len);
 920 }
 921
 922 VALUE
 923 rb_usascii_str_new(const char *ptr, long len)
 924 {
 925     VALUE str = rb_str_new(ptr, len);
 926     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 927     return str;
 928 }
 929
 930 VALUE
 931 rb_utf8_str_new(const char *ptr, long len)
 932 {
 933     VALUE str = str_new(rb_cString, ptr, len);
 934     rb_enc_associate_index(str, rb_utf8_encindex());
 935     return str;
 936 }
 937
 938 VALUE
 939 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 940 {
 941     VALUE str;
 942
 943     if (!enc) return rb_str_new(ptr, len);
 944
 945     str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
 946     rb_enc_associate(str, enc);
 947     return str;
 948 }
 949
 950 VALUE
 951 rb_str_new_cstr(const char *ptr)
 952 {
 953     must_not_null(ptr);
 954     /* rb_str_new_cstr() can take pointer from non-malloc-generated
 955      * memory regions, and that cannot be detected by the MSAN.  Just
 956      * trust the programmer that the argument passed here is a sane C
 957      * string. */
 958     __msan_unpoison_string(ptr);
 959     return rb_str_new(ptr, strlen(ptr));
 960 }
 961
 962 VALUE
 963 rb_usascii_str_new_cstr(const char *ptr)
 964 {
 965     VALUE str = rb_str_new_cstr(ptr);
 966     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 967     return str;
 968 }
 969
 970 VALUE
 971 rb_utf8_str_new_cstr(const char *ptr)
 972 {
 973     VALUE str = rb_str_new_cstr(ptr);
 974     rb_enc_associate_index(str, rb_utf8_encindex());
 975     return str;
 976 }
 977
 978 VALUE
 979 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
 980 {
 981     must_not_null(ptr);
 982     if (rb_enc_mbminlen(enc) != 1) {
 983         rb_raise(rb_eArgError, "wchar encoding given");
 984     }
 985     return rb_enc_str_new(ptr, strlen(ptr), enc);
 986 }
 987
/* Create a string of class `klass` that refers to the caller's buffer
 * instead of copying it.
 *
 * ptr      - buffer to reference; it is stored as is and flagged
 *            STR_NOFREE, so the string does not free it.  When NULL, an
 *            ordinary string of `len` bytes is allocated through
 *            str_new0() instead.
 * len      - length in bytes; a negative value raises ArgumentError
 * encindex - encoding index associated with the result */
static VALUE
str_new_static(VALUE klass, const char *ptr, long len, int encindex)
{
    VALUE str;

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }

    if (!ptr) {
        rb_encoding *enc = rb_enc_get_from_index(encindex);
        str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
    }
    else {
        RUBY_DTRACE_CREATE_HOOK(STRING, len);
        str = str_alloc_heap(klass);
        RSTRING(str)->as.heap.len = len;
        RSTRING(str)->as.heap.ptr = (char *)ptr;
        RSTRING(str)->as.heap.aux.capa = len;
        STR_SET_NOEMBED(str);
        RBASIC(str)->flags |= STR_NOFREE;
    }
    rb_enc_associate_index(str, encindex);
    return str;
}
1013
1014 VALUE
1015 rb_str_new_static(const char *ptr, long len)
1016 {
1017     return str_new_static(rb_cString, ptr, len, 0);
1018 }
1019
1020 VALUE
1021 rb_usascii_str_new_static(const char *ptr, long len)
1022 {
1023     return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1024 }
1025
1026 VALUE
1027 rb_utf8_str_new_static(const char *ptr, long len)
1028 {
1029     return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1030 }
1031
1032 VALUE
1033 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1034 {
1035     return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1036 }
1037
/* Forward declaration: the definition appears further down in this
 * file, after rb_str_conv_enc_opts() and rb_str_cat_conv_enc_opts(),
 * which both call it. */
static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
                                   rb_encoding *from, rb_encoding *to,
                                   int ecflags, VALUE ecopts);
1041
1042 static inline bool
1043 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1044 {
1045     int encidx = rb_enc_to_index(enc);
1046     if (rb_enc_get_index(str) == encidx)
1047         return is_ascii_string(str);
1048     return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
1049 }
1050
1051 VALUE
1052 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1053 {
1054     long len;
1055     const char *ptr;
1056     VALUE newstr;
1057
1058     if (!to) return str;
1059     if (!from) from = rb_enc_get(str);
1060     if (from == to) return str;
1061     if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1062         to == rb_ascii8bit_encoding()) {
1063         if (STR_ENC_GET(str) != to) {
1064             str = rb_str_dup(str);
1065             rb_enc_associate(str, to);
1066         }
1067         return str;
1068     }
1069
1070     RSTRING_GETMEM(str, ptr, len);
1071     newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1072                                    from, to, ecflags, ecopts);
1073     if (NIL_P(newstr)) {
1074         /* some error, return original */
1075         return str;
1076     }
1077     return newstr;
1078 }
1079
1080 VALUE
1081 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1082                          rb_encoding *from, int ecflags, VALUE ecopts)
1083 {
1084     long olen;
1085
1086     olen = RSTRING_LEN(newstr);
1087     if (ofs < -olen || olen < ofs)
1088         rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1089     if (ofs < 0) ofs += olen;
1090     if (!from) {
1091         STR_SET_LEN(newstr, ofs);
1092         return rb_str_cat(newstr, ptr, len);
1093     }
1094
1095     rb_str_modify(newstr);
1096     return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1097                                  rb_enc_get(newstr),
1098                                  ecflags, ecopts);
1099 }
1100
1101 VALUE
1102 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1103 {
1104     STR_SET_LEN(str, 0);
1105     rb_enc_associate(str, enc);
1106     rb_str_cat(str, ptr, len);
1107     return str;
1108 }
1109
/*
 * Converts +len+ bytes at +ptr+ from encoding +from+ to encoding +to+,
 * writing the output into the buffer of +newstr+ starting at byte
 * offset +ofs+.
 *
 * Whenever rb_econv_convert() reports econv_destination_buffer_full,
 * +newstr+ is resized and the conversion is resumed.
 *
 * Returns +newstr+ (length set to the end of the converted output and
 * encoding associated with +to+) when the converter reports
 * econv_finished.  Returns Qnil when the converter cannot be opened or
 * when the conversion stops with any other result.
 */
static VALUE
str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
                      rb_encoding *from, rb_encoding *to,
                      int ecflags, VALUE ecopts)
{
    rb_econv_t *ec;
    rb_econv_result_t ret;
    long olen;
    VALUE econv_wrapper;
    const unsigned char *start, *sp;
    unsigned char *dest, *dp;
    size_t converted_output = (size_t)ofs;

    olen = rb_str_capacity(newstr);

    /* A class-less Encoding::Converter object holds the converter while
     * it is open; the pointer is detached again before rb_econv_close().
     * NOTE(review): presumably this lets the GC release the converter
     * if something below raises -- confirm. */
    econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
    RBASIC_CLEAR_CLASS(econv_wrapper);
    ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
    if (!ec) return Qnil;
    DATA_PTR(econv_wrapper) = ec;

    sp = (unsigned char*)ptr;
    start = sp;
    /* Each iteration re-reads the buffer pointer (it may have moved in
     * rb_str_resize()) and resumes writing at converted_output. */
    while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
           (dp = dest + converted_output),
           (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
           ret == econv_destination_buffer_full) {
        /* destination buffer short */
        size_t converted_input = sp - start;
        size_t rest = len - converted_input;
        converted_output = dp - dest;
        rb_str_set_len(newstr, converted_output);
        /* Estimate the extra room from the remaining input scaled by
         * converted_output / converted_input; when either is zero or
         * the product might overflow, grow by the current size instead. */
        if (converted_input && converted_output &&
            rest < (LONG_MAX / converted_output)) {
            rest = (rest * converted_output) / converted_input;
        }
        else {
            rest = olen;
        }
        /* always grow by at least 2 bytes */
        olen += rest < 2 ? 2 : rest;
        rb_str_resize(newstr, olen);
    }
    DATA_PTR(econv_wrapper) = 0;
    rb_econv_close(ec);
    switch (ret) {
      case econv_finished:
        len = dp - (unsigned char*)RSTRING_PTR(newstr);
        rb_str_set_len(newstr, len);
        rb_enc_associate(newstr, to);
        return newstr;

      default:
        return Qnil;
    }
}
1165
1166 VALUE
1167 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1168 {
1169     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1170 }
1171
1172 VALUE
1173 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1174 {
1175     rb_encoding *ienc;
1176     VALUE str;
1177     const int eidx = rb_enc_to_index(eenc);
1178
1179     if (!ptr) {
1180         return rb_enc_str_new(ptr, len, eenc);
1181     }
1182
1183     /* ASCII-8BIT case, no conversion */
1184     if ((eidx == rb_ascii8bit_encindex()) ||
1185         (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1186         return rb_str_new(ptr, len);
1187     }
1188     /* no default_internal or same encoding, no conversion */
1189     ienc = rb_default_internal_encoding();
1190     if (!ienc || eenc == ienc) {
1191         return rb_enc_str_new(ptr, len, eenc);
1192     }
1193     /* ASCII compatible, and ASCII only string, no conversion in
1194      * default_internal */
1195     if ((eidx == rb_ascii8bit_encindex()) ||
1196         (eidx == rb_usascii_encindex()) ||
1197         (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1198         return rb_enc_str_new(ptr, len, ienc);
1199     }
1200     /* convert from the given encoding to default_internal */
1201     str = rb_enc_str_new(NULL, 0, ienc);
1202     /* when the conversion failed for some reason, just ignore the
1203      * default_internal and result in the given encoding as-is. */
1204     if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1205         rb_str_initialize(str, ptr, len, eenc);
1206     }
1207     return str;
1208 }
1209
1210 VALUE
1211 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1212 {
1213     int eidx = rb_enc_to_index(eenc);
1214     if (eidx == rb_usascii_encindex() &&
1215         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
1216         rb_enc_associate_index(str, rb_ascii8bit_encindex());
1217         return str;
1218     }
1219     rb_enc_associate_index(str, eidx);
1220     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1221 }
1222
1223 VALUE
1224 rb_external_str_new(const char *ptr, long len)
1225 {
1226     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1227 }
1228
1229 VALUE
1230 rb_external_str_new_cstr(const char *ptr)
1231 {
1232     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1233 }
1234
1235 VALUE
1236 rb_locale_str_new(const char *ptr, long len)
1237 {
1238     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1239 }
1240
1241 VALUE
1242 rb_locale_str_new_cstr(const char *ptr)
1243 {
1244     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1245 }
1246
1247 VALUE
1248 rb_filesystem_str_new(const char *ptr, long len)
1249 {
1250     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1251 }
1252
1253 VALUE
1254 rb_filesystem_str_new_cstr(const char *ptr)
1255 {
1256     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1257 }
1258
1259 VALUE
1260 rb_str_export(VALUE str)
1261 {
1262     return rb_str_export_to_enc(str, rb_default_external_encoding());
1263 }
1264
1265 VALUE
1266 rb_str_export_locale(VALUE str)
1267 {
1268     return rb_str_export_to_enc(str, rb_locale_encoding());
1269 }
1270
1271 VALUE
1272 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1273 {
1274     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1275 }
1276
/*
 * Makes +str2+ hold the same bytes as +str+ without touching the
 * encoding or code range of +str2+.  Returns +str2+.
 *
 * If the content plus terminator fits in the embedded area of +str2+,
 * the bytes are copied there.  Otherwise +str2+ is pointed at the
 * buffer of a frozen root string and marked as shared with it.
 */
static VALUE
str_replace_shared_without_enc(VALUE str2, VALUE str)
{
    const int termlen = TERM_LEN(str);
    char *ptr;
    long len;

    RSTRING_GETMEM(str, ptr, len);
    if (str_embed_capa(str2) >= len + termlen) {
        /* small enough: copy into str2's embedded storage */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str), len);
        STR_SET_EMBED_LEN(str2, len);
        TERM_FILL(ptr2+len, termlen);
    }
    else {
        VALUE root;
        if (STR_SHARED_P(str)) {
            /* str already shares a buffer: reuse its root */
            root = RSTRING(str)->as.heap.aux.shared;
            RSTRING_GETMEM(str, ptr, len);
        }
        else {
            /* obtain a frozen string for str and share its buffer */
            root = rb_str_new_frozen(str);
            RSTRING_GETMEM(root, ptr, len);
        }
        assert(OBJ_FROZEN(root));
        /* str2 has a heap buffer that is neither shared nor STR_NOFREE:
         * release it, unless it is the very buffer being installed */
        if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
            if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
                rb_fatal("about to free a possible shared root");
            }
            char *ptr2 = STR_HEAP_PTR(str2);
            if (ptr2 != ptr) {
                ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
            }
        }
        FL_SET(str2, STR_NOEMBED);
        RSTRING(str2)->as.heap.len = len;
        RSTRING(str2)->as.heap.ptr = ptr;
        STR_SET_SHARED(str2, root);
    }
    return str2;
}
1319
1320 static VALUE
1321 str_replace_shared(VALUE str2, VALUE str)
1322 {
1323     str_replace_shared_without_enc(str2, str);
1324     rb_enc_cr_str_exact_copy(str2, str);
1325     return str2;
1326 }
1327
1328 static VALUE
1329 str_new_shared(VALUE klass, VALUE str)
1330 {
1331     return str_replace_shared(str_alloc_heap(klass), str);
1332 }
1333
1334 VALUE
1335 rb_str_new_shared(VALUE str)
1336 {
1337     return str_new_shared(rb_obj_class(str), str);
1338 }
1339
1340 VALUE
1341 rb_str_new_frozen(VALUE orig)
1342 {
1343     if (OBJ_FROZEN(orig)) return orig;
1344     return str_new_frozen(rb_obj_class(orig), orig);
1345 }
1346
1347 static VALUE
1348 rb_str_new_frozen_String(VALUE orig)
1349 {
1350     if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1351     return str_new_frozen(rb_cString, orig);
1352 }
1353
1354 VALUE
1355 rb_str_tmp_frozen_acquire(VALUE orig)
1356 {
1357     if (OBJ_FROZEN_RAW(orig)) return orig;
1358     return str_new_frozen_buffer(0, orig, FALSE);
1359 }
1360
/*
 * Counterpart of rb_str_tmp_frozen_acquire(): +tmp+ is the value that
 * call returned for +orig+.
 *
 * Nothing is done when +tmp+ has a class.  When +tmp+ is a class-less
 * heap string that +orig+ still shares and that is not flagged
 * STR_BORROWED, the buffer is handed back to +orig+ and +tmp+ is turned
 * into an empty embedded string.
 */
void
rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
{
    if (RBASIC_CLASS(tmp) != 0)
        return;

    if (STR_EMBED_P(tmp)) {
        assert(OBJ_FROZEN_RAW(tmp));
    }
    else if (FL_TEST_RAW(orig, STR_SHARED) &&
            !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
        VALUE shared = RSTRING(orig)->as.heap.aux.shared;

        if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
            assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
            assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);

            /* Unshare orig since the root (tmp) only has this one child. */
            FL_UNSET_RAW(orig, STR_SHARED);
            /* aux.shared is replaced by the capacity recorded in tmp */
            RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
            /* carry a STR_NOFREE flag on tmp over to orig */
            RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
            assert(OBJ_FROZEN_RAW(tmp));

            /* Make tmp embedded and empty so it is safe for sweeping. */
            STR_SET_EMBED(tmp);
            STR_SET_EMBED_LEN(tmp, 0);
        }
    }
}
1390
1391 static VALUE
1392 str_new_frozen(VALUE klass, VALUE orig)
1393 {
1394     return str_new_frozen_buffer(klass, orig, TRUE);
1395 }
1396
1397 static VALUE
1398 heap_str_make_shared(VALUE klass, VALUE orig)
1399 {
1400     assert(!STR_EMBED_P(orig));
1401     assert(!STR_SHARED_P(orig));
1402
1403     VALUE str = str_alloc_heap(klass);
1404     STR_SET_NOEMBED(str);
1405     RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1406     RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1407     RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1408     RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1409     RBASIC(orig)->flags &= ~STR_NOFREE;
1410     STR_SET_SHARED(orig, str);
1411     if (klass == 0)
1412         FL_UNSET_RAW(str, STR_BORROWED);
1413     return str;
1414 }
1415
/*
 * Returns a frozen string of class +klass+ with the content of +orig+.
 *
 * - Embedded or embeddable content is copied into a new string.
 * - If +orig+ is shared, its root is returned directly when it covers
 *   exactly the same bytes and has the same class and encoding;
 *   otherwise a new string sharing the relevant slice of the root is
 *   created.
 * - Otherwise the content is either copied into a new embedded string
 *   (when it fits together with the terminator) or +orig+ is made to
 *   share a new root through heap_str_make_shared().
 *
 * When +copy_encoding+ is true the encoding and code range of +orig+
 * are copied to the result (not applied when the shared root is
 * returned directly).
 */
static VALUE
str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
{
    VALUE str;

    long len = RSTRING_LEN(orig);

    if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, 1)) {
        str = str_new(klass, RSTRING_PTR(orig), len);
        assert(STR_EMBED_P(str));
    }
    else {
        if (FL_TEST_RAW(orig, STR_SHARED)) {
            VALUE shared = RSTRING(orig)->as.heap.aux.shared;
            /* ofs: bytes of the root before orig's start;
             * rest: bytes of the root after orig's end */
            long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
            long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
            assert(ofs >= 0);
            assert(rest >= 0);
            assert(ofs + rest <= RSTRING_LEN(shared));
#if !USE_RVARGC
            assert(!STR_EMBED_P(shared));
#endif
            assert(OBJ_FROZEN(shared));

            if ((ofs > 0) || (rest > 0) ||
                (klass != RBASIC(shared)->klass) ||
                ENCODING_GET(shared) != ENCODING_GET(orig)) {
                str = str_new_shared(klass, shared);
                assert(!STR_EMBED_P(str));
                /* narrow the new string to the slice orig refers to */
                RSTRING(str)->as.heap.ptr += ofs;
                RSTRING(str)->as.heap.len -= ofs + rest;
            }
            else {
                /* the root itself can serve as the result; a class-less
                 * root is marked STR_BORROWED */
                if (RBASIC_CLASS(shared) == 0)
                    FL_SET_RAW(shared, STR_BORROWED);
                return shared;
            }
        }
        else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
            str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
            STR_SET_EMBED(str);
            memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
            STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
            TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
        }
        else {
            str = heap_str_make_shared(klass, orig);
        }
    }

    if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
    OBJ_FREEZE(str);
    return str;
}
1470
1471 VALUE
1472 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1473 {
1474     return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1475 }
1476
1477 static VALUE
1478 str_new_empty_String(VALUE str)
1479 {
1480     VALUE v = rb_str_new(0, 0);
1481     rb_enc_copy(v, str);
1482     return v;
1483 }
1484
/* Lower bound applied to requested buffer capacities.  In builds
 * without USE_RVARGC it is statically checked to exceed
 * RSTRING_EMBED_LEN_MAX. */
#define STR_BUF_MIN_SIZE 63
#if !USE_RVARGC
STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
#endif
1489
1490 VALUE
1491 rb_str_buf_new(long capa)
1492 {
1493     if (STR_EMBEDDABLE_P(capa, 1)) {
1494         return str_alloc_embed(rb_cString, capa + 1);
1495     }
1496
1497     VALUE str = str_alloc_heap(rb_cString);
1498
1499 #if !USE_RVARGC
1500     if (capa < STR_BUF_MIN_SIZE) {
1501         capa = STR_BUF_MIN_SIZE;
1502     }
1503 #endif
1504     FL_SET(str, STR_NOEMBED);
1505     RSTRING(str)->as.heap.aux.capa = capa;
1506     RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1507     RSTRING(str)->as.heap.ptr[0] = '\0';
1508
1509     return str;
1510 }
1511
1512 VALUE
1513 rb_str_buf_new_cstr(const char *ptr)
1514 {
1515     VALUE str;
1516     long len = strlen(ptr);
1517
1518     str = rb_str_buf_new(len);
1519     rb_str_buf_cat(str, ptr, len);
1520
1521     return str;
1522 }
1523
1524 VALUE
1525 rb_str_tmp_new(long len)
1526 {
1527     return str_new(0, 0, len);
1528 }
1529
1530 void
1531 rb_str_free(VALUE str)
1532 {
1533     if (FL_TEST(str, RSTRING_FSTR)) {
1534         st_data_t fstr = (st_data_t)str;
1535
1536         RB_VM_LOCK_ENTER();
1537         {
1538             st_delete(rb_vm_fstring_table(), &fstr, NULL);
1539             RB_DEBUG_COUNTER_INC(obj_str_fstr);
1540         }
1541         RB_VM_LOCK_LEAVE();
1542     }
1543
1544     if (STR_EMBED_P(str)) {
1545         RB_DEBUG_COUNTER_INC(obj_str_embed);
1546     }
1547     else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1548         (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1549         (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1550     }
1551     else {
1552         RB_DEBUG_COUNTER_INC(obj_str_ptr);
1553         ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1554     }
1555 }
1556
1557 RUBY_FUNC_EXPORTED size_t
1558 rb_str_memsize(VALUE str)
1559 {
1560     if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1561         return STR_HEAP_SIZE(str);
1562     }
1563     else {
1564         return 0;
1565     }
1566 }
1567
1568 VALUE
1569 rb_str_to_str(VALUE str)
1570 {
1571     return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1572 }
1573
/* Forward declarations of helpers used by the functions that follow. */
static inline void str_discard(VALUE str);
static void str_shared_replace(VALUE str, VALUE str2);
1576
1577 void
1578 rb_str_shared_replace(VALUE str, VALUE str2)
1579 {
1580     if (str != str2) str_shared_replace(str, str2);
1581 }
1582
/*
 * Replaces the content of +str+ with the content, encoding and code
 * range of +str2+.  +str+ and +str2+ must be different objects.
 *
 * When the content fits, it is copied into the embedded area of +str+.
 * Otherwise +str+ takes over the heap buffer of +str2+ (or joins the
 * same shared root), and +str2+ is left as an empty embedded string.
 */
static void
str_shared_replace(VALUE str, VALUE str2)
{
    rb_encoding *enc;
    int cr;
    int termlen;

    RUBY_ASSERT(str2 != str);
    /* remember str2's encoding and code range before anything changes */
    enc = STR_ENC_GET(str2);
    cr = ENC_CODERANGE(str2);
    str_discard(str);
    termlen = rb_enc_mbminlen(enc);

    if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
        STR_SET_EMBED(str);
        /* the terminator bytes are copied along with the content */
        memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
        STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
    }
    else {
#if USE_RVARGC
        /* str2 is embedded but too large for str: move its content to
         * a heap buffer first so that it can be handed over below */
        if (STR_EMBED_P(str2)) {
            assert(!FL_TEST(str2, STR_SHARED));
            long len = RSTRING(str2)->as.embed.len;
            assert(len + termlen <= str_embed_capa(str2));

            char *new_ptr = ALLOC_N(char, len + termlen);
            memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
            RSTRING(str2)->as.heap.ptr = new_ptr;
            RSTRING(str2)->as.heap.len = len;
            RSTRING(str2)->as.heap.aux.capa = len;
            STR_SET_NOEMBED(str2);
        }
#endif

        STR_SET_NOEMBED(str);
        FL_UNSET(str, STR_SHARED);
        RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
        RSTRING(str)->as.heap.len = RSTRING_LEN(str2);

        if (FL_TEST(str2, STR_SHARED)) {
            /* str2 was a shared child: str becomes a child of the same root */
            VALUE shared = RSTRING(str2)->as.heap.aux.shared;
            STR_SET_SHARED(str, shared);
        }
        else {
            RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
        }

        /* abandon str2 */
        STR_SET_EMBED(str2);
        RSTRING_PTR(str2)[0] = 0;
        STR_SET_EMBED_LEN(str2, 0);
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
    }
}
1640
1641 VALUE
1642 rb_obj_as_string(VALUE obj)
1643 {
1644     VALUE str;
1645
1646     if (RB_TYPE_P(obj, T_STRING)) {
1647         return obj;
1648     }
1649     str = rb_funcall(obj, idTo_s, 0);
1650     return rb_obj_as_string_result(str, obj);
1651 }
1652
1653 MJIT_FUNC_EXPORTED VALUE
1654 rb_obj_as_string_result(VALUE str, VALUE obj)
1655 {
1656     if (!RB_TYPE_P(str, T_STRING))
1657         return rb_any_to_s(obj);
1658     return str;
1659 }
1660
1661 static VALUE
1662 str_replace(VALUE str, VALUE str2)
1663 {
1664     long len;
1665
1666     len = RSTRING_LEN(str2);
1667     if (STR_SHARED_P(str2)) {
1668         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1669         assert(OBJ_FROZEN(shared));
1670         STR_SET_NOEMBED(str);
1671         RSTRING(str)->as.heap.len = len;
1672         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1673         STR_SET_SHARED(str, shared);
1674         rb_enc_cr_str_exact_copy(str, str2);
1675     }
1676     else {
1677         str_replace_shared(str, str2);
1678     }
1679
1680     return str;
1681 }
1682
1683 static inline VALUE
1684 ec_str_alloc(struct rb_execution_context_struct *ec, VALUE klass, size_t size)
1685 {
1686     assert(size > 0);
1687     RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1688                            T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size);
1689     return (VALUE)str;
1690 }
1691
1692 static inline VALUE
1693 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1694 {
1695     size_t size = str_embed_size(capa);
1696     assert(rb_gc_size_allocatable_p(size));
1697 #if !USE_RVARGC
1698     assert(size <= sizeof(struct RString));
1699 #endif
1700     return ec_str_alloc(ec, klass, size);
1701 }
1702
1703 static inline VALUE
1704 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1705 {
1706     return ec_str_alloc(ec, klass, sizeof(struct RString));
1707 }
1708
/*
 * Fills the freshly allocated string +dup+ so that it duplicates +str+.
 *
 * An embedded +str+ has its bytes copied.  A heap +str+ is duplicated
 * by sharing: +dup+ points at the same bytes and records the shared
 * root (an existing root, +str+ itself when frozen, or a new frozen
 * string created from +str+ with class +klass+).
 *
 * The code range and encoding flags are carried over; FL_FREEZE is
 * never set on +dup+.  Returns +dup+.
 */
static inline VALUE
str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
{
    const VALUE flag_mask =
#if !USE_RVARGC
        RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
#endif
        ENC_CODERANGE_MASK | ENCODING_MASK |
        FL_FREEZE
        ;
    VALUE flags = FL_TEST_RAW(str, flag_mask);
    int encidx = 0;
    if (STR_EMBED_P(str)) {
        long len = RSTRING_EMBED_LEN(str);

        assert(str_embed_capa(dup) >= len + 1);
        STR_SET_EMBED_LEN(dup, len);
        MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
    }
    else {
        VALUE root = str;
        if (FL_TEST_RAW(str, STR_SHARED)) {
            root = RSTRING(str)->as.heap.aux.shared;
        }
        else if (UNLIKELY(!(flags & FL_FREEZE))) {
            /* unfrozen, unshared: obtain a frozen string to act as root
             * and re-read the flags from it */
            root = str = str_new_frozen(klass, str);
            flags = FL_TEST_RAW(str, flag_mask);
        }
        assert(!STR_SHARED_P(root));
        assert(RB_OBJ_FROZEN_RAW(root));
#if USE_RVARGC
        if (1) {
#else
        if (STR_EMBED_P(root)) {
            MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
                   char, RSTRING_EMBED_LEN_MAX + 1);
        }
        else {
#endif
            RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
            RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
            RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
            flags |= RSTRING_NOEMBED | STR_SHARED;
        }
    }

    /* The inline encoding field holds ENCODING_INLINE_MAX: fetch the
     * index through rb_enc_get_index() and associate it explicitly
     * after the flags are set, instead of copying the inline bits. */
    if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
        encidx = rb_enc_get_index(str);
        flags &= ~ENCODING_MASK;
    }
    FL_SET_RAW(dup, flags & ~FL_FREEZE);
    if (encidx) rb_enc_associate_index(dup, encidx);
    return dup;
}
1763
1764 static inline VALUE
1765 ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1766 {
1767     VALUE dup;
1768     if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1769         dup = ec_str_alloc_heap(ec, klass);
1770     }
1771     else {
1772         dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1773     }
1774
1775     return str_duplicate_setup(klass, str, dup);
1776 }
1777
1778 static inline VALUE
1779 str_duplicate(VALUE klass, VALUE str)
1780 {
1781     VALUE dup;
1782     if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1783         dup = str_alloc_heap(klass);
1784     }
1785     else {
1786        dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1787     }
1788
1789     return str_duplicate_setup(klass, str, dup);
1790 }
1791
1792 VALUE
1793 rb_str_dup(VALUE str)
1794 {
1795     return str_duplicate(rb_obj_class(str), str);
1796 }
1797
1798 VALUE
1799 rb_str_resurrect(VALUE str)
1800 {
1801     RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1802     return str_duplicate(rb_cString, str);
1803 }
1804
1805 VALUE
1806 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1807 {
1808     RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1809     return ec_str_duplicate(ec, rb_cString, str);
1810 }
1811
/*
 *  call-seq:
 *    String.new(string = '') -> new_string
 *    String.new(string = '', encoding: encoding) -> new_string
 *    String.new(string = '', capacity: size) -> new_string
 *
 *  Returns a new \String that is a copy of +string+.
 *
 *  With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
 *    s = String.new
 *    s # => ""
 *    s.encoding # => #<Encoding:ASCII-8BIT>
 *
 *  With the single \String argument +string+, returns a copy of +string+
 *  with the same encoding as +string+:
 *    s = String.new("Que veut dire \u{e7}a?")
 *    s # => "Que veut dire \u{e7}a?"
 *    s.encoding # => #<Encoding:UTF-8>
 *
 *  Literal strings like <tt>""</tt> or here-documents always use
 *  {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
 *
 *  With keyword +encoding+, returns a copy of +string+
 *  with the specified encoding:
 *    s = String.new(encoding: 'ASCII')
 *    s.encoding # => #<Encoding:US-ASCII>
 *    s = String.new('foo', encoding: 'ASCII')
 *    s.encoding # => #<Encoding:US-ASCII>
 *
 *  Note that these are equivalent:
 *    s0 = String.new('foo', encoding: 'ASCII')
 *    s1 = 'foo'.force_encoding('ASCII')
 *    s0.encoding == s1.encoding # => true
 *
 *  With keyword +capacity+, returns a copy of +string+;
 *  the given +capacity+ may set the size of the internal buffer,
 *  which may affect performance:
 *    String.new(capacity: 1) # => ""
 *    String.new(capacity: 4096) # => ""
 *
 *  The +string+, +encoding+, and +capacity+ arguments may all be used together:
 *
 *    String.new('hello', encoding: 'UTF-8', capacity: 25)
 *
 */

static VALUE
rb_str_init(int argc, VALUE *argv, VALUE str)
{
    static ID keyword_ids[2];
    VALUE orig, opt, venc, vcapa;
    VALUE kwargs[2];
    rb_encoding *enc = 0;
    int n;

    /* intern the keyword names on first use */
    if (!keyword_ids[0]) {
        keyword_ids[0] = rb_id_encoding();
        CONST_ID(keyword_ids[1], "capacity");
    }

    /* one optional positional argument plus an optional keyword hash */
    n = rb_scan_args(argc, argv, "01:", &orig, &opt);
    if (!NIL_P(opt)) {
        rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
        venc = kwargs[0];
        vcapa = kwargs[1];
        if (venc != Qundef && !NIL_P(venc)) {
            enc = rb_to_encoding(venc);
        }
        if (vcapa != Qundef && !NIL_P(vcapa)) {
            long capa = NUM2LONG(vcapa);
            long len = 0;
            int termlen = enc ? rb_enc_mbminlen(enc) : 1;

            /* the capacity is at least STR_BUF_MIN_SIZE and at least
             * the length of the source string */
            if (capa < STR_BUF_MIN_SIZE) {
                capa = STR_BUF_MIN_SIZE;
            }
            if (n == 1) {
                StringValue(orig);
                len = RSTRING_LEN(orig);
                if (capa < len) {
                    capa = len;
                }
                /* initializing from itself: skip the copy below */
                if (orig == str) n = 0;
            }
            str_modifiable(str);
            if (STR_EMBED_P(str)) { /* make noembed always */
                char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
#if USE_RVARGC
                assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
                memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
#else
                memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
#endif
                RSTRING(str)->as.heap.ptr = new_ptr;
            }
            else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
                /* the current buffer is not owned by str: copy into a
                 * new private buffer and clear both flags */
                const size_t size = (size_t)capa + termlen;
                const char *const old_ptr = RSTRING_PTR(str);
                const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
                char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
                memcpy(new_ptr, old_ptr, osize < size ? osize : size);
                FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
                RSTRING(str)->as.heap.ptr = new_ptr;
            }
            else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
                /* owned heap buffer of a different size: reallocate */
                SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
                        (size_t)capa + termlen, STR_HEAP_SIZE(str));
            }
            RSTRING(str)->as.heap.len = len;
            TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
            if (n == 1) {
                memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
                rb_enc_cr_str_exact_copy(str, orig);
            }
            FL_SET(str, STR_NOEMBED);
            RSTRING(str)->as.heap.aux.capa = capa;
        }
        else if (n == 1) {
            rb_str_replace(str, orig);
        }
        if (enc) {
            /* an explicit encoding: keyword overrides the copied one,
             * and the code range is cleared */
            rb_enc_associate(str, enc);
            ENC_CODERANGE_CLEAR(str);
        }
    }
    else if (n == 1) {
        rb_str_replace(str, orig);
    }
    return str;
}
1942
#ifdef NONASCII_MASK
/* True when byte +c+ is not of the form 10xxxxxx, i.e. it is not a
 * UTF-8 continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 *
 * +s+ points to the word to examine; the return value is the number of
 * leading bytes found in that word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t d = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    d = (d>>6) | (~d>>7);
    d &= NONASCII_MASK >> 7;

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(d);
#else
    /* fold the per-byte flags into the lowest byte by repeated
     * shift-and-add, then keep its low four bits */
    d += (d>>8);
    d += (d>>16);
# if SIZEOF_VOIDP == 8
    d += (d>>32);
# endif
    return (d&0xF);
#endif
}
#endif
1982
1983 static inline long
1984 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1985 {
1986     long c;
1987     const char *q;
1988
1989     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1990         long diff = (long)(e - p);
1991         return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1992     }
1993 #ifdef NONASCII_MASK
1994     else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1995         uintptr_t len = 0;
1996         if ((int)sizeof(uintptr_t) * 2 < e - p) {
1997             const uintptr_t *s, *t;
1998             const uintptr_t lowbits = sizeof(uintptr_t) - 1;
1999             s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2000             t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2001             while (p < (const char *)s) {
2002                 if (is_utf8_lead_byte(*p)) len++;
2003                 p++;
2004             }
2005             while (s < t) {
2006                 len += count_utf8_lead_bytes_with_word(s);
2007                 s++;
2008             }
2009             p = (const char *)s;
2010         }
2011         while (p < e) {
2012             if (is_utf8_lead_byte(*p)) len++;
2013             p++;
2014         }
2015         return (long)len;
2016     }
2017 #endif
2018     else if (rb_enc_asciicompat(enc)) {
2019         c = 0;
2020         if (ENC_CODERANGE_CLEAN_P(cr)) {
2021             while (p < e) {
2022                 if (ISASCII(*p)) {
2023                     q = search_nonascii(p, e);
2024                     if (!q)
2025                         return c + (e - p);
2026                     c += q - p;
2027                     p = q;
2028                 }
2029                 p += rb_enc_fast_mbclen(p, e, enc);
2030                 c++;
2031             }
2032         }
2033         else {
2034             while (p < e) {
2035                 if (ISASCII(*p)) {
2036                     q = search_nonascii(p, e);
2037                     if (!q)
2038                         return c + (e - p);
2039                     c += q - p;
2040                     p = q;
2041                 }
2042                 p += rb_enc_mbclen(p, e, enc);
2043                 c++;
2044             }
2045         }
2046         return c;
2047     }
2048
2049     for (c=0; p<e; c++) {
2050         p += rb_enc_mbclen(p, e, enc);
2051     }
2052     return c;
2053 }
2054
2055 long
2056 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2057 {
2058     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2059 }
2060
2061 /* To get strlen with cr
2062  * Note that given cr is not used.
2063  */
2064 long
2065 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2066 {
2067     long c;
2068     const char *q;
2069     int ret;
2070
2071     *cr = 0;
2072     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2073         long diff = (long)(e - p);
2074         return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2075     }
2076     else if (rb_enc_asciicompat(enc)) {
2077         c = 0;
2078         while (p < e) {
2079             if (ISASCII(*p)) {
2080                 q = search_nonascii(p, e);
2081                 if (!q) {
2082                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
2083                     return c + (e - p);
2084                 }
2085                 c += q - p;
2086                 p = q;
2087             }
2088             ret = rb_enc_precise_mbclen(p, e, enc);
2089             if (MBCLEN_CHARFOUND_P(ret)) {
2090                 *cr |= ENC_CODERANGE_VALID;
2091                 p += MBCLEN_CHARFOUND_LEN(ret);
2092             }
2093             else {
2094                 *cr = ENC_CODERANGE_BROKEN;
2095                 p++;
2096             }
2097             c++;
2098         }
2099         if (!*cr) *cr = ENC_CODERANGE_7BIT;
2100         return c;
2101     }
2102
2103     for (c=0; p<e; c++) {
2104         ret = rb_enc_precise_mbclen(p, e, enc);
2105         if (MBCLEN_CHARFOUND_P(ret)) {
2106             *cr |= ENC_CODERANGE_VALID;
2107             p += MBCLEN_CHARFOUND_LEN(ret);
2108         }
2109         else {
2110             *cr = ENC_CODERANGE_BROKEN;
2111             if (p + rb_enc_mbminlen(enc) <= e)
2112                 p += rb_enc_mbminlen(enc);
2113             else
2114                 p = e;
2115         }
2116     }
2117     if (!*cr) *cr = ENC_CODERANGE_7BIT;
2118     return c;
2119 }
2120
2121 /* enc must be str's enc or rb_enc_check(str, str2) */
2122 static long
2123 str_strlen(VALUE str, rb_encoding *enc)
2124 {
2125     const char *p, *e;
2126     int cr;
2127
2128     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2129     if (!enc) enc = STR_ENC_GET(str);
2130     p = RSTRING_PTR(str);
2131     e = RSTRING_END(str);
2132     cr = ENC_CODERANGE(str);
2133
2134     if (cr == ENC_CODERANGE_UNKNOWN) {
2135         long n = rb_enc_strlen_cr(p, e, enc, &cr);
2136         if (cr) ENC_CODERANGE_SET(str, cr);
2137         return n;
2138     }
2139     else {
2140         return enc_strlen(p, e, enc, cr);
2141     }
2142 }
2143
2144 long
2145 rb_str_strlen(VALUE str)
2146 {
2147     return str_strlen(str, NULL);
2148 }
2149
2150 /*
2151  *  call-seq:
2152  *    length -> integer
2153  *
2154  *  Returns the count of characters (not bytes) in +self+:
2155  *
2156  *    "\x80\u3042".length # => 2
2157  *    "hello".length # => 5
2158  *
2159  *  String#size is an alias for String#length.
2160  *
2161  *  Related: String#bytesize.
2162  */
2163
2164 VALUE
2165 rb_str_length(VALUE str)
2166 {
2167     return LONG2NUM(str_strlen(str, NULL));
2168 }
2169
2170 /*
2171  *  call-seq:
2172  *    bytesize -> integer
2173  *
2174  *  Returns the count of bytes in +self+:
2175  *
2176  *    "\x80\u3042".bytesize # => 4
2177  *    "hello".bytesize # => 5
2178  *
2179  *  Related: String#length.
2180  */
2181
2182 static VALUE
2183 rb_str_bytesize(VALUE str)
2184 {
2185     return LONG2NUM(RSTRING_LEN(str));
2186 }
2187
2188 /*
2189  *  call-seq:
2190  *    empty? -> true or false
2191  *
2192  *  Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2193  *
2194  *    "hello".empty? # => false
2195  *    " ".empty? # => false
2196  *    "".empty? # => true
2197  *
2198  */
2199
2200 static VALUE
2201 rb_str_empty(VALUE str)
2202 {
2203     return RBOOL(RSTRING_LEN(str) == 0);
2204 }
2205
2206 /*
2207  *  call-seq:
2208  *    string + other_string -> new_string
2209  *
2210  *  Returns a new \String containing +other_string+ concatenated to +self+:
2211  *
2212  *    "Hello from " + self.to_s # => "Hello from main"
2213  *
2214  */
2215
2216 VALUE
2217 rb_str_plus(VALUE str1, VALUE str2)
2218 {
2219     VALUE str3;
2220     rb_encoding *enc;
2221     char *ptr1, *ptr2, *ptr3;
2222     long len1, len2;
2223     int termlen;
2224
2225     StringValue(str2);
2226     enc = rb_enc_check_str(str1, str2);
2227     RSTRING_GETMEM(str1, ptr1, len1);
2228     RSTRING_GETMEM(str2, ptr2, len2);
2229     termlen = rb_enc_mbminlen(enc);
2230     if (len1 > LONG_MAX - len2) {
2231         rb_raise(rb_eArgError, "string size too big");
2232     }
2233     str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2234     ptr3 = RSTRING_PTR(str3);
2235     memcpy(ptr3, ptr1, len1);
2236     memcpy(ptr3+len1, ptr2, len2);
2237     TERM_FILL(&ptr3[len1+len2], termlen);
2238
2239     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2240                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
2241     RB_GC_GUARD(str1);
2242     RB_GC_GUARD(str2);
2243     return str3;
2244 }
2245
2246 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
2247 MJIT_FUNC_EXPORTED VALUE
2248 rb_str_opt_plus(VALUE str1, VALUE str2)
2249 {
2250     assert(RBASIC_CLASS(str1) == rb_cString);
2251     assert(RBASIC_CLASS(str2) == rb_cString);
2252     long len1, len2;
2253     MAYBE_UNUSED(char) *ptr1, *ptr2;
2254     RSTRING_GETMEM(str1, ptr1, len1);
2255     RSTRING_GETMEM(str2, ptr2, len2);
2256     int enc1 = rb_enc_get_index(str1);
2257     int enc2 = rb_enc_get_index(str2);
2258
2259     if (enc1 < 0) {
2260         return Qundef;
2261     }
2262     else if (enc2 < 0) {
2263         return Qundef;
2264     }
2265     else if (enc1 != enc2) {
2266         return Qundef;
2267     }
2268     else if (len1 > LONG_MAX - len2) {
2269         return Qundef;
2270     }
2271     else {
2272         return rb_str_plus(str1, str2);
2273     }
2274
2275 }
2276
2277 /*
2278  *  call-seq:
2279  *    string * integer -> new_string
2280  *
2281  *  Returns a new \String containing +integer+ copies of +self+:
2282  *
2283  *    "Ho! " * 3 # => "Ho! Ho! Ho! "
2284  *    "Ho! " * 0 # => ""
2285  *
2286  */
2287
2288 VALUE
2289 rb_str_times(VALUE str, VALUE times)
2290 {
2291     VALUE str2;
2292     long n, len;
2293     char *ptr2;
2294     int termlen;
2295
2296     if (times == INT2FIX(1)) {
2297         return str_duplicate(rb_cString, str);
2298     }
2299     if (times == INT2FIX(0)) {
2300         str2 = str_alloc_embed(rb_cString, 0);
2301         rb_enc_copy(str2, str);
2302         return str2;
2303     }
2304     len = NUM2LONG(times);
2305     if (len < 0) {
2306         rb_raise(rb_eArgError, "negative argument");
2307     }
2308     if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2309         if (STR_EMBEDDABLE_P(len, 1)) {
2310             str2 = str_alloc_embed(rb_cString, len + 1);
2311             memset(RSTRING_PTR(str2), 0, len + 1);
2312         }
2313         else {
2314             str2 = str_alloc_heap(rb_cString);
2315             RSTRING(str2)->as.heap.aux.capa = len;
2316             RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2317             STR_SET_NOEMBED(str2);
2318         }
2319         STR_SET_LEN(str2, len);
2320         rb_enc_copy(str2, str);
2321         return str2;
2322     }
2323     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
2324         rb_raise(rb_eArgError, "argument too big");
2325     }
2326
2327     len *= RSTRING_LEN(str);
2328     termlen = TERM_LEN(str);
2329     str2 = str_new0(rb_cString, 0, len, termlen);
2330     ptr2 = RSTRING_PTR(str2);
2331     if (len) {
2332         n = RSTRING_LEN(str);
2333         memcpy(ptr2, RSTRING_PTR(str), n);
2334         while (n <= len/2) {
2335             memcpy(ptr2 + n, ptr2, n);
2336             n *= 2;
2337         }
2338         memcpy(ptr2 + n, ptr2, len-n);
2339     }
2340     STR_SET_LEN(str2, len);
2341     TERM_FILL(&ptr2[len], termlen);
2342     rb_enc_cr_str_copy_for_substr(str2, str);
2343
2344     return str2;
2345 }
2346
2347 /*
2348  *  call-seq:
2349  *    string % object -> new_string
2350  *
2351  *  Returns the result of formatting +object+ into the format specification +self+
2352  *  (see Kernel#sprintf for formatting details):
2353  *
2354  *    "%05d" % 123 # => "00123"
2355  *
2356  *  If +self+ contains multiple substitutions, +object+ must be
2357  *  an \Array or \Hash containing the values to be substituted:
2358  *
2359  *    "%-5s: %016x" % [ "ID", self.object_id ] # => "ID   : 00002b054ec93168"
2360  *    "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2361  *    "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2362  *
2363  */
2364
2365 static VALUE
2366 rb_str_format_m(VALUE str, VALUE arg)
2367 {
2368     VALUE tmp = rb_check_array_type(arg);
2369
2370     if (!NIL_P(tmp)) {
2371         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2372     }
2373     return rb_str_format(1, &arg, str);
2374 }
2375
2376 static inline void
2377 rb_check_lockedtmp(VALUE str)
2378 {
2379     if (FL_TEST(str, STR_TMPLOCK)) {
2380         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2381     }
2382 }
2383
2384 static inline void
2385 str_modifiable(VALUE str)
2386 {
2387     rb_check_lockedtmp(str);
2388     rb_check_frozen(str);
2389 }
2390
2391 static inline int
2392 str_dependent_p(VALUE str)
2393 {
2394     if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2395         return 0;
2396     }
2397     else {
2398         return 1;
2399     }
2400 }
2401
2402 static inline int
2403 str_independent(VALUE str)
2404 {
2405     str_modifiable(str);
2406     return !str_dependent_p(str);
2407 }
2408
2409 static void
2410 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2411 {
2412     char *ptr;
2413     char *oldptr;
2414     long capa = len + expand;
2415
2416     if (len > capa) len = capa;
2417
2418     if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2419         ptr = RSTRING(str)->as.heap.ptr;
2420         STR_SET_EMBED(str);
2421         memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2422         TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2423         STR_SET_EMBED_LEN(str, len);
2424         return;
2425     }
2426
2427     ptr = ALLOC_N(char, (size_t)capa + termlen);
2428     oldptr = RSTRING_PTR(str);
2429     if (oldptr) {
2430         memcpy(ptr, oldptr, len);
2431     }
2432     if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2433         xfree(oldptr);
2434     }
2435     STR_SET_NOEMBED(str);
2436     FL_UNSET(str, STR_SHARED|STR_NOFREE);
2437     TERM_FILL(ptr + len, termlen);
2438     RSTRING(str)->as.heap.ptr = ptr;
2439     RSTRING(str)->as.heap.len = len;
2440     RSTRING(str)->as.heap.aux.capa = capa;
2441 }
2442
2443 void
2444 rb_str_modify(VALUE str)
2445 {
2446     if (!str_independent(str))
2447         str_make_independent(str);
2448     ENC_CODERANGE_CLEAR(str);
2449 }
2450
2451 void
2452 rb_str_modify_expand(VALUE str, long expand)
2453 {
2454     int termlen = TERM_LEN(str);
2455     long len = RSTRING_LEN(str);
2456
2457     if (expand < 0) {
2458         rb_raise(rb_eArgError, "negative expanding string size");
2459     }
2460     if (expand >= LONG_MAX - len) {
2461         rb_raise(rb_eArgError, "string size too big");
2462     }
2463
2464     if (!str_independent(str)) {
2465         str_make_independent_expand(str, len, expand, termlen);
2466     }
2467     else if (expand > 0) {
2468         RESIZE_CAPA_TERM(str, len + expand, termlen);
2469     }
2470     ENC_CODERANGE_CLEAR(str);
2471 }
2472
2473 /* As rb_str_modify(), but don't clear coderange */
2474 static void
2475 str_modify_keep_cr(VALUE str)
2476 {
2477     if (!str_independent(str))
2478         str_make_independent(str);
2479     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2480         /* Force re-scan later */
2481         ENC_CODERANGE_CLEAR(str);
2482 }
2483
2484 static inline void
2485 str_discard(VALUE str)
2486 {
2487     str_modifiable(str);
2488     if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2489         ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2490         RSTRING(str)->as.heap.ptr = 0;
2491         RSTRING(str)->as.heap.len = 0;
2492     }
2493 }
2494
2495 void
2496 rb_must_asciicompat(VALUE str)
2497 {
2498     rb_encoding *enc = rb_enc_get(str);
2499     if (!rb_enc_asciicompat(enc)) {
2500         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2501     }
2502 }
2503
2504 VALUE
2505 rb_string_value(volatile VALUE *ptr)
2506 {
2507     VALUE s = *ptr;
2508     if (!RB_TYPE_P(s, T_STRING)) {
2509         s = rb_str_to_str(s);
2510         *ptr = s;
2511     }
2512     return s;
2513 }
2514
2515 char *
2516 rb_string_value_ptr(volatile VALUE *ptr)
2517 {
2518     VALUE str = rb_string_value(ptr);
2519     return RSTRING_PTR(str);
2520 }
2521
/*
 * Return 1 when the first n bytes at s are all zero, 0 otherwise.
 * A non-positive n inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2530
2531 static const char *
2532 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2533 {
2534     const char *e = s + len;
2535
2536     for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2537         if (zero_filled(s, minlen)) return s;
2538     }
2539     return 0;
2540 }
2541
2542 static char *
2543 str_fill_term(VALUE str, char *s, long len, int termlen)
2544 {
2545     /* This function assumes that (capa + termlen) bytes of memory
2546      * is allocated, like many other functions in this file.
2547      */
2548     if (str_dependent_p(str)) {
2549         if (!zero_filled(s + len, termlen))
2550             str_make_independent_expand(str, len, 0L, termlen);
2551     }
2552     else {
2553         TERM_FILL(s + len, termlen);
2554         return s;
2555     }
2556     return RSTRING_PTR(str);
2557 }
2558
2559 void
2560 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2561 {
2562     long capa = str_capacity(str, oldtermlen) + oldtermlen;
2563     long len = RSTRING_LEN(str);
2564
2565     assert(capa >= len);
2566     if (capa - len < termlen) {
2567         rb_check_lockedtmp(str);
2568         str_make_independent_expand(str, len, 0L, termlen);
2569     }
2570     else if (str_dependent_p(str)) {
2571         if (termlen > oldtermlen)
2572             str_make_independent_expand(str, len, 0L, termlen);
2573     }
2574     else {
2575         if (!STR_EMBED_P(str)) {
2576             /* modify capa instead of realloc */
2577             assert(!FL_TEST((str), STR_SHARED));
2578             RSTRING(str)->as.heap.aux.capa = capa - termlen;
2579         }
2580         if (termlen > oldtermlen) {
2581             TERM_FILL(RSTRING_PTR(str) + len, termlen);
2582         }
2583     }
2584
2585     return;
2586 }
2587
2588 static char *
2589 str_null_check(VALUE str, int *w)
2590 {
2591     char *s = RSTRING_PTR(str);
2592     long len = RSTRING_LEN(str);
2593     rb_encoding *enc = rb_enc_get(str);
2594     const int minlen = rb_enc_mbminlen(enc);
2595
2596     if (minlen > 1) {
2597         *w = 1;
2598         if (str_null_char(s, len, minlen, enc)) {
2599             return NULL;
2600         }
2601         return str_fill_term(str, s, len, minlen);
2602     }
2603     *w = 0;
2604     if (!s || memchr(s, 0, len)) {
2605         return NULL;
2606     }
2607     if (s[len]) {
2608         s = str_fill_term(str, s, len, minlen);
2609     }
2610     return s;
2611 }
2612
2613 char *
2614 rb_str_to_cstr(VALUE str)
2615 {
2616     int w;
2617     return str_null_check(str, &w);
2618 }
2619
2620 char *
2621 rb_string_value_cstr(volatile VALUE *ptr)
2622 {
2623     VALUE str = rb_string_value(ptr);
2624     int w;
2625     char *s = str_null_check(str, &w);
2626     if (!s) {
2627         if (w) {
2628             rb_raise(rb_eArgError, "string contains null char");
2629         }
2630         rb_raise(rb_eArgError, "string contains null byte");
2631     }
2632     return s;
2633 }
2634
2635 char *
2636 rb_str_fill_terminator(VALUE str, const int newminlen)
2637 {
2638     char *s = RSTRING_PTR(str);
2639     long len = RSTRING_LEN(str);
2640     return str_fill_term(str, s, len, newminlen);
2641 }
2642
2643 VALUE
2644 rb_check_string_type(VALUE str)
2645 {
2646     str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2647     return str;
2648 }
2649
2650 /*
2651  *  call-seq:
2652  *    String.try_convert(object) -> object, new_string, or nil
2653  *
2654  *  If +object+ is a \String object, returns +object+.
2655  *
2656  *  Otherwise if +object+ responds to <tt>:to_str</tt>,
2657  *  calls <tt>object.to_str</tt> and returns the result.
2658  *
2659  *  Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2660  *
2661  *  Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2662  */
2663 static VALUE
2664 rb_str_s_try_convert(VALUE dummy, VALUE str)
2665 {
2666     return rb_check_string_type(str);
2667 }
2668
2669 static char*
2670 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2671 {
2672     long nth = *nthp;
2673     if (rb_enc_mbmaxlen(enc) == 1) {
2674         p += nth;
2675     }
2676     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2677         p += nth * rb_enc_mbmaxlen(enc);
2678     }
2679     else if (rb_enc_asciicompat(enc)) {
2680         const char *p2, *e2;
2681         int n;
2682
2683         while (p < e && 0 < nth) {
2684             e2 = p + nth;
2685             if (e < e2) {
2686                 *nthp = nth;
2687                 return (char *)e;
2688             }
2689             if (ISASCII(*p)) {
2690                 p2 = search_nonascii(p, e2);
2691                 if (!p2) {
2692                     nth -= e2 - p;
2693                     *nthp = nth;
2694                     return (char *)e2;
2695                 }
2696                 nth -= p2 - p;
2697                 p = p2;
2698             }
2699             n = rb_enc_mbclen(p, e, enc);
2700             p += n;
2701             nth--;
2702         }
2703         *nthp = nth;
2704         if (nth != 0) {
2705             return (char *)e;
2706         }
2707         return (char *)p;
2708     }
2709     else {
2710         while (p < e && nth--) {
2711             p += rb_enc_mbclen(p, e, enc);
2712         }
2713     }
2714     if (p > e) p = e;
2715     *nthp = nth;
2716     return (char*)p;
2717 }
2718
2719 char*
2720 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2721 {
2722     return str_nth_len(p, e, &nth, enc);
2723 }
2724
2725 static char*
2726 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2727 {
2728     if (singlebyte)
2729         p += nth;
2730     else {
2731         p = str_nth_len(p, e, &nth, enc);
2732     }
2733     if (!p) return 0;
2734     if (p > e) p = e;
2735     return (char *)p;
2736 }
2737
2738 /* char offset to byte offset */
2739 static long
2740 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2741 {
2742     const char *pp = str_nth(p, e, nth, enc, singlebyte);
2743     if (!pp) return e - p;
2744     return pp - p;
2745 }
2746
2747 long
2748 rb_str_offset(VALUE str, long pos)
2749 {
2750     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2751                       STR_ENC_GET(str), single_byte_optimizable(str));
2752 }
2753
2754 #ifdef NONASCII_MASK
2755 static char *
2756 str_utf8_nth(const char *p, const char *e, long *nthp)
2757 {
2758     long nth = *nthp;
2759     if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2760         const uintptr_t *s, *t;
2761         const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2762         s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2763         t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2764         while (p < (const char *)s) {
2765             if (is_utf8_lead_byte(*p)) nth--;
2766             p++;
2767         }
2768         do {
2769             nth -= count_utf8_lead_bytes_with_word(s);
2770             s++;
2771         } while (s < t && (int)SIZEOF_VOIDP <= nth);
2772         p = (char *)s;
2773     }
2774     while (p < e) {
2775         if (is_utf8_lead_byte(*p)) {
2776             if (nth == 0) break;
2777             nth--;
2778         }
2779         p++;
2780     }
2781     *nthp = nth;
2782     return (char *)p;
2783 }
2784
2785 static long
2786 str_utf8_offset(const char *p, const char *e, long nth)
2787 {
2788     const char *pp = str_utf8_nth(p, e, &nth);
2789     return pp - p;
2790 }
2791 #endif
2792
2793 /* byte offset to char offset */
2794 long
2795 rb_str_sublen(VALUE str, long pos)
2796 {
2797     if (single_byte_optimizable(str) || pos < 0)
2798         return pos;
2799     else {
2800         char *p = RSTRING_PTR(str);
2801         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2802     }
2803 }
2804
2805 VALUE
2806 rb_str_subseq(VALUE str, long beg, long len)
2807 {
2808     VALUE str2;
2809
2810     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2811         SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2812         long olen;
2813         str2 = rb_str_new_shared(rb_str_new_frozen_String(str));
2814         RSTRING(str2)->as.heap.ptr += beg;
2815         olen = RSTRING(str2)->as.heap.len;
2816         if (olen > len) RSTRING(str2)->as.heap.len = len;
2817     }
2818     else {
2819         str2 = rb_str_new(RSTRING_PTR(str)+beg, len);
2820         RB_GC_GUARD(str);
2821     }
2822
2823     rb_enc_cr_str_copy_for_substr(str2, str);
2824
2825     return str2;
2826 }
2827
2828 char *
2829 rb_str_subpos(VALUE str, long beg, long *lenp)
2830 {
         /*
          * Resolve the substring of str that starts at character index beg
          * (a negative beg counts back from the end) and spans at most *lenp
          * characters.  Returns a pointer to its first byte and stores its
          * length in bytes to *lenp; returns 0, leaving *lenp untouched, when
          * the requested range is invalid.
          */
2831     long len = *lenp;
2832     long slen = -1L;
2833     long blen = RSTRING_LEN(str);
2834     rb_encoding *enc = STR_ENC_GET(str);
2835     char *p, *s = RSTRING_PTR(str), *e = s + blen;
2836
2837     if (len < 0) return 0;
2838     if (!blen) {
2839         len = 0;
2840     }
         /* Here beg and len are used directly as byte offsets into the buffer. */
2841     if (single_byte_optimizable(str)) {
2842         if (beg > blen) return 0;
2843         if (beg < 0) {
2844             beg += blen;
2845             if (beg < 0) return 0;
2846         }
2847         if (len > blen - beg)
2848             len = blen - beg;
2849         if (len < 0) return 0;
2850         p = s + beg;
2851         goto end;
2852     }
2853     if (beg < 0) {
             /* Counting from the end, at most -beg characters can follow. */
2854         if (len > -beg) len = -beg;
             /*
              * If, by this byte-size estimate, the start lies near the end of
              * the string, step backwards character by character rather than
              * counting the characters of the whole string.
              */
2855         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2856             beg = -beg;
                 /* Move e back (beg - len) characters: it becomes the end of the result. */
2857             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2858             p = e;
2859             if (!p) return 0;
                 /* Move p back len more characters: it becomes the start of the result. */
2860             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2861             if (!p) return 0;
2862             len = e - p;
2863             goto end;
2864         }
2865         else {
                 /* Turn the negative index into a non-negative one via the character count. */
2866             slen = str_strlen(str, enc);
2867             beg += slen;
2868             if (beg < 0) return 0;
                 /*
                  * NOTE(review): beg is a character index but is added here as a
                  * byte offset; this p is only returned when len == 0 (every later
                  * branch reassigns p) -- confirm callers do not rely on its exact
                  * position in that case.
                  */
2869             p = s + beg;
2870             if (len == 0) goto end;
2871         }
2872     }
2873     else if (beg > 0 && beg > RSTRING_LEN(str)) {
2874         return 0;
2875     }
2876     if (len == 0) {
2877         if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2878         p = s + beg;
2879     }
2880 #ifdef NONASCII_MASK
2881     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2882         enc == rb_utf8_encoding()) {
             /* A count still left in beg means the string ended before the start. */
2883         p = str_utf8_nth(s, e, &beg);
2884         if (beg > 0) return 0;
2885         len = str_utf8_offset(p, e, len);
2886     }
2887 #endif
2888     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
             /* Fixed-width encoding: scale both values by the character size. */
2889         int char_sz = rb_enc_mbmaxlen(enc);
2890
2891         p = s + beg * char_sz;
2892         if (p > e) {
2893             return 0;
2894         }
2895         else if (len * char_sz > e - p)
2896             len = e - p;
2897         else
2898             len *= char_sz;
2899     }
2900     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
             /* The walk stopped at the end: valid only if no characters were left over. */
2901         if (beg > 0) return 0;
2902         len = 0;
2903     }
2904     else {
             /* Convert the character count len into a byte length measured from p. */
2905         len = str_offset(p, e, len, enc, 0);
2906     }
2907   end:
2908     *lenp = len;
2909     RB_GC_GUARD(str);
2910     return p;
2911 }
2912
2913 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2914
2915 VALUE
2916 rb_str_substr(VALUE str, long beg, long len)
2917 {
2918     return str_substr(str, beg, len, TRUE);
2919 }
2920
2921 static VALUE
2922 str_substr(VALUE str, long beg, long len, int empty)
2923 {
2924     VALUE str2;
2925     char *p = rb_str_subpos(str, beg, &len);
2926
2927     if (!p) return Qnil;
2928     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2929         SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2930         long ofs = p - RSTRING_PTR(str);
2931         str2 = rb_str_new_frozen(str);
2932         str2 = str_new_shared(rb_cString, str2);
2933         RSTRING(str2)->as.heap.ptr += ofs;
2934         RSTRING(str2)->as.heap.len = len;
2935         ENC_CODERANGE_CLEAR(str2);
2936     }
2937     else {
2938         if (!len && !empty) return Qnil;
2939         str2 = rb_str_new(p, len);
2940         RB_GC_GUARD(str);
2941     }
2942     rb_enc_cr_str_copy_for_substr(str2, str);
2943
2944     return str2;
2945 }
2946
2947 VALUE
2948 rb_str_freeze(VALUE str)
2949 {
2950     if (OBJ_FROZEN(str)) return str;
2951     rb_str_resize(str, RSTRING_LEN(str));
2952     return rb_obj_freeze(str);
2953 }
2954
2955
2956 /*
2957  * call-seq:
2958  *   +string -> new_string or self
2959  *
2960  * Returns +self+ if +self+ is not frozen.
2961  *
2962  * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2963  */
2964 static VALUE
2965 str_uplus(VALUE str)
2966 {
2967     if (OBJ_FROZEN(str)) {
2968         return rb_str_dup(str);
2969     }
2970     else {
2971         return str;
2972     }
2973 }
2974
2975 /*
2976  * call-seq:
2977  *   -string -> frozen_string
2978  *
2979  * Returns a frozen, possibly pre-existing copy of the string.
2980  *
2981  * The returned \String will be deduplicated as long as it does not have
2982  * any instance variables set on it.
2983  */
2984 static VALUE
2985 str_uminus(VALUE str)
2986 {
2987     if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2988         str = rb_str_dup(str);
2989     }
2990     return rb_fstring(str);
2991 }
2992
2993 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
2994 #define rb_str_dup_frozen rb_str_new_frozen
2995
2996 VALUE
2997 rb_str_locktmp(VALUE str)
2998 {
2999     if (FL_TEST(str, STR_TMPLOCK)) {
3000         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3001     }
3002     FL_SET(str, STR_TMPLOCK);
3003     return str;
3004 }
3005
3006 VALUE
3007 rb_str_unlocktmp(VALUE str)
3008 {
3009     if (!FL_TEST(str, STR_TMPLOCK)) {
3010         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3011     }
3012     FL_UNSET(str, STR_TMPLOCK);
3013     return str;
3014 }
3015
3016 RUBY_FUNC_EXPORTED VALUE
3017 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3018 {
3019     rb_str_locktmp(str);
3020     return rb_ensure(func, arg, rb_str_unlocktmp, str);
3021 }
3022
3023 void
3024 rb_str_set_len(VALUE str, long len)
3025 {
3026     long capa;
3027     const int termlen = TERM_LEN(str);
3028
3029     str_modifiable(str);
3030     if (STR_SHARED_P(str)) {
3031         rb_raise(rb_eRuntimeError, "can't set length of shared string");
3032     }
3033     if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3034         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3035     }
3036     STR_SET_LEN(str, len);
3037     TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3038 }
3039
/*
 * Changes the byte length of +str+ to +len+ and returns +str+.
 *
 * Raises ArgumentError when +len+ is negative.  The cached code range is
 * cleared before anything else is decided, so it is reset even when the
 * length turns out to be unchanged.  On every path that changes the length
 * a terminator of TERM_LEN(str) bytes is written right after the new end.
 */
VALUE
rb_str_resize(VALUE str, long len)
{
    long slen;
    int independent;

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }

    /* NOTE(review): presumably nonzero when str exclusively owns its buffer -- confirm against str_independent() */
    independent = str_independent(str);
    ENC_CODERANGE_CLEAR(str);
    slen = RSTRING_LEN(str);

    {
        long capa;
        const int termlen = TERM_LEN(str);
        if (STR_EMBED_P(str)) {
            if (len == slen) return str;
            /* still fits in the embedded area: adjust the length in place */
            if (str_embed_capa(str) >= len + termlen) {
                STR_SET_EMBED_LEN(str, len);
                TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
                return str;
            }
            /* too large to stay embedded; the shared tail below records len */
            str_make_independent_expand(str, slen, len - slen, termlen);
        }
        else if (str_embed_capa(str) >= len + termlen) {
            /* non-embedded string that now fits in the embedded area: copy it back */
            char *ptr = STR_HEAP_PTR(str);
            STR_SET_EMBED(str);
            if (slen > len) slen = len; /* copy no more than the new length */
            if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
            TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
            STR_SET_EMBED_LEN(str, len);
            /* the old buffer is released only when independent is set */
            if (independent) ruby_xfree(ptr);
            return str;
        }
        else if (!independent) {
            if (len == slen) return str;
            str_make_independent_expand(str, slen, len - slen, termlen);
        }
        /*
         * Reallocate when the buffer is too small, or when the slack that
         * would remain (capa - len) exceeds min(len, 1024) bytes.
         */
        else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
                 (capa - len) > (len < 1024 ? len : 1024)) {
            SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
                            (size_t)len + termlen, STR_HEAP_SIZE(str));
            RSTRING(str)->as.heap.aux.capa = len;
        }
        else if (len == slen) return str;
        /* shared tail for every path that leaves str non-embedded */
        RSTRING(str)->as.heap.len = len;
        TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
    }
    return str;
}
3092
3093 static VALUE
3094 str_buf_cat(VALUE str, const char *ptr, long len)
3095 {
3096     long capa, total, olen, off = -1;
3097     char *sptr;
3098     const int termlen = TERM_LEN(str);
3099 #if !USE_RVARGC
3100     assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3101 #endif
3102
3103     RSTRING_GETMEM(str, sptr, olen);
3104     if (ptr >= sptr && ptr <= sptr + olen) {
3105         off = ptr - sptr;
3106     }
3107     rb_str_modify(str);
3108     if (len == 0) return 0;
3109     if (STR_EMBED_P(str)) {
3110         capa = str_embed_capa(str) - termlen;
3111         sptr = RSTRING(str)->as.embed.ary;
3112         olen = RSTRING_EMBED_LEN(str);
3113     }
3114     else {
3115         capa = RSTRING(str)->as.heap.aux.capa;
3116         sptr = RSTRING(str)->as.heap.ptr;
3117         olen = RSTRING(str)->as.heap.len;
3118     }
3119     if (olen > LONG_MAX - len) {
3120         rb_raise(rb_eArgError, "string sizes too big");
3121     }
3122     total = olen + len;
3123     if (capa < total) {
3124         if (total >= LONG_MAX / 2) {
3125             capa = total;
3126         }
3127         while (total > capa) {
3128             capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3129         }
3130         RESIZE_CAPA_TERM(str, capa, termlen);
3131         sptr = RSTRING_PTR(str);
3132     }
3133     if (off != -1) {
3134         ptr = sptr + off;
3135     }
3136     memcpy(sptr + olen, ptr, len);
3137     STR_SET_LEN(str, total);
3138     TERM_FILL(sptr + total, termlen); /* sentinel */
3139
3140     return str;
3141 }
3142
/* Appends the NUL-terminated C string ptr to str via str_buf_cat(); note that ptr is evaluated twice. */
#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3144
3145 VALUE
3146 rb_str_cat(VALUE str, const char *ptr, long len)
3147 {
3148     if (len == 0) return str;
3149     if (len < 0) {
3150         rb_raise(rb_eArgError, "negative string size (or size too big)");
3151     }
3152     return str_buf_cat(str, ptr, len);
3153 }
3154
3155 VALUE
3156 rb_str_cat_cstr(VALUE str, const char *ptr)
3157 {
3158     must_not_null(ptr);
3159     return rb_str_buf_cat(str, ptr, strlen(ptr));
3160 }
3161
/*
 * Alternative public names declared through RUBY_ALIAS_FUNCTION.  Each line
 * pairs a prototype with the function and argument list it maps to:
 * rb_str_buf_cat -> rb_str_cat, rb_str_buf_cat2 and rb_str_cat2 -> rb_str_cat_cstr.
 * NOTE(review): the exact expansion depends on the RUBY_ALIAS_FUNCTION macro,
 * which is defined outside this part of the file.
 */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3165
/*
 * Appends +len+ bytes at +ptr+, described by encoding index +ptr_encindex+
 * and code range +ptr_cr+, to +str+, then sets the encoding index and code
 * range of +str+ to the combined result.  Returns +str+.
 *
 * When +ptr_cr_ret+ is non-NULL it receives the code range of the appended
 * bytes as known after any scan done here; it is not written on the early
 * returns taken for non-ASCII-compatible encodings.
 * Raises rb_eEncCompatError when the two encodings cannot be combined, and
 * ArgumentError when +len+ is negative.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    rb_encoding *str_enc, *ptr_enc;

    /* an empty receiver is treated as 7bit regardless of its cached code range */
    str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;

    if (str_encindex == ptr_encindex) {
        /* same encoding: scan the new bytes only if the receiver's range is known */
        if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* at least one side is not ASCII-compatible: only trivial cases succeed */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                /* empty receiver simply takes over the appended encoding */
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            /* the receiver's range is computed only when it can affect the outcome */
            if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* different encodings can be mixed only when at least one side is 7bit */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        goto incompatible;
    }

    /* choose the resulting encoding index and code range */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = str_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            /* 7bit receiver adopts the encoding and range of the appended bytes */
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        if (ENC_CODERANGE_CLEAN_P(ptr_cr))
            res_cr = str_cr;
        else
            res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* appending anything to a broken string makes the range unknown again */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;

  incompatible:
    rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
             rb_enc_name(str_enc), rb_enc_name(ptr_enc));
    UNREACHABLE_RETURN(Qundef);
}
3254
3255 VALUE
3256 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3257 {
3258     return rb_enc_cr_str_buf_cat(str, ptr, len,
3259         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3260 }
3261
3262 VALUE
3263 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3264 {
3265     /* ptr must reference NUL terminated ASCII string. */
3266     int encindex = ENCODING_GET(str);
3267     rb_encoding *enc = rb_enc_from_index(encindex);
3268     if (rb_enc_asciicompat(enc)) {
3269         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3270             encindex, ENC_CODERANGE_7BIT, 0);
3271     }
3272     else {
3273         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3274         while (*ptr) {
3275             unsigned int c = (unsigned char)*ptr;
3276             int len = rb_enc_codelen(c, enc);
3277             rb_enc_mbcput(c, buf, enc);
3278             rb_enc_cr_str_buf_cat(str, buf, len,
3279                 encindex, ENC_CODERANGE_VALID, 0);
3280             ptr++;
3281         }
3282         return str;
3283     }
3284 }
3285
3286 VALUE
3287 rb_str_buf_append(VALUE str, VALUE str2)
3288 {
3289     int str2_cr;
3290
3291     str2_cr = ENC_CODERANGE(str2);
3292
3293     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3294         ENCODING_GET(str2), str2_cr, &str2_cr);
3295
3296     ENC_CODERANGE_SET(str2, str2_cr);
3297
3298     return str;
3299 }
3300
3301 VALUE
3302 rb_str_append(VALUE str, VALUE str2)
3303 {
3304     StringValue(str2);
3305     return rb_str_buf_append(str, str2);
3306 }
3307
3308 #define MIN_PRE_ALLOC_SIZE 48
3309
3310 MJIT_FUNC_EXPORTED VALUE
3311 rb_str_concat_literals(size_t num, const VALUE *strary)
3312 {
3313     VALUE str;
3314     size_t i, s;
3315     long len = 1;
3316
3317     if (UNLIKELY(!num)) return rb_str_new(0, 0);
3318     if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3319
3320     for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3321     if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3322         str = rb_str_resurrect(strary[0]);
3323         s = 1;
3324     }
3325     else {
3326         str = rb_str_buf_new(len);
3327         rb_enc_copy(str, strary[0]);
3328         s = 0;
3329     }
3330
3331     for (i = s; i < num; ++i) {
3332         const VALUE v = strary[i];
3333         int encidx = ENCODING_GET(v);
3334
3335         rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3336                               encidx, ENC_CODERANGE(v), NULL);
3337         if (encidx != ENCINDEX_US_ASCII) {
3338             if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3339                 rb_enc_set_index(str, encidx);
3340         }
3341     }
3342     return str;
3343 }
3344
3345 /*
3346  *  call-seq:
3347  *     concat(*objects) -> string
3348  *
3349  *  Concatenates each object in +objects+ to +self+ and returns +self+:
3350  *
3351  *    s = 'foo'
3352  *    s.concat('bar', 'baz') # => "foobarbaz"
3353  *    s                      # => "foobarbaz"
3354  *
3355  *  For each given object +object+ that is an \Integer,
3356  *  the value is considered a codepoint and converted to a character before concatenation:
3357  *
3358  *    s = 'foo'
3359  *    s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3360  *
3361  *  Related: String#<<, which takes a single argument.
3362  */
3363 static VALUE
3364 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3365 {
3366     str_modifiable(str);
3367
3368     if (argc == 1) {
3369         return rb_str_concat(str, argv[0]);
3370     }
3371     else if (argc > 1) {
3372         int i;
3373         VALUE arg_str = rb_str_tmp_new(0);
3374         rb_enc_copy(arg_str, str);
3375         for (i = 0; i < argc; i++) {
3376             rb_str_concat(arg_str, argv[i]);
3377         }
3378         rb_str_buf_append(str, arg_str);
3379     }
3380
3381     return str;
3382 }
3383
3384 /*
3385  *  call-seq:
3386  *    string << object -> string
3387  *
3388  *  Concatenates +object+ to +self+ and returns +self+:
3389  *
3390  *    s = 'foo'
3391  *    s << 'bar' # => "foobar"
3392  *    s          # => "foobar"
3393  *
3394  *  If +object+ is an \Integer,
3395  *  the value is considered a codepoint and converted to a character before concatenation:
3396  *
3397  *    s = 'foo'
3398  *    s << 33 # => "foo!"
3399  *
3400  *  Related: String#concat, which takes multiple arguments.
3401  */
3402 VALUE
3403 rb_str_concat(VALUE str1, VALUE str2)
3404 {
3405     unsigned int code;
3406     rb_encoding *enc = STR_ENC_GET(str1);
3407     int encidx;
3408
3409     if (RB_INTEGER_TYPE_P(str2)) {
3410         if (rb_num_to_uint(str2, &code) == 0) {
3411         }
3412         else if (FIXNUM_P(str2)) {
3413             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3414         }
3415         else {
3416             rb_raise(rb_eRangeError, "bignum out of char range");
3417         }
3418     }
3419     else {
3420         return rb_str_append(str1, str2);
3421     }
3422
3423     encidx = rb_enc_to_index(enc);
3424     if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3425         /* US-ASCII automatically extended to ASCII-8BIT */
3426         char buf[1];
3427         buf[0] = (char)code;
3428         if (code > 0xFF) {
3429             rb_raise(rb_eRangeError, "%u out of char range", code);
3430         }
3431         rb_str_cat(str1, buf, 1);
3432         if (encidx == ENCINDEX_US_ASCII && code > 127) {
3433             rb_enc_associate_index(str1, ENCINDEX_ASCII);
3434             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
3435         }
3436     }
3437     else {
3438         long pos = RSTRING_LEN(str1);
3439         int cr = ENC_CODERANGE(str1);
3440         int len;
3441         char *buf;
3442
3443         switch (len = rb_enc_codelen(code, enc)) {
3444           case ONIGERR_INVALID_CODE_POINT_VALUE:
3445             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3446             break;
3447           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3448           case 0:
3449             rb_raise(rb_eRangeError, "%u out of char range", code);
3450             break;
3451         }
3452         buf = ALLOCA_N(char, len + 1);
3453         rb_enc_mbcput(code, buf, enc);
3454         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3455             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3456         }
3457         rb_str_resize(str1, pos+len);
3458         memcpy(RSTRING_PTR(str1) + pos, buf, len);
3459         if (cr == ENC_CODERANGE_7BIT && code > 127)
3460             cr = ENC_CODERANGE_VALID;
3461         ENC_CODERANGE_SET(str1, cr);
3462     }
3463     return str1;
3464 }
3465
3466 /*
3467  *  call-seq:
3468  *    prepend(*other_strings)  -> string
3469  *
3470  *  Prepends each string in +other_strings+ to +self+ and returns +self+:
3471  *
3472  *    s = 'foo'
3473  *    s.prepend('bar', 'baz') # => "barbazfoo"
3474  *    s                       # => "barbazfoo"
3475  *
3476  *  Related: String#concat.
3477  */
3478
3479 static VALUE
3480 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3481 {
3482     str_modifiable(str);
3483
3484     if (argc == 1) {
3485         rb_str_update(str, 0L, 0L, argv[0]);
3486     }
3487     else if (argc > 1) {
3488         int i;
3489         VALUE arg_str = rb_str_tmp_new(0);
3490         rb_enc_copy(arg_str, str);
3491         for (i = 0; i < argc; i++) {
3492             rb_str_append(arg_str, argv[i]);
3493         }
3494         rb_str_update(str, 0L, 0L, arg_str);
3495     }
3496
3497     return str;
3498 }
3499
3500 st_index_t
3501 rb_str_hash(VALUE str)
3502 {
3503     int e = ENCODING_GET(str);
3504     if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3505         e = 0;
3506     }
3507     return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3508 }
3509
3510 int
3511 rb_str_hash_cmp(VALUE str1, VALUE str2)
3512 {
3513     long len1, len2;
3514     const char *ptr1, *ptr2;
3515     RSTRING_GETMEM(str1, ptr1, len1);
3516     RSTRING_GETMEM(str2, ptr2, len2);
3517     return (len1 != len2 ||
3518             !rb_str_comparable(str1, str2) ||
3519             memcmp(ptr1, ptr2, len1) != 0);
3520 }
3521
3522 /*
3523  * call-seq:
3524  *   hash -> integer
3525  *
3526  * Returns the integer hash value for +self+.
3527  * The value is based on the length, content and encoding of +self+.
3528  *
3529  * Related: Object#hash.
3530  */
3531
3532 static VALUE
3533 rb_str_hash_m(VALUE str)
3534 {
3535     st_index_t hval = rb_str_hash(str);
3536     return ST2FIX(hval);
3537 }
3538
/* Yields the smaller of a and b; each argument may be evaluated twice. */
#define lesser(a,b) (((a)>(b))?(b):(a))
3540
3541 int
3542 rb_str_comparable(VALUE str1, VALUE str2)
3543 {
3544     int idx1, idx2;
3545     int rc1, rc2;
3546
3547     if (RSTRING_LEN(str1) == 0) return TRUE;
3548     if (RSTRING_LEN(str2) == 0) return TRUE;
3549     idx1 = ENCODING_GET(str1);
3550     idx2 = ENCODING_GET(str2);
3551     if (idx1 == idx2) return TRUE;
3552     rc1 = rb_enc_str_coderange(str1);
3553     rc2 = rb_enc_str_coderange(str2);
3554     if (rc1 == ENC_CODERANGE_7BIT) {
3555         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3556         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3557             return TRUE;
3558     }
3559     if (rc2 == ENC_CODERANGE_7BIT) {
3560         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3561             return TRUE;
3562     }
3563     return FALSE;
3564 }
3565
3566 int
3567 rb_str_cmp(VALUE str1, VALUE str2)
3568 {
3569     long len1, len2;
3570     const char *ptr1, *ptr2;
3571     int retval;
3572
3573     if (str1 == str2) return 0;
3574     RSTRING_GETMEM(str1, ptr1, len1);
3575     RSTRING_GETMEM(str2, ptr2, len2);
3576     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3577         if (len1 == len2) {
3578             if (!rb_str_comparable(str1, str2)) {
3579                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3580                     return 1;
3581                 return -1;
3582             }
3583             return 0;
3584         }
3585         if (len1 > len2) return 1;
3586         return -1;
3587     }
3588     if (retval > 0) return 1;
3589     return -1;
3590 }
3591
3592 /*
3593  *  call-seq:
3594  *    string == object -> true or false
3595  *    string === object -> true or false
3596  *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3599  *
3600  *    s = 'foo'
3601  *    s == 'foo' # => true
3602  *    s == 'food' # => false
3603  *    s == 'FOO' # => false
3604  *
 *  Returns +false+ if the two strings' encodings are not compatible:
 *
 *    "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3607  *
3608  *  If +object+ is not an instance of \String but responds to +to_str+, then the
3609  *  two strings are compared using <code>object.==</code>.
3610  */
3611
3612 VALUE
3613 rb_str_equal(VALUE str1, VALUE str2)
3614 {
3615     if (str1 == str2) return Qtrue;
3616     if (!RB_TYPE_P(str2, T_STRING)) {
3617         if (!rb_respond_to(str2, idTo_str)) {
3618             return Qfalse;
3619         }
3620         return rb_equal(str2, str1);
3621     }
3622     return rb_str_eql_internal(str1, str2);
3623 }
3624
3625 /*
3626  * call-seq:
3627  *   eql?(object) -> true or false
3628  *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3631  *
3632  *    s = 'foo'
3633  *    s.eql?('foo') # => true
3634  *    s.eql?('food') # => false
3635  *    s.eql?('FOO') # => false
3636  *
3637  *  Returns +false+ if the two strings' encodings are not compatible:
3638  *
3639  *    "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3640  *
3641  */
3642
3643 MJIT_FUNC_EXPORTED VALUE
3644 rb_str_eql(VALUE str1, VALUE str2)
3645 {
3646     if (str1 == str2) return Qtrue;
3647     if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3648     return rb_str_eql_internal(str1, str2);
3649 }
3650
3651 /*
3652  *  call-seq:
3653  *    string <=> other_string -> -1, 0, 1, or nil
3654  *
3655  *  Compares +self+ and +other_string+, returning:
3656  *
3657  *  - -1 if +other_string+ is larger.
3658  *  - 0 if the two are equal.
3659  *  - 1 if +other_string+ is smaller.
3660  *  - +nil+ if the two are incomparable.
3661  *
3662  *  Examples:
3663  *
3664  *    'foo' <=> 'foo' # => 0
3665  *    'foo' <=> 'food' # => -1
3666  *    'food' <=> 'foo' # => 1
3667  *    'FOO' <=> 'foo' # => -1
3668  *    'foo' <=> 'FOO' # => 1
3669  *    'foo' <=> 1 # => nil
3670  *
3671  */
3672
3673 static VALUE
3674 rb_str_cmp_m(VALUE str1, VALUE str2)
3675 {
3676     int result;
3677     VALUE s = rb_check_string_type(str2);
3678     if (NIL_P(s)) {
3679         return rb_invcmp(str1, str2);
3680     }
3681     result = rb_str_cmp(str1, s);
3682     return INT2FIX(result);
3683 }
3684
/* Forward declarations: both helpers are defined after the method bodies that call them. */
static VALUE str_casecmp(VALUE str1, VALUE str2);
static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3687
3688 /*
3689  *  call-seq:
3690  *    casecmp(other_string) -> -1, 0, 1, or nil
3691  *
3692  *  Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3693  *
3694  *  - -1 if <tt>other_string.downcase</tt> is larger.
3695  *  - 0 if the two are equal.
3696  *  - 1 if <tt>other_string.downcase</tt> is smaller.
3697  *  - +nil+ if the two are incomparable.
3698  *
3699  *  Examples:
3700  *
3701  *    'foo'.casecmp('foo') # => 0
3702  *    'foo'.casecmp('food') # => -1
3703  *    'food'.casecmp('foo') # => 1
3704  *    'FOO'.casecmp('foo') # => 0
3705  *    'foo'.casecmp('FOO') # => 0
3706  *    'foo'.casecmp(1) # => nil
3707  *
3708  *  See {Case Mapping}[doc/case_mapping_rdoc.html].
3709  *
3710  *  Related: String#casecmp?.
3711  *
3712  */
3713
3714 static VALUE
3715 rb_str_casecmp(VALUE str1, VALUE str2)
3716 {
3717     VALUE s = rb_check_string_type(str2);
3718     if (NIL_P(s)) {
3719         return Qnil;
3720     }
3721     return str_casecmp(str1, s);
3722 }
3723
3724 static VALUE
3725 str_casecmp(VALUE str1, VALUE str2)
3726 {
3727     long len;
3728     rb_encoding *enc;
3729     const char *p1, *p1end, *p2, *p2end;
3730
3731     enc = rb_enc_compatible(str1, str2);
3732     if (!enc) {
3733         return Qnil;
3734     }
3735
3736     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3737     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3738     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3739         while (p1 < p1end && p2 < p2end) {
3740             if (*p1 != *p2) {
3741                 unsigned int c1 = TOLOWER(*p1 & 0xff);
3742                 unsigned int c2 = TOLOWER(*p2 & 0xff);
3743                 if (c1 != c2)
3744                     return INT2FIX(c1 < c2 ? -1 : 1);
3745             }
3746             p1++;
3747             p2++;
3748         }
3749     }
3750     else {
3751         while (p1 < p1end && p2 < p2end) {
3752             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3753             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3754
3755             if (0 <= c1 && 0 <= c2) {
3756                 c1 = TOLOWER(c1);
3757                 c2 = TOLOWER(c2);
3758                 if (c1 != c2)
3759                     return INT2FIX(c1 < c2 ? -1 : 1);
3760             }
3761             else {
3762                 int r;
3763                 l1 = rb_enc_mbclen(p1, p1end, enc);
3764                 l2 = rb_enc_mbclen(p2, p2end, enc);
3765                 len = l1 < l2 ? l1 : l2;
3766                 r = memcmp(p1, p2, len);
3767                 if (r != 0)
3768                     return INT2FIX(r < 0 ? -1 : 1);
3769                 if (l1 != l2)
3770                     return INT2FIX(l1 < l2 ? -1 : 1);
3771             }
3772             p1 += l1;
3773             p2 += l2;
3774         }
3775     }
3776     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3777     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3778     return INT2FIX(-1);
3779 }
3780
3781 /*
3782  *  call-seq:
3783  *    casecmp?(other_string) -> true, false, or nil
3784  *
3785  *  Returns +true+ if +self+ and +other_string+ are equal after
3786  *  Unicode case folding, otherwise +false+:
3787  *
3788  *    'foo'.casecmp?('foo') # => true
3789  *    'foo'.casecmp?('food') # => false
3790  *    'food'.casecmp?('foo') # => false
3791  *    'FOO'.casecmp?('foo') # => true
3792  *    'foo'.casecmp?('FOO') # => true
3793  *
3794  *  Returns +nil+ if the two values are incomparable:
3795  *
3796  *    'foo'.casecmp?(1) # => nil
3797  *
3798  *  See {Case Mapping}[doc/case_mapping_rdoc.html].
3799  *
3800  *  Related: String#casecmp.
3801  *
3802  */
3803
3804 static VALUE
3805 rb_str_casecmp_p(VALUE str1, VALUE str2)
3806 {
3807     VALUE s = rb_check_string_type(str2);
3808     if (NIL_P(s)) {
3809         return Qnil;
3810     }
3811     return str_casecmp_p(str1, s);
3812 }
3813
3814 static VALUE
3815 str_casecmp_p(VALUE str1, VALUE str2)
3816 {
3817     rb_encoding *enc;
3818     VALUE folded_str1, folded_str2;
3819     VALUE fold_opt = sym_fold;
3820
3821     enc = rb_enc_compatible(str1, str2);
3822     if (!enc) {
3823         return Qnil;
3824     }
3825
3826     folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3827     folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3828
3829     return rb_str_eql(folded_str1, folded_str2);
3830 }
3831
3832 static long
3833 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3834             const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3835 {
3836     const char *search_start = str_ptr;
3837     long pos, search_len = str_len - offset;
3838
3839     for (;;) {
3840         const char *t;
3841         pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3842         if (pos < 0) return pos;
3843         t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3844         if (t == search_start + pos) break;
3845         search_len -= t - search_start;
3846         if (search_len <= 0) return -1;
3847         offset += t - search_start;
3848         search_start = t;
3849     }
3850     return pos + offset;
3851 }
3852
3853 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3854
3855 static long
3856 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3857 {
3858     const char *str_ptr, *str_ptr_end, *sub_ptr;
3859     long str_len, sub_len;
3860     rb_encoding *enc;
3861
3862     enc = rb_enc_check(str, sub);
3863     if (is_broken_string(sub)) return -1;
3864
3865     str_ptr = RSTRING_PTR(str);
3866     str_ptr_end = RSTRING_END(str);
3867     str_len = RSTRING_LEN(str);
3868     sub_ptr = RSTRING_PTR(sub);
3869     sub_len = RSTRING_LEN(sub);
3870
3871     if (str_len < sub_len) return -1;
3872
3873     if (offset != 0) {
3874         long str_len_char, sub_len_char;
3875         int single_byte = single_byte_optimizable(str);
3876         str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3877         sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3878         if (offset < 0) {
3879             offset += str_len_char;
3880             if (offset < 0) return -1;
3881         }
3882         if (str_len_char - offset < sub_len_char) return -1;
3883         if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3884         str_ptr += offset;
3885     }
3886     if (sub_len == 0) return offset;
3887
3888     /* need proceed one character at a time */
3889     return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3890 }
3891
3892
3893 /*
3894  *  call-seq:
3895  *    index(substring, offset = 0) -> integer or nil
3896  *    index(regexp, offset = 0) -> integer or nil
3897  *
3898  *  Returns the \Integer index of the first occurrence of the given +substring+,
3899  *  or +nil+ if none found:
3900  *
3901  *    'foo'.index('f') # => 0
3902  *    'foo'.index('o') # => 1
3903  *    'foo'.index('oo') # => 1
3904  *    'foo'.index('ooo') # => nil
3905  *
3906  *  Returns the \Integer index of the first match for the given \Regexp +regexp+,
3907  *  or +nil+ if none found:
3908  *
3909  *    'foo'.index(/f/) # => 0
3910  *    'foo'.index(/o/) # => 1
3911  *    'foo'.index(/oo/) # => 1
3912  *    'foo'.index(/ooo/) # => nil
3913  *
3914  *  \Integer argument +offset+, if given, specifies the position in the
3915  *  string to begin the search:
3916  *
3917  *    'foo'.index('o', 1) # => 1
3918  *    'foo'.index('o', 2) # => 2
3919  *    'foo'.index('o', 3) # => nil
3920  *
3921  *  If +offset+ is negative, counts backward from the end of +self+:
3922  *
3923  *    'foo'.index('o', -1) # => 2
3924  *    'foo'.index('o', -2) # => 1
3925  *    'foo'.index('o', -3) # => 1
3926  *    'foo'.index('o', -4) # => nil
3927  *
3928  *  Related: String#rindex.
3929  */
3930
3931 static VALUE
3932 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3933 {
3934     VALUE sub;
3935     VALUE initpos;
3936     long pos;
3937
3938     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3939         pos = NUM2LONG(initpos);
3940     }
3941     else {
3942         pos = 0;
3943     }
3944     if (pos < 0) {
3945         pos += str_strlen(str, NULL);
3946         if (pos < 0) {
3947             if (RB_TYPE_P(sub, T_REGEXP)) {
3948                 rb_backref_set(Qnil);
3949             }
3950             return Qnil;
3951         }
3952     }
3953
3954     if (RB_TYPE_P(sub, T_REGEXP)) {
3955         if (pos > str_strlen(str, NULL))
3956             return Qnil;
3957         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3958                          rb_enc_check(str, sub), single_byte_optimizable(str));
3959
3960         if (rb_reg_search(sub, str, pos, 0) < 0) {
3961             return Qnil;
3962         }
3963         else {
3964             VALUE match = rb_backref_get();
3965             struct re_registers *regs = RMATCH_REGS(match);
3966             pos = rb_str_sublen(str, BEG(0));
3967             return LONG2NUM(pos);
3968         }
3969     }
3970     else {
3971         StringValue(sub);
3972         pos = rb_str_index(str, sub, pos);
3973         pos = rb_str_sublen(str, pos);
3974     }
3975
3976     if (pos == -1) return Qnil;
3977     return LONG2NUM(pos);
3978 }
3979
3980 #ifdef HAVE_MEMRCHR
3981 static long
3982 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3983 {
3984     char *hit, *adjusted;
3985     int c;
3986     long slen, searchlen;
3987     char *sbeg, *e, *t;
3988
3989     slen = RSTRING_LEN(sub);
3990     if (slen == 0) return pos;
3991     sbeg = RSTRING_PTR(str);
3992     e = RSTRING_END(str);
3993     t = RSTRING_PTR(sub);
3994     c = *t & 0xff;
3995     searchlen = s - sbeg + 1;
3996
3997     do {
3998         hit = memrchr(sbeg, c, searchlen);
3999         if (!hit) break;
4000         adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4001         if (hit != adjusted) {
4002             searchlen = adjusted - sbeg;
4003             continue;
4004         }
4005         if (memcmp(hit, t, slen) == 0)
4006             return rb_str_sublen(str, hit - sbeg);
4007         searchlen = adjusted - sbeg;
4008     } while (searchlen > 0);
4009
4010     return -1;
4011 }
4012 #else
4013 static long
4014 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
4015 {
4016     long slen;
4017     char *sbeg, *e, *t;
4018
4019     sbeg = RSTRING_PTR(str);
4020     e = RSTRING_END(str);
4021     t = RSTRING_PTR(sub);
4022     slen = RSTRING_LEN(sub);
4023
4024     while (s) {
4025         if (memcmp(s, t, slen) == 0) {
4026             return pos;
4027         }
4028         if (pos == 0) break;
4029         pos--;
4030         s = rb_enc_prev_char(sbeg, s, e, enc);
4031     }
4032
4033     return -1;
4034 }
4035 #endif
4036
4037 static long
4038 rb_str_rindex(VALUE str, VALUE sub, long pos)
4039 {
4040     long len, slen;
4041     char *sbeg, *s;
4042     rb_encoding *enc;
4043     int singlebyte;
4044
4045     enc = rb_enc_check(str, sub);
4046     if (is_broken_string(sub)) return -1;
4047     singlebyte = single_byte_optimizable(str);
4048     len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4049     slen = str_strlen(sub, enc); /* rb_enc_check */
4050
4051     /* substring longer than string */
4052     if (len < slen) return -1;
4053     if (len - pos < slen) pos = len - slen;
4054     if (len == 0) return pos;
4055
4056     sbeg = RSTRING_PTR(str);
4057
4058     if (pos == 0) {
4059         if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4060             return 0;
4061         else
4062             return -1;
4063     }
4064
4065     s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4066     return str_rindex(str, sub, s, pos, enc);
4067 }
4068
4069 /*
4070  *  call-seq:
4071  *    rindex(substring, offset = self.length) -> integer or nil
4072  *    rindex(regexp, offset = self.length) -> integer or nil
4073  *
4074  *  Returns the \Integer index of the _last_ occurrence of the given +substring+,
4075  *  or +nil+ if none found:
4076  *
4077  *    'foo'.rindex('f') # => 0
4078  *    'foo'.rindex('o') # => 2
4079  *    'foo'.rindex('oo') # => 1
4080  *    'foo'.rindex('ooo') # => nil
4081  *
4082  *  Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4083  *  or +nil+ if none found:
4084  *
4085  *    'foo'.rindex(/f/) # => 0
4086  *    'foo'.rindex(/o/) # => 2
4087  *    'foo'.rindex(/oo/) # => 1
4088  *    'foo'.rindex(/ooo/) # => nil
4089  *
4090  *  The _last_ match means starting at the possible last position, not
4091  *  the last of longest matches.
4092  *
4093  *    'foo'.rindex(/o+/) # => 2
4094  *    $~ #=> #<MatchData "o">
4095  *
 *  To get the last longest match, combine the pattern with a negative
 *  lookbehind:
4098  *
4099  *    'foo'.rindex(/(?<!o)o+/) # => 1
4100  *    $~ #=> #<MatchData "oo">
4101  *
 *  Or use String#index with a negative lookahead:
4103  *
4104  *    'foo'.index(/o+(?!.*o)/) # => 1
4105  *    $~ #=> #<MatchData "oo">
4106  *
4107  *  \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4108  *   string to _end_ the search:
4109  *
4110  *    'foo'.rindex('o', 0) # => nil
4111  *    'foo'.rindex('o', 1) # => 1
4112  *    'foo'.rindex('o', 2) # => 2
4113  *    'foo'.rindex('o', 3) # => 2
4114  *
4115  *  If +offset+ is a negative \Integer, the maximum starting position in the
4116  *  string to _end_ the search is the sum of the string's length and +offset+:
4117  *
4118  *    'foo'.rindex('o', -1) # => 2
4119  *    'foo'.rindex('o', -2) # => 1
4120  *    'foo'.rindex('o', -3) # => nil
4121  *    'foo'.rindex('o', -4) # => nil
4122  *
4123  *  Related: String#index.
4124  */
4125
4126 static VALUE
4127 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4128 {
4129     VALUE sub;
4130     VALUE vpos;
4131     rb_encoding *enc = STR_ENC_GET(str);
4132     long pos, len = str_strlen(str, enc); /* str's enc */
4133
4134     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4135         pos = NUM2LONG(vpos);
4136         if (pos < 0) {
4137             pos += len;
4138             if (pos < 0) {
4139                 if (RB_TYPE_P(sub, T_REGEXP)) {
4140                     rb_backref_set(Qnil);
4141                 }
4142                 return Qnil;
4143             }
4144         }
4145         if (pos > len) pos = len;
4146     }
4147     else {
4148         pos = len;
4149     }
4150
4151     if (RB_TYPE_P(sub, T_REGEXP)) {
4152         /* enc = rb_get_check(str, sub); */
4153         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4154                          enc, single_byte_optimizable(str));
4155
4156         if (rb_reg_search(sub, str, pos, 1) >= 0) {
4157             VALUE match = rb_backref_get();
4158             struct re_registers *regs = RMATCH_REGS(match);
4159             pos = rb_str_sublen(str, BEG(0));
4160             return LONG2NUM(pos);
4161         }
4162     }
4163     else {
4164         StringValue(sub);
4165         pos = rb_str_rindex(str, sub, pos);
4166         if (pos >= 0) return LONG2NUM(pos);
4167     }
4168     return Qnil;
4169 }
4170
4171 /*
4172  *  call-seq:
4173  *    string =~ regexp -> integer or nil
4174  *    string =~ object -> integer or nil
4175  *
4176  *  Returns the \Integer index of the first substring that matches
4177  *  the given +regexp+, or +nil+ if no match found:
4178  *
4179  *    'foo' =~ /f/ # => 0
4180  *    'foo' =~ /o/ # => 1
4181  *    'foo' =~ /x/ # => nil
4182  *
4183  *  Note: also updates
4184  *  {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4185  *
4186  *  If the given +object+ is not a \Regexp, returns the value
4187  *  returned by <tt>object =~ self</tt>.
4188  *
4189  *  Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4190  *  (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4191  *
4192  *    number= nil
4193  *    "no. 9" =~ /(?<number>\d+)/
4194  *    number # => nil (not assigned)
4195  *    /(?<number>\d+)/ =~ "no. 9"
4196  *    number #=> "9"
4197  *
4198  */
4199
4200 static VALUE
4201 rb_str_match(VALUE x, VALUE y)
4202 {
4203     switch (OBJ_BUILTIN_TYPE(y)) {
4204       case T_STRING:
4205         rb_raise(rb_eTypeError, "type mismatch: String given");
4206
4207       case T_REGEXP:
4208         return rb_reg_match(y, x);
4209
4210       default:
4211         return rb_funcall(y, idEqTilde, 1, x);
4212     }
4213 }
4214
4215
4216 static VALUE get_pat(VALUE);
4217
4218
4219 /*
4220  *  call-seq:
4221  *    match(pattern, offset = 0) -> matchdata or nil
4222  *    match(pattern, offset = 0) {|matchdata| ... } -> object
4223  *
 *  Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4225  *
4226  *  Note: also updates
4227  *  {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4228  *
4229  *  - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4230  *      regexp = Regexp.new(pattern)
4231  *  - Computes +matchdata+, which will be either a \MatchData object or +nil+
4232  *    (see Regexp#match):
 *      matchdata = regexp.match(self)
4234  *
4235  *  With no block given, returns the computed +matchdata+:
4236  *
4237  *    'foo'.match('f') # => #<MatchData "f">
4238  *    'foo'.match('o') # => #<MatchData "o">
4239  *    'foo'.match('x') # => nil
4240  *
4241  *  If \Integer argument +offset+ is given, the search begins at index +offset+:
4242  *
4243  *    'foo'.match('f', 1) # => nil
4244  *    'foo'.match('o', 1) # => #<MatchData "o">
4245  *
4246  *  With a block given, calls the block with the computed +matchdata+
4247  *  and returns the block's return value:
4248  *
4249  *    'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4250  *    'foo'.match(/x/) {|matchdata| matchdata } # => nil
4251  *    'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4252  *
4253  */
4254
4255 static VALUE
4256 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4257 {
4258     VALUE re, result;
4259     if (argc < 1)
4260         rb_check_arity(argc, 1, 2);
4261     re = argv[0];
4262     argv[0] = str;
4263     result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4264     if (!NIL_P(result) && rb_block_given_p()) {
4265         return rb_yield(result);
4266     }
4267     return result;
4268 }
4269
4270 /*
4271  *  call-seq:
4272  *    match?(pattern, offset = 0) -> true or false
4273  *
4274  *  Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4275  *
4276  *  Note: does not update
4277  *  {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4278  *
4279  *  Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4280  *    regexp = Regexp.new(pattern)
4281  *
 *  Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4283  *  +false+ otherwise:
4284  *
4285  *    'foo'.match?(/o/) # => true
4286  *    'foo'.match?('o') # => true
4287  *    'foo'.match?(/x/) # => false
4288  *
 *  If \Integer argument +offset+ is given, the search begins at index +offset+:
 *
4290  *    'foo'.match?('f', 1) # => false
4291  *    'foo'.match?('o', 1) # => true
4292  *
4293  */
4294
4295 static VALUE
4296 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4297 {
4298     VALUE re;
4299     rb_check_arity(argc, 1, 2);
4300     re = get_pat(argv[0]);
4301     return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4302 }
4303
/*
 * Result of stepping one encoded character to its neighbor (successor or
 * predecessor), as returned by enc_succ_char(), enc_pred_char() and
 * enc_succ_alnum_char().
 */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* the input is not a usable character, or no neighbor can be produced */
    NEIGHBOR_FOUND,     /* the buffer now holds the neighboring character */
    NEIGHBOR_WRAPPED    /* stepping overflowed; the caller has to carry into the next position */
};
4309
/*
 * Replaces, in place, the +len+-byte character at +p+ with the next
 * character of the same byte length in +enc+.
 *
 * Returns NEIGHBOR_FOUND when such a character has been stored in +p+ and
 * NEIGHBOR_WRAPPED when the increment does not fit the +len+-byte form.
 * NEIGHBOR_NOT_CHAR is returned only on the wide-character path (minimum
 * character length > 1): when +p+ is not a valid character, or when the
 * incremented code point cannot be encoded.
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;

    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        /* work on the code point: decode, add one, encode again */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the successor needs a different number of bytes */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* re-validate what was just written */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /*
     * Byte-wise path: treat the bytes as a big-endian counter and keep
     * incrementing until the result is a valid character of length len.
     */
    while (1) {
        /* propagate the carry: trailing 0xff bytes roll over to 0x00 */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        /* every byte was 0xff: the whole sequence overflowed */
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /*
                 * A shorter character was recognized: saturate the bytes
                 * after it so that the next pass carries into its last byte.
                 */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* find the longest proper prefix that is not reported invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* saturate the bytes after p[len2] so the next pass carries into p[len2] */
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
4361
/*
 * Replaces, in place, the +len+-byte character at +p+ with the previous
 * character of the same byte length in +enc+; the mirror image of
 * enc_succ_char().
 *
 * Returns NEIGHBOR_FOUND when such a character has been stored in +p+ and
 * NEIGHBOR_WRAPPED when the decrement does not fit the +len+-byte form.
 * NEIGHBOR_NOT_CHAR is returned only on the wide-character path (minimum
 * character length > 1): when +p+ is not a valid character, when its code
 * point is already 0, or when the decremented code point cannot be encoded.
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        /* work on the code point: decode, subtract one, encode again */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc);
        /* code point 0 has no predecessor */
        if (!c) return NEIGHBOR_NOT_CHAR;
        --c;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the predecessor needs a different number of bytes */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* re-validate what was just written */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /*
     * Byte-wise path: treat the bytes as a big-endian counter and keep
     * decrementing until the result is a valid character of length len.
     */
    while (1) {
        /* propagate the borrow: trailing 0x00 bytes roll under to 0xff */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        /* every byte was 0x00: the whole sequence underflowed */
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /*
                 * A shorter character was recognized: zero the bytes after
                 * it so that the next pass borrows from its last byte.
                 */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* find the longest proper prefix that is not reported invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* zero the bytes after p[len2] so the next pass borrows from p[len2] */
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
4414
/*
  Overwrites the +len+-byte character at +p+ with the next letter or digit
  of the same class in +enc+.

  Returns NEIGHBOR_FOUND when the successor was stored in +p+.
  Returns NEIGHBOR_WRAPPED when +p+ was the last character of its range;
  +p+ is then rewound to the first character of the range and the
  character to be carried out is stored into +carry+.
  Returns NEIGHBOR_NOT_CHAR if +p+ is neither a digit nor a letter, or if
  its range consists of that one character only.

  Assumes that the characters of a range are consecutive and that the byte
  length does not change within a range.
 */
static enum neighbor_char
enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
{
    enum neighbor_char ret;
    unsigned int c;
    int ctype;
    int range;
    char save[ONIGENC_CODE_TO_MBC_MAXLEN];

    /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
    int try;
    const int max_gaps = 1;

    /* classify the character: only digits and letters are handled here */
    c = rb_enc_mbc_to_codepoint(p, p+len, enc);
    if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
        ctype = ONIGENC_CTYPE_DIGIT;
    else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
        ctype = ONIGENC_CTYPE_ALPHA;
    else
        return NEIGHBOR_NOT_CHAR;

    /*
     * Step forward up to max_gaps+1 times looking for a character of the
     * same class.  +p+ is not restored between attempts, so a second
     * attempt steps over the character rejected by the first.
     */
    MEMCPY(save, p, char, len);
    for (try = 0; try <= max_gaps; ++try) {
        ret = enc_succ_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (rb_enc_isctype(c, ctype, enc))
                return NEIGHBOR_FOUND;
        }
    }
    /* no successor in this class: undo the attempts and wrap around */
    MEMCPY(p, save, char, len);
    range = 1;
    /*
     * Walk backwards while the predecessor stays in the same class; when
     * the loop ends, +p+ holds the first character of the range and
     * +range+ counts the characters visited.
     */
    while (1) {
        MEMCPY(save, p, char, len);
        ret = enc_pred_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (!rb_enc_isctype(c, ctype, enc)) {
                /* stepped out of the class: go back to the last good character */
                MEMCPY(p, save, char, len);
                break;
            }
        }
        else {
            MEMCPY(p, save, char, len);
            break;
        }
        range++;
    }
    /* a range of one character cannot wrap */
    if (range == 1) {
        return NEIGHBOR_NOT_CHAR;
    }

    /* letters carry out the first character of the range ("zz" -> "aaa") */
    if (ctype != ONIGENC_CTYPE_DIGIT) {
        MEMCPY(carry, p, char, len);
        return NEIGHBOR_WRAPPED;
    }

    /* digits carry out the second character of the range ("99" -> "100") */
    MEMCPY(carry, p, char, len);
    enc_succ_char(carry, len, enc);
    return NEIGHBOR_WRAPPED;
}
4485
4486
4487 static VALUE str_succ(VALUE str);
4488
4489 /*
4490  *  call-seq:
4491  *    succ -> new_str
4492  *
4493  *  Returns the successor to +self+. The successor is calculated by
4494  *  incrementing characters.
4495  *
 *  The first character to be incremented is the rightmost alphanumeric,
 *  or, if there are no alphanumerics, the rightmost character:
4498  *
4499  *    'THX1138'.succ # => "THX1139"
4500  *    '<<koala>>'.succ # => "<<koalb>>"
4501  *    '***'.succ # => '**+'
4502  *
4503  *  The successor to a digit is another digit, "carrying" to the next-left
4504  *  character for a "rollover" from 9 to 0, and prepending another digit
4505  *  if necessary:
4506  *
4507  *    '00'.succ # => "01"
4508  *    '09'.succ # => "10"
4509  *    '99'.succ # => "100"
4510  *
4511  *  The successor to a letter is another letter of the same case,
4512  *  carrying to the next-left character for a rollover,
4513  *  and prepending another same-case letter if necessary:
4514  *
4515  *    'aa'.succ # => "ab"
4516  *    'az'.succ # => "ba"
4517  *    'zz'.succ # => "aaa"
4518  *    'AA'.succ # => "AB"
4519  *    'AZ'.succ # => "BA"
4520  *    'ZZ'.succ # => "AAA"
4521  *
4522  *  The successor to a non-alphanumeric character is the next character
4523  *  in the underlying character set's collating sequence,
4524  *  carrying to the next-left character for a rollover,
4525  *  and prepending another character if necessary:
4526  *
4527  *    s = 0.chr * 3
4528  *    s # => "\x00\x00\x00"
4529  *    s.succ # => "\x00\x00\x01"
4530  *    s = 255.chr * 3
4531  *    s # => "\xFF\xFF\xFF"
4532  *    s.succ # => "\x01\x00\x00\x00"
4533  *
4534  *  Carrying can occur between and among mixtures of alphanumeric characters:
4535  *
4536  *    s = 'zz99zz99'
4537  *    s.succ # => "aaa00aa00"
4538  *    s = '99zz99zz'
4539  *    s.succ # => "100aa00aa"
4540  *
4541  *  The successor to an empty \String is a new empty \String:
4542  *
4543  *    ''.succ # => ""
4544  *
4545  *  String#next is an alias for String#succ.
4546  */
4547
4548 VALUE
4549 rb_str_succ(VALUE orig)
4550 {
4551     VALUE str;
4552     str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4553     rb_enc_cr_str_copy_for_substr(str, orig);
4554     return str_succ(str);
4555 }
4556
/*
 * Turns +str+ into its successor in place and returns +str+.
 *
 * The callers visible in this file pass either a fresh copy or a string
 * that has already gone through rb_str_modify().
 *
 * Pass 1 walks the characters from right to left and increments the
 * rightmost alphanumeric, carrying leftwards on wrap-around.  If the
 * string holds no alphanumeric at all, pass 2 does the same with plain
 * character increments.  When a carry is still pending after the leftmost
 * affected character, the carried-out character is inserted before it and
 * the string grows by one character.
 */
static VALUE
str_succ(VALUE str)
{
    rb_encoding *enc;
    char *sbeg, *s, *e, *last_alnum = 0;
    int found_alnum = 0;
    long l, slen;
    /* default carry is one \x01 byte; both passes may replace it */
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    long carry_pos = 0, carry_len = 1;
    enum neighbor_char neighbor = NEIGHBOR_FOUND;

    slen = RSTRING_LEN(str);
    /* the successor of an empty string is the empty string itself */
    if (slen == 0) return str;

    enc = STR_ENC_GET(str);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + slen;

    /* pass 1: alphanumerics, right to left */
    while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
        if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
            /*
             * Non-alphanumerics have been skipped since the last wrapped
             * character: stop if the character reached now is of the other
             * class (letter vs. digit) than that wrapped one.
             */
            if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
                ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
                break;
            }
        }
        l = rb_enc_precise_mbclen(s, e, enc);
        /* skip positions that do not hold a valid character */
        if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
        l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        switch (neighbor) {
          case NEIGHBOR_NOT_CHAR:
            /* not an alphanumeric: leave it alone and keep scanning left */
            continue;
          case NEIGHBOR_FOUND:
            /* incremented without overflow: done */
            return str;
          case NEIGHBOR_WRAPPED:
            /* wrapped around: carry into the next alphanumeric on the left */
            last_alnum = s;
            break;
        }
        found_alnum = 1;
        /* remember where the carried-out character would have to be inserted */
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (!found_alnum) {         /* str contains no alnum */
        /* pass 2: increment plain characters, right to left */
        s = e;
        while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
            enum neighbor_char neighbor;
            char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
            l = rb_enc_precise_mbclen(s, e, enc);
            if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
            l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
            /* increment a scratch copy so that a failure leaves the string untouched */
            MEMCPY(tmp, s, char, l);
            neighbor = enc_succ_char(tmp, l, enc);
            switch (neighbor) {
              case NEIGHBOR_FOUND:
                MEMCPY(s, tmp, char, l);
                return str;
                break;
              case NEIGHBOR_WRAPPED:
                MEMCPY(s, tmp, char, l);
                break;
              case NEIGHBOR_NOT_CHAR:
                break;
            }
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* wrapped to \0...\0.  search next valid char. */
                enc_succ_char(s, l, enc);
            }
            if (!rb_enc_asciicompat(enc)) {
                /* for non-ASCII-compatible encodings the carry is a copy of this character */
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
        }
        /* bytes were rewritten: the cached code range is no longer reliable */
        ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
    }
    /* a carry is still pending: open a gap at carry_pos and insert it */
    RESIZE_CAPA(str, slen + carry_len);
    /* reload the pointer after RESIZE_CAPA before using it again */
    sbeg = RSTRING_PTR(str);
    s = sbeg + carry_pos;
    memmove(s + carry_len, s, slen - carry_pos);
    memmove(s, carry, carry_len);
    slen += carry_len;
    STR_SET_LEN(str, slen);
    TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
    rb_enc_str_coderange(str);
    return str;
}
4643
4644
4645 /*
4646  *  call-seq:
4647  *    succ! -> self
4648  *
4649  *  Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4650  *
4651  *  String#next! is an alias for String#succ!.
4652  */
4653
4654 static VALUE
4655 rb_str_succ_bang(VALUE str)
4656 {
4657     rb_str_modify(str);
4658     str_succ(str);
4659     return str;
4660 }
4661
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT,
 * 0 as soon as one does not.  A +len+ of zero or less yields 1 without
 * reading any byte.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4671
4672 static int
4673 str_upto_i(VALUE str, VALUE arg)
4674 {
4675     rb_yield(str);
4676     return 0;
4677 }
4678
4679 /*
4680  *  call-seq:
4681  *    upto(other_string, exclusive = false) {|string| ... } -> self
4682  *    upto(other_string, exclusive = false) -> new_enumerator
4683  *
4684  *  With a block given, calls the block with each \String value
4685  *  returned by successive calls to String#succ;
4686  *  the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4687  *  the sequence terminates when value +other_string+ is reached;
4688  *  returns +self+:
4689  *
 *    'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
 *
4691  *  Output:
4692  *
4693  *    a8 a9 b0 b1 b2 b3 b4 b5 b6
4694  *
4695  *  If argument +exclusive+ is given as a truthy object, the last value is omitted:
4696  *
4697  *    'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4698  *
4699  *  Output:
4700  *
4701  *    a8 a9 b0 b1 b2 b3 b4 b5
4702  *
4703  *  If +other_string+ would not be reached, does not call the block:
4704  *
4705  *    '25'.upto('5') {|s| fail s }
4706  *    'aa'.upto('a') {|s| fail s }
4707  *
4708  *  With no block given, returns a new \Enumerator:
4709  *
4710  *    'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4711  *
4712  */
4713
4714 static VALUE
4715 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4716 {
4717     VALUE end, exclusive;
4718
4719     rb_scan_args(argc, argv, "11", &end, &exclusive);
4720     RETURN_ENUMERATOR(beg, argc, argv);
4721     return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4722 }
4723
4724 VALUE
4725 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4726 {
4727     VALUE current, after_end;
4728     ID succ;
4729     int n, ascii;
4730     rb_encoding *enc;
4731
4732     CONST_ID(succ, "succ");
4733     StringValue(end);
4734     enc = rb_enc_check(beg, end);
4735     ascii = (is_ascii_string(beg) && is_ascii_string(end));
4736     /* single character */
4737     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4738         char c = RSTRING_PTR(beg)[0];
4739         char e = RSTRING_PTR(end)[0];
4740
4741         if (c > e || (excl && c == e)) return beg;
4742         for (;;) {
4743             if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4744             if (!excl && c == e) break;
4745             c++;
4746             if (excl && c == e) break;
4747         }
4748         return beg;
4749     }
4750     /* both edges are all digits */
4751     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4752         all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4753         all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4754         VALUE b, e;
4755         int width;
4756
4757         width = RSTRING_LENINT(beg);
4758         b = rb_str_to_inum(beg, 10, FALSE);
4759         e = rb_str_to_inum(end, 10, FALSE);
4760         if (FIXNUM_P(b) && FIXNUM_P(e)) {
4761             long bi = FIX2LONG(b);
4762             long ei = FIX2LONG(e);
4763             rb_encoding *usascii = rb_usascii_encoding();
4764
4765             while (bi <= ei) {
4766                 if (excl && bi == ei) break;
4767                 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4768                 bi++;
4769             }
4770         }
4771         else {
4772             ID op = excl ? '<' : idLE;
4773             VALUE args[2], fmt = rb_fstring_lit("%.*d");
4774
4775             args[0] = INT2FIX(width);
4776             while (rb_funcall(b, op, 1, e)) {
4777                 args[1] = b;
4778                 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4779                 b = rb_funcallv(b, succ, 0, 0);
4780             }
4781         }
4782         return beg;
4783     }
4784     /* normal case */
4785     n = rb_str_cmp(beg, end);
4786     if (n > 0 || (excl && n == 0)) return beg;
4787
4788     after_end = rb_funcallv(end, succ, 0, 0);
4789     current = str_duplicate(rb_cString, beg);
4790     while (!rb_str_equal(current, after_end)) {
4791         VALUE next = Qnil;
4792         if (excl || !rb_str_equal(current, end))
4793             next = rb_funcallv(current, succ, 0, 0);
4794         if ((*each)(current, arg)) break;
4795         if (NIL_P(next)) break;
4796         current = next;
4797         StringValue(current);
4798         if (excl && rb_str_equal(current, end)) break;
4799         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4800             break;
4801     }
4802
4803     return beg;
4804 }
4805
4806 VALUE
4807 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4808 {
4809     VALUE current;
4810     ID succ;
4811
4812     CONST_ID(succ, "succ");
4813     /* both edges are all digits */
4814     if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4815         all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4816         VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4817         int width = RSTRING_LENINT(beg);
4818         b = rb_str_to_inum(beg, 10, FALSE);
4819         if (FIXNUM_P(b)) {
4820             long bi = FIX2LONG(b);
4821             rb_encoding *usascii = rb_usascii_encoding();
4822
4823             while (FIXABLE(bi)) {
4824                 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4825                 bi++;
4826             }
4827             b = LONG2NUM(bi);
4828         }
4829         args[0] = INT2FIX(width);
4830         while (1) {
4831             args[1] = b;
4832             if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4833             b = rb_funcallv(b, succ, 0, 0);
4834         }
4835     }
4836     /* normal case */
4837     current = str_duplicate(rb_cString, beg);
4838     while (1) {
4839         VALUE next = rb_funcallv(current, succ, 0, 0);
4840         if ((*each)(current, arg)) break;
4841         current = next;
4842         StringValue(current);
4843         if (RSTRING_LEN(current) == 0)
4844             break;
4845     }
4846
4847     return beg;
4848 }
4849
4850 static int
4851 include_range_i(VALUE str, VALUE arg)
4852 {
4853     VALUE *argp = (VALUE *)arg;
4854     if (!rb_equal(str, *argp)) return 0;
4855     *argp = Qnil;
4856     return 1;
4857 }
4858
4859 VALUE
4860 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4861 {
4862     beg = rb_str_new_frozen(beg);
4863     StringValue(end);
4864     end = rb_str_new_frozen(end);
4865     if (NIL_P(val)) return Qfalse;
4866     val = rb_check_string_type(val);
4867     if (NIL_P(val)) return Qfalse;
4868     if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4869         rb_enc_asciicompat(STR_ENC_GET(end)) &&
4870         rb_enc_asciicompat(STR_ENC_GET(val))) {
4871         const char *bp = RSTRING_PTR(beg);
4872         const char *ep = RSTRING_PTR(end);
4873         const char *vp = RSTRING_PTR(val);
4874         if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4875             if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4876                 return Qfalse;
4877             else {
4878                 char b = *bp;
4879                 char e = *ep;
4880                 char v = *vp;
4881
4882                 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4883                     if (b <= v && v < e) return Qtrue;
4884                     return RBOOL(!RTEST(exclusive) && v == e);
4885                 }
4886             }
4887         }
4888 #if 0
4889         /* both edges are all digits */
4890         if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4891             all_digits_p(bp, RSTRING_LEN(beg)) &&
4892             all_digits_p(ep, RSTRING_LEN(end))) {
4893             /* TODO */
4894         }
4895 #endif
4896     }
4897     rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4898
4899     return RBOOL(NIL_P(val));
4900 }
4901
4902 static VALUE
4903 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4904 {
4905     if (rb_reg_search(re, str, 0, 0) >= 0) {
4906         VALUE match = rb_backref_get();
4907         int nth = rb_reg_backref_number(match, backref);
4908         return rb_reg_nth_match(nth, match);
4909     }
4910     return Qnil;
4911 }
4912
4913 static VALUE
4914 rb_str_aref(VALUE str, VALUE indx)
4915 {
4916     long idx;
4917
4918     if (FIXNUM_P(indx)) {
4919         idx = FIX2LONG(indx);
4920     }
4921     else if (RB_TYPE_P(indx, T_REGEXP)) {
4922         return rb_str_subpat(str, indx, INT2FIX(0));
4923     }
4924     else if (RB_TYPE_P(indx, T_STRING)) {
4925         if (rb_str_index(str, indx, 0) != -1)
4926             return str_duplicate(rb_cString, indx);
4927         return Qnil;
4928     }
4929     else {
4930         /* check if indx is Range */
4931         long beg, len = str_strlen(str, NULL);
4932         switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4933           case Qfalse:
4934             break;
4935           case Qnil:
4936             return Qnil;
4937           default:
4938             return rb_str_substr(str, beg, len);
4939         }
4940         idx = NUM2LONG(indx);
4941     }
4942
4943     return str_substr(str, idx, 1, FALSE);
4944 }
4945
4946
4947 /*
4948  *  call-seq:
4949  *    string[index] -> new_string or nil
4950  *    string[start, length] -> new_string or nil
4951  *    string[range] -> new_string or nil
4952  *    string[regexp, capture = 0] -> new_string or nil
4953  *    string[substring] -> new_string or nil
4954  *
4955  *  Returns the substring of +self+ specified by the arguments.
4956  *
4957  *  When the single \Integer argument +index+ is given,
4958  *  returns the 1-character substring found in +self+ at offset +index+:
4959  *
4960  *    'bar'[2] # => "r"
4961  *
4962  *  Counts backward from the end of +self+ if +index+ is negative:
4963  *
4964  *    'foo'[-3] # => "f"
4965  *
4966  *  Returns +nil+ if +index+ is out of range:
4967  *
4968  *    'foo'[3] # => nil
4969  *    'foo'[-4] # => nil
4970  *
4971  *  When the two \Integer arguments  +start+ and +length+ are given,
4972  *  returns the substring of the given +length+ found in +self+ at offset +start+:
4973  *
4974  *    'foo'[0, 2] # => "fo"
4975  *    'foo'[0, 0] # => ""
4976  *
4977  *  Counts backward from the end of +self+ if +start+ is negative:
4978  *
4979  *    'foo'[-2, 2] # => "oo"
4980  *
4981  *  Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4982  *
4983  *    'foo'[3, 2] # => ""
4984  *
4985  *  Returns +nil+ if +start+ is out of range:
4986  *
4987  *    'foo'[4, 2] # => nil
4988  *    'foo'[-4, 2] # => nil
4989  *
4990  *  Returns the trailing substring of +self+ if +length+ is large:
4991  *
4992  *    'foo'[1, 50] # => "oo"
4993  *
4994  *  Returns +nil+ if +length+ is negative:
4995  *
4996  *    'foo'[0, -1] # => nil
4997  *
4998  *  When the single \Range argument +range+ is given,
4999  *  derives +start+ and +length+ values from the given +range+,
5000  *  and returns values as above:
5001  *
5002  *  - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5003  *  - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5004  *
5005  *  When the \Regexp argument +regexp+ is given,
5006  *  and the +capture+ argument is <tt>0</tt>,
5007  *  returns the first matching substring found in +self+,
5008  *  or +nil+ if none found:
5009  *
5010  *    'foo'[/o/] # => "o"
5011  *    'foo'[/x/] # => nil
5012  *    s = 'hello there'
5013  *    s[/[aeiou](.)\1/] # => "ell"
5014  *    s[/[aeiou](.)\1/, 0] # => "ell"
5015  *
5016  *  If argument +capture+ is given and not <tt>0</tt>,
5017  *  it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5018  *  the method call returns only the specified capture
5019  *  (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5020  *
5021  *    s = 'hello there'
5022  *    s[/[aeiou](.)\1/, 1] # => "l"
5023  *    s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5024  *    s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5025  *
5026  *  If an invalid capture group index is given, +nil+ is returned.  If an invalid
5027  *  capture group name is given, +IndexError+ is raised.
5028  *
5029  *  When the single \String argument +substring+ is given,
5030  *  returns the substring from +self+ if found, otherwise +nil+:
5031  *
5032  *    'foo'['oo'] # => "oo"
5033  *    'foo'['xx'] # => nil
5034  *
5035  *  String#slice is an alias for String#[].
5036  */
5037
5038 static VALUE
5039 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5040 {
5041     if (argc == 2) {
5042         if (RB_TYPE_P(argv[0], T_REGEXP)) {
5043             return rb_str_subpat(str, argv[0], argv[1]);
5044         }
5045         else {
5046             long beg = NUM2LONG(argv[0]);
5047             long len = NUM2LONG(argv[1]);
5048             return rb_str_substr(str, beg, len);
5049         }
5050     }
5051     rb_check_arity(argc, 1, 2);
5052     return rb_str_aref(str, argv[0]);
5053 }
5054
5055 VALUE
5056 rb_str_drop_bytes(VALUE str, long len)
5057 {
5058     char *ptr = RSTRING_PTR(str);
5059     long olen = RSTRING_LEN(str), nlen;
5060
5061     str_modifiable(str);
5062     if (len > olen) len = olen;
5063     nlen = olen - len;
5064     if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5065         char *oldptr = ptr;
5066         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5067         STR_SET_EMBED(str);
5068         STR_SET_EMBED_LEN(str, nlen);
5069         ptr = RSTRING(str)->as.embed.ary;
5070         memmove(ptr, oldptr + len, nlen);
5071         if (fl == STR_NOEMBED) xfree(oldptr);
5072     }
5073     else {
5074         if (!STR_SHARED_P(str)) {
5075             VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5076             rb_enc_cr_str_exact_copy(shared, str);
5077             OBJ_FREEZE(shared);
5078         }
5079         ptr = RSTRING(str)->as.heap.ptr += len;
5080         RSTRING(str)->as.heap.len = nlen;
5081     }
5082     ptr[nlen] = 0;
5083     ENC_CODERANGE_CLEAR(str);
5084     return str;
5085 }
5086
5087 static void
5088 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5089 {
5090     char *sptr;
5091     long slen, vlen = RSTRING_LEN(val);
5092     int cr;
5093
5094     if (beg == 0 && vlen == 0) {
5095         rb_str_drop_bytes(str, len);
5096         return;
5097     }
5098
5099     str_modify_keep_cr(str);
5100     RSTRING_GETMEM(str, sptr, slen);
5101     if (len < vlen) {
5102         /* expand string */
5103         RESIZE_CAPA(str, slen + vlen - len);
5104         sptr = RSTRING_PTR(str);
5105     }
5106
5107     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5108         cr = rb_enc_str_coderange(val);
5109     else
5110         cr = ENC_CODERANGE_UNKNOWN;
5111
5112     if (vlen != len) {
5113         memmove(sptr + beg + vlen,
5114                 sptr + beg + len,
5115                 slen - (beg + len));
5116     }
5117     if (vlen < beg && len < 0) {
5118         MEMZERO(sptr + slen, char, -len);
5119     }
5120     if (vlen > 0) {
5121         memmove(sptr + beg, RSTRING_PTR(val), vlen);
5122     }
5123     slen += vlen - len;
5124     STR_SET_LEN(str, slen);
5125     TERM_FILL(&sptr[slen], TERM_LEN(str));
5126     ENC_CODERANGE_SET(str, cr);
5127 }
5128
5129 void
5130 rb_str_update(VALUE str, long beg, long len, VALUE val)
5131 {
5132     long slen;
5133     char *p, *e;
5134     rb_encoding *enc;
5135     int singlebyte = single_byte_optimizable(str);
5136     int cr;
5137
5138     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5139
5140     StringValue(val);
5141     enc = rb_enc_check(str, val);
5142     slen = str_strlen(str, enc); /* rb_enc_check */
5143
5144     if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5145         rb_raise(rb_eIndexError, "index %ld out of string", beg);
5146     }
5147     if (beg < 0) {
5148         beg += slen;
5149     }
5150     assert(beg >= 0);
5151     assert(beg <= slen);
5152     if (len > slen - beg) {
5153         len = slen - beg;
5154     }
5155     str_modify_keep_cr(str);
5156     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5157     if (!p) p = RSTRING_END(str);
5158     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5159     if (!e) e = RSTRING_END(str);
5160     /* error check */
5161     beg = p - RSTRING_PTR(str); /* physical position */
5162     len = e - p;                /* physical length */
5163     rb_str_splice_0(str, beg, len, val);
5164     rb_enc_associate(str, enc);
5165     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
5166     if (cr != ENC_CODERANGE_BROKEN)
5167         ENC_CODERANGE_SET(str, cr);
5168 }
5169
/* File-local shorthand: rb_str_splice() is rb_str_update() under another name. */
#define rb_str_splice(s, pos, n, v) rb_str_update((s), (pos), (n), (v))
5171
5172 static void
5173 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5174 {
5175     int nth;
5176     VALUE match;
5177     long start, end, len;
5178     rb_encoding *enc;
5179     struct re_registers *regs;
5180
5181     if (rb_reg_search(re, str, 0, 0) < 0) {
5182         rb_raise(rb_eIndexError, "regexp not matched");
5183     }
5184     match = rb_backref_get();
5185     nth = rb_reg_backref_number(match, backref);
5186     regs = RMATCH_REGS(match);
5187     if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5188         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5189     }
5190     if (nth < 0) {
5191         nth += regs->num_regs;
5192     }
5193
5194     start = BEG(nth);
5195     if (start == -1) {
5196         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5197     }
5198     end = END(nth);
5199     len = end - start;
5200     StringValue(val);
5201     enc = rb_enc_check_str(str, val);
5202     rb_str_splice_0(str, start, len, val);
5203     rb_enc_associate(str, enc);
5204 }
5205
5206 static VALUE
5207 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5208 {
5209     long idx, beg;
5210
5211     switch (TYPE(indx)) {
5212       case T_REGEXP:
5213         rb_str_subpat_set(str, indx, INT2FIX(0), val);
5214         return val;
5215
5216       case T_STRING:
5217         beg = rb_str_index(str, indx, 0);
5218         if (beg < 0) {
5219             rb_raise(rb_eIndexError, "string not matched");
5220         }
5221         beg = rb_str_sublen(str, beg);
5222         rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5223         return val;
5224
5225       default:
5226         /* check if indx is Range */
5227         {
5228             long beg, len;
5229             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5230                 rb_str_splice(str, beg, len, val);
5231                 return val;
5232             }
5233         }
5234         /* FALLTHROUGH */
5235
5236       case T_FIXNUM:
5237         idx = NUM2LONG(indx);
5238         rb_str_splice(str, idx, 1, val);
5239         return val;
5240     }
5241 }
5242
5243 /*
5244  *  call-seq:
5245  *     str[integer] = new_str
5246  *     str[integer, integer] = new_str
5247  *     str[range] = aString
5248  *     str[regexp] = new_str
5249  *     str[regexp, integer] = new_str
5250  *     str[regexp, name] = new_str
5251  *     str[other_str] = new_str
5252  *
5253  *  Element Assignment---Replaces some or all of the content of
5254  *  <i>str</i>. The portion of the string affected is determined using
5255  *  the same criteria as String#[]. If the replacement string is not
5256  *  the same length as the text it is replacing, the string will be
 *  adjusted accordingly. If the regular expression or string used
 *  as the index doesn't match a position in the string, IndexError is
 *  raised. If the regular expression form is used, the optional
 *  second Integer allows you to specify which portion of the match to
 *  replace (effectively using the MatchData indexing rules). The forms
5262  *  that take an Integer will raise an IndexError if the value is out
5263  *  of range; the Range form will raise a RangeError, and the Regexp
5264  *  and String will raise an IndexError on negative match.
5265  */
5266
5267 static VALUE
5268 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5269 {
5270     if (argc == 3) {
5271         if (RB_TYPE_P(argv[0], T_REGEXP)) {
5272             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5273         }
5274         else {
5275             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5276         }
5277         return argv[2];
5278     }
5279     rb_check_arity(argc, 2, 3);
5280     return rb_str_aset(str, argv[0], argv[1]);
5281 }
5282
5283 /*
5284  *  call-seq:
5285  *    insert(index, other_string) -> self
5286  *
5287  *  Inserts the given +other_string+ into +self+; returns +self+.
5288  *
5289  *  If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5290  *
5291  *    'foo'.insert(1, 'bar') # => "fbaroo"
5292  *
5293  *  If the \Integer +index+ is negative, counts backward from the end of +self+
5294  *  and inserts +other_string+ at offset <tt>index+1</tt>
5295  *  (that is, _after_ <tt>self[index]</tt>):
5296  *
5297  *    'foo'.insert(-2, 'bar') # => "fobaro"
5298  *
5299  */
5300
5301 static VALUE
5302 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5303 {
5304     long pos = NUM2LONG(idx);
5305
5306     if (pos == -1) {
5307         return rb_str_append(str, str2);
5308     }
5309     else if (pos < 0) {
5310         pos++;
5311     }
5312     rb_str_splice(str, pos, 0, str2);
5313     return str;
5314 }
5315
5316
5317 /*
5318  *  call-seq:
5319  *     slice!(index)               -> new_string or nil
5320  *     slice!(start, length)       -> new_string or nil
5321  *     slice!(range)               -> new_string or nil
5322  *     slice!(regexp, capture = 0) -> new_string or nil
5323  *     slice!(substring)           -> new_string or nil
5324  *
5325  *  Removes the substring of +self+ specified by the arguments;
5326  *  returns the removed substring.
5327  *
5328  *  See String#[] for details about the arguments that specify the substring.
5329  *
5330  *  A few examples:
5331  *
5332  *     string = "This is a string"
5333  *     string.slice!(2)        #=> "i"
5334  *     string.slice!(3..6)     #=> " is "
5335  *     string.slice!(/s.*t/)   #=> "sa st"
5336  *     string.slice!("r")      #=> "r"
5337  *     string                  #=> "Thing"
5338  *
5339  */
5340
5341 static VALUE
5342 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5343 {
5344     VALUE result = Qnil;
5345     VALUE indx;
5346     long beg, len = 1;
5347     char *p;
5348
5349     rb_check_arity(argc, 1, 2);
5350     str_modify_keep_cr(str);
5351     indx = argv[0];
5352     if (RB_TYPE_P(indx, T_REGEXP)) {
5353         if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5354         VALUE match = rb_backref_get();
5355         struct re_registers *regs = RMATCH_REGS(match);
5356         int nth = 0;
5357         if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5358             if ((nth += regs->num_regs) <= 0) return Qnil;
5359         }
5360         else if (nth >= regs->num_regs) return Qnil;
5361         beg = BEG(nth);
5362         len = END(nth) - beg;
5363         goto subseq;
5364     }
5365     else if (argc == 2) {
5366         beg = NUM2LONG(indx);
5367         len = NUM2LONG(argv[1]);
5368         goto num_index;
5369     }
5370     else if (FIXNUM_P(indx)) {
5371         beg = FIX2LONG(indx);
5372         if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5373         if (!len) return Qnil;
5374         beg = p - RSTRING_PTR(str);
5375         goto subseq;
5376     }
5377     else if (RB_TYPE_P(indx, T_STRING)) {
5378         beg = rb_str_index(str, indx, 0);
5379         if (beg == -1) return Qnil;
5380         len = RSTRING_LEN(indx);
5381         result = str_duplicate(rb_cString, indx);
5382         goto squash;
5383     }
5384     else {
5385         switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5386           case Qnil:
5387             return Qnil;
5388           case Qfalse:
5389             beg = NUM2LONG(indx);
5390             if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5391             if (!len) return Qnil;
5392             beg = p - RSTRING_PTR(str);
5393             goto subseq;
5394           default:
5395             goto num_index;
5396         }
5397     }
5398
5399   num_index:
5400     if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5401     beg = p - RSTRING_PTR(str);
5402
5403   subseq:
5404     result = rb_str_new(RSTRING_PTR(str)+beg, len);
5405     rb_enc_cr_str_copy_for_substr(result, str);
5406
5407   squash:
5408     if (len > 0) {
5409         if (beg == 0) {
5410             rb_str_drop_bytes(str, len);
5411         }
5412         else {
5413             char *sptr = RSTRING_PTR(str);
5414             long slen = RSTRING_LEN(str);
5415             if (beg + len > slen) /* pathological check */
5416                 len = slen - beg;
5417             memmove(sptr + beg,
5418                     sptr + beg + len,
5419                     slen - (beg + len));
5420             slen -= len;
5421             STR_SET_LEN(str, slen);
5422             TERM_FILL(&sptr[slen], TERM_LEN(str));
5423         }
5424     }
5425     return result;
5426 }
5427
5428 static VALUE
5429 get_pat(VALUE pat)
5430 {
5431     VALUE val;
5432
5433     switch (OBJ_BUILTIN_TYPE(pat)) {
5434       case T_REGEXP:
5435         return pat;
5436
5437       case T_STRING:
5438         break;
5439
5440       default:
5441         val = rb_check_string_type(pat);
5442         if (NIL_P(val)) {
5443             Check_Type(pat, T_REGEXP);
5444         }
5445         pat = val;
5446     }
5447
5448     return rb_reg_regcomp(pat);
5449 }
5450
5451 static VALUE
5452 get_pat_quoted(VALUE pat, int check)
5453 {
5454     VALUE val;
5455
5456     switch (OBJ_BUILTIN_TYPE(pat)) {
5457       case T_REGEXP:
5458         return pat;
5459
5460       case T_STRING:
5461         break;
5462
5463       default:
5464         val = rb_check_string_type(pat);
5465         if (NIL_P(val)) {
5466             Check_Type(pat, T_REGEXP);
5467         }
5468         pat = val;
5469     }
5470     if (check && is_broken_string(pat)) {
5471         rb_exc_raise(rb_reg_check_preprocess(pat));
5472     }
5473     return pat;
5474 }
5475
5476 static long
5477 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5478 {
5479     if (BUILTIN_TYPE(pat) == T_STRING) {
5480         pos = rb_strseq_index(str, pat, pos, 1);
5481         if (set_backref_str) {
5482             if (pos >= 0) {
5483                 str = rb_str_new_frozen_String(str);
5484                 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5485             }
5486             else {
5487                 rb_backref_set(Qnil);
5488             }
5489         }
5490         return pos;
5491     }
5492     else {
5493         return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5494     }
5495 }
5496
5497
5498 /*
5499  *  call-seq:
5500  *    sub!(pattern, replacement)   -> self or nil
5501  *    sub!(pattern) {|match| ... } -> self or nil
5502  *
5503  *  Returns +self+ with only the first occurrence
5504  *  (not all occurrences) of the given +pattern+ replaced.
5505  *
5506  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5507  *
5508  *  Related: String#sub, String#gsub, String#gsub!.
5509  *
5510  */
5511
5512 static VALUE
5513 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5514 {
5515     VALUE pat, repl, hash = Qnil;
5516     int iter = 0;
5517     long plen;
5518     int min_arity = rb_block_given_p() ? 1 : 2;
5519     long beg;
5520
5521     rb_check_arity(argc, min_arity, 2);
5522     if (argc == 1) {
5523         iter = 1;
5524     }
5525     else {
5526         repl = argv[1];
5527         hash = rb_check_hash_type(argv[1]);
5528         if (NIL_P(hash)) {
5529             StringValue(repl);
5530         }
5531     }
5532
5533     pat = get_pat_quoted(argv[0], 1);
5534
5535     str_modifiable(str);
5536     beg = rb_pat_search(pat, str, 0, 1);
5537     if (beg >= 0) {
5538         rb_encoding *enc;
5539         int cr = ENC_CODERANGE(str);
5540         long beg0, end0;
5541         VALUE match, match0 = Qnil;
5542         struct re_registers *regs;
5543         char *p, *rp;
5544         long len, rlen;
5545
5546         match = rb_backref_get();
5547         regs = RMATCH_REGS(match);
5548         if (RB_TYPE_P(pat, T_STRING)) {
5549             beg0 = beg;
5550             end0 = beg0 + RSTRING_LEN(pat);
5551             match0 = pat;
5552         }
5553         else {
5554             beg0 = BEG(0);
5555             end0 = END(0);
5556             if (iter) match0 = rb_reg_nth_match(0, match);
5557         }
5558
5559         if (iter || !NIL_P(hash)) {
5560             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5561
5562             if (iter) {
5563                 repl = rb_obj_as_string(rb_yield(match0));
5564             }
5565             else {
5566                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5567                 repl = rb_obj_as_string(repl);
5568             }
5569             str_mod_check(str, p, len);
5570             rb_check_frozen(str);
5571         }
5572         else {
5573             repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5574         }
5575
5576         enc = rb_enc_compatible(str, repl);
5577         if (!enc) {
5578             rb_encoding *str_enc = STR_ENC_GET(str);
5579             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5580             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5581                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5582                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5583                          rb_enc_name(str_enc),
5584                          rb_enc_name(STR_ENC_GET(repl)));
5585             }
5586             enc = STR_ENC_GET(repl);
5587         }
5588         rb_str_modify(str);
5589         rb_enc_associate(str, enc);
5590         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
5591             int cr2 = ENC_CODERANGE(repl);
5592             if (cr2 == ENC_CODERANGE_BROKEN ||
5593                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5594                 cr = ENC_CODERANGE_UNKNOWN;
5595             else
5596                 cr = cr2;
5597         }
5598         plen = end0 - beg0;
5599         rlen = RSTRING_LEN(repl);
5600         len = RSTRING_LEN(str);
5601         if (rlen > plen) {
5602             RESIZE_CAPA(str, len + rlen - plen);
5603         }
5604         p = RSTRING_PTR(str);
5605         if (rlen != plen) {
5606             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5607         }
5608         rp = RSTRING_PTR(repl);
5609         memmove(p + beg0, rp, rlen);
5610         len += rlen - plen;
5611         STR_SET_LEN(str, len);
5612         TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5613         ENC_CODERANGE_SET(str, cr);
5614
5615         return str;
5616     }
5617     return Qnil;
5618 }
5619
5620
5621 /*
5622  *  call-seq:
5623  *    sub(pattern, replacement)   -> new_string
5624  *    sub(pattern) {|match| ... } -> new_string
5625  *
5626  *  Returns a copy of +self+ with only the first occurrence
5627  *  (not all occurrences) of the given +pattern+ replaced.
5628  *
5629  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5630  *
5631  *  Related: String#sub!, String#gsub, String#gsub!.
5632  *
5633  */
5634
5635 static VALUE
5636 rb_str_sub(int argc, VALUE *argv, VALUE str)
5637 {
5638     str = str_duplicate(rb_cString, str);
5639     rb_str_sub_bang(argc, argv, str);
5640     return str;
5641 }
5642
5643 static VALUE
5644 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5645 {
5646     VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5647     struct re_registers *regs;
5648     long beg, beg0, end0;
5649     long offset, blen, slen, len, last;
5650     enum {STR, ITER, MAP} mode = STR;
5651     char *sp, *cp;
5652     int need_backref = -1;
5653     rb_encoding *str_enc;
5654
5655     switch (argc) {
5656       case 1:
5657         RETURN_ENUMERATOR(str, argc, argv);
5658         mode = ITER;
5659         break;
5660       case 2:
5661         repl = argv[1];
5662         hash = rb_check_hash_type(argv[1]);
5663         if (NIL_P(hash)) {
5664             StringValue(repl);
5665         }
5666         else {
5667             mode = MAP;
5668         }
5669         break;
5670       default:
5671         rb_error_arity(argc, 1, 2);
5672     }
5673
5674     pat = get_pat_quoted(argv[0], 1);
5675     beg = rb_pat_search(pat, str, 0, need_backref);
5676     if (beg < 0) {
5677         if (bang) return Qnil;  /* no match, no substitution */
5678         return str_duplicate(rb_cString, str);
5679     }
5680
5681     offset = 0;
5682     blen = RSTRING_LEN(str) + 30; /* len + margin */
5683     dest = rb_str_buf_new(blen);
5684     sp = RSTRING_PTR(str);
5685     slen = RSTRING_LEN(str);
5686     cp = sp;
5687     str_enc = STR_ENC_GET(str);
5688     rb_enc_associate(dest, str_enc);
5689     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5690
5691     do {
5692         match = rb_backref_get();
5693         regs = RMATCH_REGS(match);
5694         if (RB_TYPE_P(pat, T_STRING)) {
5695             beg0 = beg;
5696             end0 = beg0 + RSTRING_LEN(pat);
5697             match0 = pat;
5698         }
5699         else {
5700             beg0 = BEG(0);
5701             end0 = END(0);
5702             if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5703         }
5704
5705         if (mode) {
5706             if (mode == ITER) {
5707                 val = rb_obj_as_string(rb_yield(match0));
5708             }
5709             else {
5710                 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5711                 val = rb_obj_as_string(val);
5712             }
5713             str_mod_check(str, sp, slen);
5714             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
5715                 rb_raise(rb_eRuntimeError, "block should not cheat");
5716             }
5717         }
5718         else if (need_backref) {
5719             val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5720             if (need_backref < 0) {
5721                 need_backref = val != repl;
5722             }
5723         }
5724         else {
5725             val = repl;
5726         }
5727
5728         len = beg0 - offset;    /* copy pre-match substr */
5729         if (len) {
5730             rb_enc_str_buf_cat(dest, cp, len, str_enc);
5731         }
5732
5733         rb_str_buf_append(dest, val);
5734
5735         last = offset;
5736         offset = end0;
5737         if (beg0 == end0) {
5738             /*
5739              * Always consume at least one character of the input string
5740              * in order to prevent infinite loops.
5741              */
5742             if (RSTRING_LEN(str) <= end0) break;
5743             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5744             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5745             offset = end0 + len;
5746         }
5747         cp = RSTRING_PTR(str) + offset;
5748         if (offset > RSTRING_LEN(str)) break;
5749         beg = rb_pat_search(pat, str, offset, need_backref);
5750     } while (beg >= 0);
5751     if (RSTRING_LEN(str) > offset) {
5752         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5753     }
5754     rb_pat_search(pat, str, last, 1);
5755     if (bang) {
5756         str_shared_replace(str, dest);
5757     }
5758     else {
5759         str = dest;
5760     }
5761
5762     return str;
5763 }
5764
5765
5766 /*
5767  *  call-seq:
5768  *     gsub!(pattern, replacement)   -> self or nil
5769  *     gsub!(pattern) {|match| ... } -> self or nil
5770  *     gsub!(pattern)                -> an_enumerator
5771  *
5772  *  Performs the specified substring replacement(s) on +self+;
5773  *  returns +self+ if any replacement occurred, +nil+ otherwise.
5774  *
5775  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5776  *
5777  *  Returns an Enumerator if no +replacement+ and no block given.
5778  *
5779  *  Related: String#sub, String#gsub, String#sub!.
5780  *
5781  */
5782
5783 static VALUE
5784 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5785 {
5786     str_modify_keep_cr(str);
5787     return str_gsub(argc, argv, str, 1);
5788 }
5789
5790
5791 /*
5792  *  call-seq:
5793  *     gsub(pattern, replacement)   -> new_string
5794  *     gsub(pattern) {|match| ... } -> new_string
5795  *     gsub(pattern)                -> enumerator
5796  *
5797  *  Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5798  *
5799  *  See {Substitution Methods}[#class-String-label-Substitution+Methods].
5800  *
5801  *  Returns an Enumerator if no +replacement+ and no block given.
5802  *
5803  *  Related: String#sub, String#sub!, String#gsub!.
5804  *
5805  */
5806
5807 static VALUE
5808 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5809 {
5810     return str_gsub(argc, argv, str, 0);
5811 }
5812
5813
5814 /*
5815  *  call-seq:
5816  *    replace(other_string) -> self
5817  *
5818  *  Replaces the contents of +self+ with the contents of +other_string+:
5819  *
5820  *    s = 'foo'        # => "foo"
5821  *    s.replace('bar') # => "bar"
5822  *
5823  */
5824
5825 VALUE
5826 rb_str_replace(VALUE str, VALUE str2)
5827 {
5828     str_modifiable(str);
5829     if (str == str2) return str;
5830
5831     StringValue(str2);
5832     str_discard(str);
5833     return str_replace(str, str2);
5834 }
5835
5836 /*
5837  *  call-seq:
5838  *    clear -> self
5839  *
5840  *  Removes the contents of +self+:
5841  *
5842  *    s = 'foo' # => "foo"
5843  *    s.clear   # => ""
5844  *
5845  */
5846
5847 static VALUE
5848 rb_str_clear(VALUE str)
5849 {
5850     str_discard(str);
5851     STR_SET_EMBED(str);
5852     STR_SET_EMBED_LEN(str, 0);
5853     RSTRING_PTR(str)[0] = 0;
5854     if (rb_enc_asciicompat(STR_ENC_GET(str)))
5855         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
5856     else
5857         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5858     return str;
5859 }
5860
5861 /*
5862  *  call-seq:
5863  *    chr -> string
5864  *
5865  *  Returns a string containing the first character of +self+:
5866  *
5867  *    s = 'foo' # => "foo"
5868  *    s.chr     # => "f"
5869  *
5870  */
5871
5872 static VALUE
5873 rb_str_chr(VALUE str)
5874 {
5875     return rb_str_substr(str, 0, 1);
5876 }
5877
5878 /*
5879  *  call-seq:
5880  *    getbyte(index) -> integer
5881  *
5882  *  Returns the byte at zero-based +index+ as an integer:
5883  *
5884  *    s = 'abcde'  # => "abcde"
5885  *    s.getbyte(0) # => 97
5886  *    s.getbyte(1) # => 98
5887  *
5888  *  Related: String#setbyte.
5889  */
5890 static VALUE
5891 rb_str_getbyte(VALUE str, VALUE index)
5892 {
5893     long pos = NUM2LONG(index);
5894
5895     if (pos < 0)
5896         pos += RSTRING_LEN(str);
5897     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
5898         return Qnil;
5899
5900     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5901 }
5902
5903 /*
5904  *  call-seq:
5905  *    setbyte(index, integer) -> integer
5906  *
5907  *  Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5908  *
5909  *    s = 'abcde'      # => "abcde"
5910  *    s.setbyte(0, 98) # => 98
5911  *    s                # => "bbcde"
5912  *
5913  *  Related: String#getbyte.
5914  */
5915 static VALUE
5916 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5917 {
5918     long pos = NUM2LONG(index);
5919     long len = RSTRING_LEN(str);
5920     char *ptr, *head, *left = 0;
5921     rb_encoding *enc;
5922     int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5923
5924     if (pos < -len || len <= pos)
5925         rb_raise(rb_eIndexError, "index %ld out of string", pos);
5926     if (pos < 0)
5927         pos += len;
5928
5929     VALUE v = rb_to_int(value);
5930     VALUE w = rb_int_and(v, INT2FIX(0xff));
5931     char byte = (char)(NUM2INT(w) & 0xFF);
5932
5933     if (!str_independent(str))
5934         str_make_independent(str);
5935     enc = STR_ENC_GET(str);
5936     head = RSTRING_PTR(str);
5937     ptr = &head[pos];
5938     if (!STR_EMBED_P(str)) {
5939         cr = ENC_CODERANGE(str);
5940         switch (cr) {
5941           case ENC_CODERANGE_7BIT:
5942             left = ptr;
5943             *ptr = byte;
5944             if (ISASCII(byte)) goto end;
5945             nlen = rb_enc_precise_mbclen(left, head+len, enc);
5946             if (!MBCLEN_CHARFOUND_P(nlen))
5947                 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5948             else
5949                 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
5950             goto end;
5951           case ENC_CODERANGE_VALID:
5952             left = rb_enc_left_char_head(head, ptr, head+len, enc);
5953             width = rb_enc_precise_mbclen(left, head+len, enc);
5954             *ptr = byte;
5955             nlen = rb_enc_precise_mbclen(left, head+len, enc);
5956             if (!MBCLEN_CHARFOUND_P(nlen))
5957                 ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
5958             else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5959                 ENC_CODERANGE_CLEAR(str);
5960             goto end;
5961         }
5962     }
5963     ENC_CODERANGE_CLEAR(str);
5964     *ptr = byte;
5965
5966   end:
5967     return value;
5968 }
5969
5970 static VALUE
5971 str_byte_substr(VALUE str, long beg, long len, int empty)
5972 {
5973     char *p, *s = RSTRING_PTR(str);
5974     long n = RSTRING_LEN(str);
5975     VALUE str2;
5976
5977     if (beg > n || len < 0) return Qnil;
5978     if (beg < 0) {
5979         beg += n;
5980         if (beg < 0) return Qnil;
5981     }
5982     if (len > n - beg)
5983         len = n - beg;
5984     if (len <= 0) {
5985         if (!empty) return Qnil;
5986         len = 0;
5987         p = 0;
5988     }
5989     else
5990         p = s + beg;
5991
5992     if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
5993         str2 = rb_str_new_frozen(str);
5994         str2 = str_new_shared(rb_cString, str2);
5995         RSTRING(str2)->as.heap.ptr += beg;
5996         RSTRING(str2)->as.heap.len = len;
5997     }
5998     else {
5999         str2 = rb_str_new(p, len);
6000     }
6001
6002     str_enc_copy(str2, str);
6003
6004     if (RSTRING_LEN(str2) == 0) {
6005         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6006             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
6007         else
6008             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6009     }
6010     else {
6011         switch (ENC_CODERANGE(str)) {
6012           case ENC_CODERANGE_7BIT:
6013             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6014             break;
6015           default:
6016             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
6017             break;
6018         }
6019     }
6020
6021     return str2;
6022 }
6023
6024 static VALUE
6025 str_byte_aref(VALUE str, VALUE indx)
6026 {
6027     long idx;
6028     if (FIXNUM_P(indx)) {
6029         idx = FIX2LONG(indx);
6030     }
6031     else {
6032         /* check if indx is Range */
6033         long beg, len = RSTRING_LEN(str);
6034
6035         switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6036           case Qfalse:
6037             break;
6038           case Qnil:
6039             return Qnil;
6040           default:
6041             return str_byte_substr(str, beg, len, TRUE);
6042         }
6043
6044         idx = NUM2LONG(indx);
6045     }
6046     return str_byte_substr(str, idx, 1, FALSE);
6047 }
6048
6049 /*
6050  *  call-seq:
6051  *    byteslice(index, length = 1) -> string or nil
6052  *    byteslice(range)             -> string or nil
6053  *
6054  *  Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6055  *
6056  *  With integer arguments +index+ and +length+ given,
6057  *  returns the substring beginning at the given +index+
6058  *  of the given +length+ (if possible),
6059  *  or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6060  *
6061  *    s = '0123456789' # => "0123456789"
6062  *    s.byteslice(2)   # => "2"
6063  *    s.byteslice(200) # => nil
6064  *    s.byteslice(4, 3)  # => "456"
6065  *    s.byteslice(4, 30) # => "456789"
6066  *    s.byteslice(4, -1) # => nil
6067  *    s.byteslice(40, 2) # => nil
6068  *
6069  *  In either case above, counts backwards from the end of +self+
6070  *  if +index+ is negative:
6071  *
6072  *    s = '0123456789'   # => "0123456789"
6073  *    s.byteslice(-4)    # => "6"
6074  *    s.byteslice(-4, 3) # => "678"
6075  *
6076  *  With Range argument +range+ given, returns
6077  *  <tt>byteslice(range.begin, range.size)</tt>:
6078  *
6079  *    s = '0123456789'    # => "0123456789"
6080  *    s.byteslice(4..6)   # => "456"
6081  *    s.byteslice(-6..-4) # => "456"
6082  *    s.byteslice(5..2)   # => "" # range.size is zero.
6083  *    s.byteslice(40..42) # => nil
6084  *
6085  *  In all cases, a returned string has the same encoding as +self+:
6086  *
6087  *    s.encoding              # => #<Encoding:UTF-8>
6088  *    s.byteslice(4).encoding # => #<Encoding:UTF-8>
6089  *
6090  */
6091
6092 static VALUE
6093 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6094 {
6095     if (argc == 2) {
6096         long beg = NUM2LONG(argv[0]);
6097         long end = NUM2LONG(argv[1]);
6098         return str_byte_substr(str, beg, end, TRUE);
6099     }
6100     rb_check_arity(argc, 1, 2);
6101     return str_byte_aref(str, argv[0]);
6102 }
6103
6104 /*
6105  *  call-seq:
6106  *    reverse -> string
6107  *
6108  *  Returns a new string with the characters from +self+ in reverse order.
6109  *
6110  *    'stressed'.reverse # => "desserts"
6111  *
6112  */
6113
6114 static VALUE
6115 rb_str_reverse(VALUE str)
6116 {
6117     rb_encoding *enc;
6118     VALUE rev;
6119     char *s, *e, *p;
6120     int cr;
6121
6122     if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6123     enc = STR_ENC_GET(str);
6124     rev = rb_str_new(0, RSTRING_LEN(str));
6125     s = RSTRING_PTR(str); e = RSTRING_END(str);
6126     p = RSTRING_END(rev);
6127     cr = ENC_CODERANGE(str);
6128
6129     if (RSTRING_LEN(str) > 1) {
6130         if (single_byte_optimizable(str)) {
6131             while (s < e) {
6132                 *--p = *s++;
6133             }
6134         }
6135         else if (cr == ENC_CODERANGE_VALID) {
6136             while (s < e) {
6137                 int clen = rb_enc_fast_mbclen(s, e, enc);
6138
6139                 p -= clen;
6140                 memcpy(p, s, clen);
6141                 s += clen;
6142             }
6143         }
6144         else {
6145             cr = rb_enc_asciicompat(enc) ?
6146                 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
6147             while (s < e) {
6148                 int clen = rb_enc_mbclen(s, e, enc);
6149
6150                 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6151                 p -= clen;
6152                 memcpy(p, s, clen);
6153                 s += clen;
6154             }
6155         }
6156     }
6157     STR_SET_LEN(rev, RSTRING_LEN(str));
6158     str_enc_copy(rev, str);
6159     ENC_CODERANGE_SET(rev, cr);
6160
6161     return rev;
6162 }
6163
6164
6165 /*
6166  *  call-seq:
6167  *    reverse! -> self
6168  *
6169  *  Returns +self+ with its characters reversed:
6170  *
6171  *    s = 'stressed'
6172  *    s.reverse! # => "desserts"
6173  *    s          # => "desserts"
6174  *
6175  */
6176
6177 static VALUE
6178 rb_str_reverse_bang(VALUE str)
6179 {
6180     if (RSTRING_LEN(str) > 1) {
6181         if (single_byte_optimizable(str)) {
6182             char *s, *e, c;
6183
6184             str_modify_keep_cr(str);
6185             s = RSTRING_PTR(str);
6186             e = RSTRING_END(str) - 1;
6187             while (s < e) {
6188                 c = *s;
6189                 *s++ = *e;
6190                 *e-- = c;
6191             }
6192         }
6193         else {
6194             str_shared_replace(str, rb_str_reverse(str));
6195         }
6196     }
6197     else {
6198         str_modify_keep_cr(str);
6199     }
6200     return str;
6201 }
6202
6203
6204 /*
6205  *  call-seq:
6206  *    include? other_string -> true or false
6207  *
6208  *  Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6209  *
6210  *    s = 'foo'
6211  *    s.include?('f')    # => true
6212  *    s.include?('fo')   # => true
6213  *    s.include?('food') # => false
6214  *
6215  */
6216
6217 static VALUE
6218 rb_str_include(VALUE str, VALUE arg)
6219 {
6220     long i;
6221
6222     StringValue(arg);
6223     i = rb_str_index(str, arg, 0);
6224
6225     return RBOOL(i != -1);
6226 }
6227
6228
6229 /*
6230  *  call-seq:
6231  *    to_i(base = 10) -> integer
6232  *
6233  *  Returns the result of interpreting leading characters in +self+
6234  *  as an integer in the given +base+ (which must be in (2..36)):
6235  *
6236  *    '123456'.to_i     # => 123456
6237  *    '123def'.to_i(16) # => 1195503
6238  *
6239  *  Characters past a leading valid number (in the given +base+) are ignored:
6240  *
6241  *    '12.345'.to_i   # => 12
6242  *    '12345'.to_i(2) # => 1
6243  *
6244  *  Returns zero if there is no leading valid number:
6245  *
6246  *    'abcdef'.to_i # => 0
6247  *    '2'.to_i(2)   # => 0
6248  *
6249  */
6250
6251 static VALUE
6252 rb_str_to_i(int argc, VALUE *argv, VALUE str)
6253 {
6254     int base = 10;
6255
6256     if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6257         rb_raise(rb_eArgError, "invalid radix %d", base);
6258     }
6259     return rb_str_to_inum(str, base, FALSE);
6260 }
6261
6262
6263 /*
6264  *  call-seq:
6265  *    to_f -> float
6266  *
6267  *  Returns the result of interpreting leading characters in +self+ as a Float:
6268  *
6269  *    '3.14159'.to_f  # => 3.14159
 *    '1.234e-2'.to_f # => 0.01234
6271  *
 *  Characters past a leading valid number are ignored:
6273  *
6274  *    '3.14 (pi to two places)'.to_f # => 3.14
6275  *
6276  *  Returns zero if there is no leading valid number:
6277  *
6278  *    'abcdef'.to_f # => 0.0
6279  *
6280  */
6281
6282 static VALUE
6283 rb_str_to_f(VALUE str)
6284 {
6285     return DBL2NUM(rb_str_to_dbl(str, FALSE));
6286 }
6287
6288
6289 /*
6290  *  call-seq:
6291  *    to_s -> self or string
6292  *
6293  *  Returns +self+ if +self+ is a \String,
6294  *  or +self+ converted to a \String if +self+ is a subclass of \String.
6295  *
6296  *  String#to_str is an alias for String#to_s.
6297  *
6298  */
6299
6300 static VALUE
6301 rb_str_to_s(VALUE str)
6302 {
6303     if (rb_obj_class(str) != rb_cString) {
6304         return str_duplicate(rb_cString, str);
6305     }
6306     return str;
6307 }
6308
#if 0
/* Compiled out.  Encodes codepoint +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6320
6321 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6322
6323 int
6324 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6325 {
6326     char buf[CHAR_ESC_LEN + 1];
6327     int l;
6328
6329 #if SIZEOF_INT > 4
6330     c &= 0xffffffff;
6331 #endif
6332     if (unicode_p) {
6333         if (c < 0x7F && ISPRINT(c)) {
6334             snprintf(buf, CHAR_ESC_LEN, "%c", c);
6335         }
6336         else if (c < 0x10000) {
6337             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6338         }
6339         else {
6340             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6341         }
6342     }
6343     else {
6344         if (c < 0x100) {
6345             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6346         }
6347         else {
6348             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6349         }
6350     }
6351     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
6352     rb_str_buf_cat(result, buf, l);
6353     return l;
6354 }
6355
/*
 * Returns the backslash notation for control character +c+ as a static
 * string, or NULL when +c+ has no dedicated notation.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *notation;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].notation;
    }
    return NULL;
}
6373
6374 VALUE
6375 rb_str_escape(VALUE str)
6376 {
6377     int encidx = ENCODING_GET(str);
6378     rb_encoding *enc = rb_enc_from_index(encidx);
6379     const char *p = RSTRING_PTR(str);
6380     const char *pend = RSTRING_END(str);
6381     const char *prev = p;
6382     char buf[CHAR_ESC_LEN + 1];
6383     VALUE result = rb_str_buf_new(0);
6384     int unicode_p = rb_enc_unicode_p(enc);
6385     int asciicompat = rb_enc_asciicompat(enc);
6386
6387     while (p < pend) {
6388         unsigned int c;
6389         const char *cc;
6390         int n = rb_enc_precise_mbclen(p, pend, enc);
6391         if (!MBCLEN_CHARFOUND_P(n)) {
6392             if (p > prev) str_buf_cat(result, prev, p - prev);
6393             n = rb_enc_mbminlen(enc);
6394             if (pend < p + n)
6395                 n = (int)(pend - p);
6396             while (n--) {
6397                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6398                 str_buf_cat(result, buf, strlen(buf));
6399                 prev = ++p;
6400             }
6401             continue;
6402         }
6403         n = MBCLEN_CHARFOUND_LEN(n);
6404         c = rb_enc_mbc_to_codepoint(p, pend, enc);
6405         p += n;
6406         cc = ruby_escaped_char(c);
6407         if (cc) {
6408             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6409             str_buf_cat(result, cc, strlen(cc));
6410             prev = p;
6411         }
6412         else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6413         }
6414         else {
6415             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6416             rb_str_buf_cat_escaped_char(result, c, unicode_p);
6417             prev = p;
6418         }
6419     }
6420     if (p > prev) str_buf_cat(result, prev, p - prev);
6421     ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6422
6423     return result;
6424 }
6425
6426 /*
6427  *  call-seq:
6428  *    inspect -> string
6429  *
6430  *  Returns a printable version of +self+, enclosed in double-quotes,
6431  *  and with special characters escaped:
6432  *
6433  *    s = "foo\tbar\tbaz\n"
6434  *    # => "foo\tbar\tbaz\n"
6435  *    s.inspect
6436  *    # => "\"foo\\tbar\\tbaz\\n\""
6437  *
6438  */
6439
6440 VALUE
6441 rb_str_inspect(VALUE str)
6442 {
6443     int encidx = ENCODING_GET(str);
6444     rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
6445     const char *p, *pend, *prev;
6446     char buf[CHAR_ESC_LEN + 1];
6447     VALUE result = rb_str_buf_new(0);
6448     rb_encoding *resenc = rb_default_internal_encoding();
6449     int unicode_p = rb_enc_unicode_p(enc);
6450     int asciicompat = rb_enc_asciicompat(enc);
6451
6452     if (resenc == NULL) resenc = rb_default_external_encoding();
6453     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6454     rb_enc_associate(result, resenc);
6455     str_buf_cat2(result, "\"");
6456
6457     p = RSTRING_PTR(str); pend = RSTRING_END(str);
6458     prev = p;
6459     actenc = get_actual_encoding(encidx, str);
6460     if (actenc != enc) {
6461         enc = actenc;
6462         if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
6463     }
6464     while (p < pend) {
6465         unsigned int c, cc;
6466         int n;
6467
6468         n = rb_enc_precise_mbclen(p, pend, enc);
6469         if (!MBCLEN_CHARFOUND_P(n)) {
6470             if (p > prev) str_buf_cat(result, prev, p - prev);
6471             n = rb_enc_mbminlen(enc);
6472             if (pend < p + n)
6473                 n = (int)(pend - p);
6474             while (n--) {
6475                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6476                 str_buf_cat(result, buf, strlen(buf));
6477                 prev = ++p;
6478             }
6479             continue;
6480         }
6481         n = MBCLEN_CHARFOUND_LEN(n);
6482         c = rb_enc_mbc_to_codepoint(p, pend, enc);
6483         p += n;
6484         if ((asciicompat || unicode_p) &&
6485           (c == '"'|| c == '\\' ||
6486             (c == '#' &&
6487              p < pend &&
6488              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6489              (cc = rb_enc_codepoint(p,pend,enc),
6490               (cc == '$' || cc == '@' || cc == '{'))))) {
6491             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6492             str_buf_cat2(result, "\\");
6493             if (asciicompat || enc == resenc) {
6494                 prev = p - n;
6495                 continue;
6496             }
6497         }
6498         switch (c) {
6499           case '\n': cc = 'n'; break;
6500           case '\r': cc = 'r'; break;
6501           case '\t': cc = 't'; break;
6502           case '\f': cc = 'f'; break;
6503           case '\013': cc = 'v'; break;
6504           case '\010': cc = 'b'; break;
6505           case '\007': cc = 'a'; break;
6506           case 033: cc = 'e'; break;
6507           default: cc = 0; break;
6508         }
6509         if (cc) {
6510             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6511             buf[0] = '\\';
6512             buf[1] = (char)cc;
6513             str_buf_cat(result, buf, 2);
6514             prev = p;
6515             continue;
6516         }
6517         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6518             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6519             continue;
6520         }
6521         else {
6522             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6523             rb_str_buf_cat_escaped_char(result, c, unicode_p);
6524             prev = p;
6525             continue;
6526         }
6527     }
6528     if (p > prev) str_buf_cat(result, prev, p - prev);
6529     str_buf_cat2(result, "\"");
6530
6531     return result;
6532 }
6533
6534 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6535
6536 /*
6537  *  call-seq:
6538  *    dump -> string
6539  *
6540  *  Returns a printable version of +self+, enclosed in double-quotes,
6541  *  with special characters escaped, and with non-printing characters
6542  *  replaced by hexadecimal notation:
6543  *
6544  *    "hello \n ''".dump    # => "\"hello \\n ''\""
6545  *    "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6546  *
6547  *  Related: String#undump (inverse of String#dump).
6548  *
6549  */
6550
6551 VALUE
6552 rb_str_dump(VALUE str)
6553 {
6554     int encidx = rb_enc_get_index(str);
6555     rb_encoding *enc = rb_enc_from_index(encidx);
6556     long len;
6557     const char *p, *pend;
6558     char *q, *qend;
6559     VALUE result;
6560     int u8 = (encidx == rb_utf8_encindex());
6561     static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6562
6563     len = 2;                    /* "" */
6564     if (!rb_enc_asciicompat(enc)) {
6565         len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6566         len += strlen(enc->name);
6567     }
6568
6569     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6570     while (p < pend) {
6571         int clen;
6572         unsigned char c = *p++;
6573
6574         switch (c) {
6575           case '"':  case '\\':
6576           case '\n': case '\r':
6577           case '\t': case '\f':
6578           case '\013': case '\010': case '\007': case '\033':
6579             clen = 2;
6580             break;
6581
6582           case '#':
6583             clen = IS_EVSTR(p, pend) ? 2 : 1;
6584             break;
6585
6586           default:
6587             if (ISPRINT(c)) {
6588                 clen = 1;
6589             }
6590             else {
6591                 if (u8 && c > 0x7F) {   /* \u notation */
6592                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
6593                     if (MBCLEN_CHARFOUND_P(n)) {
6594                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6595                         if (cc <= 0xFFFF)
6596                             clen = 6;  /* \uXXXX */
6597                         else if (cc <= 0xFFFFF)
6598                             clen = 9;  /* \u{XXXXX} */
6599                         else
6600                             clen = 10; /* \u{XXXXXX} */
6601                         p += MBCLEN_CHARFOUND_LEN(n)-1;
6602                         break;
6603                     }
6604                 }
6605                 clen = 4;       /* \xNN */
6606             }
6607             break;
6608         }
6609
6610         if (clen > LONG_MAX - len) {
6611             rb_raise(rb_eRuntimeError, "string size too big");
6612         }
6613         len += clen;
6614     }
6615
6616     result = rb_str_new(0, len);
6617     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6618     q = RSTRING_PTR(result); qend = q + len + 1;
6619
6620     *q++ = '"';
6621     while (p < pend) {
6622         unsigned char c = *p++;
6623
6624         if (c == '"' || c == '\\') {
6625             *q++ = '\\';
6626             *q++ = c;
6627         }
6628         else if (c == '#') {
6629             if (IS_EVSTR(p, pend)) *q++ = '\\';
6630             *q++ = '#';
6631         }
6632         else if (c == '\n') {
6633             *q++ = '\\';
6634             *q++ = 'n';
6635         }
6636         else if (c == '\r') {
6637             *q++ = '\\';
6638             *q++ = 'r';
6639         }
6640         else if (c == '\t') {
6641             *q++ = '\\';
6642             *q++ = 't';
6643         }
6644         else if (c == '\f') {
6645             *q++ = '\\';
6646             *q++ = 'f';
6647         }
6648         else if (c == '\013') {
6649             *q++ = '\\';
6650             *q++ = 'v';
6651         }
6652         else if (c == '\010') {
6653             *q++ = '\\';
6654             *q++ = 'b';
6655         }
6656         else if (c == '\007') {
6657             *q++ = '\\';
6658             *q++ = 'a';
6659         }
6660         else if (c == '\033') {
6661             *q++ = '\\';
6662             *q++ = 'e';
6663         }
6664         else if (ISPRINT(c)) {
6665             *q++ = c;
6666         }
6667         else {
6668             *q++ = '\\';
6669             if (u8) {
6670                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6671                 if (MBCLEN_CHARFOUND_P(n)) {
6672                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6673                     p += n;
6674                     if (cc <= 0xFFFF)
6675                         snprintf(q, qend-q, "u%04X", cc);    /* \uXXXX */
6676                     else
6677                         snprintf(q, qend-q, "u{%X}", cc);  /* \u{XXXXX} or \u{XXXXXX} */
6678                     q += strlen(q);
6679                     continue;
6680                 }
6681             }
6682             snprintf(q, qend-q, "x%02X", c);
6683             q += 3;
6684         }
6685     }
6686     *q++ = '"';
6687     *q = '\0';
6688     if (!rb_enc_asciicompat(enc)) {
6689         snprintf(q, qend-q, nonascii_suffix, enc->name);
6690         encidx = rb_ascii8bit_encindex();
6691     }
6692     /* result from dump is ASCII */
6693     rb_enc_associate_index(result, encidx);
6694     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
6695     return result;
6696 }
6697
/*
 * Maps the letter of a one-letter backslash escape to the control character
 * it denotes.  Callers are expected to pass only the letters listed here;
 * any other value reaches UNREACHABLE_RETURN.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'f': return '\f';
      case 'v': return '\13';
      case 'b': return '\010';
      case 'a': return '\007';
      case 'e': return 033;
    }
    UNREACHABLE_RETURN(-1);
}
6721
6722 static void
6723 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6724 {
6725     const char *s = *ss;
6726     unsigned int c;
6727     int codelen;
6728     size_t hexlen;
6729     unsigned char buf[6];
6730     static rb_encoding *enc_utf8 = NULL;
6731
6732     switch (*s) {
6733       case '\\':
6734       case '"':
6735       case '#':
6736         rb_str_cat(undumped, s, 1); /* cat itself */
6737         s++;
6738         break;
6739       case 'n':
6740       case 'r':
6741       case 't':
6742       case 'f':
6743       case 'v':
6744       case 'b':
6745       case 'a':
6746       case 'e':
6747         *buf = unescape_ascii(*s);
6748         rb_str_cat(undumped, (char *)buf, 1);
6749         s++;
6750         break;
6751       case 'u':
6752         if (*binary) {
6753             rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6754         }
6755         *utf8 = true;
6756         if (++s >= s_end) {
6757             rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6758         }
6759         if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6760         if (*penc != enc_utf8) {
6761             *penc = enc_utf8;
6762             rb_enc_associate(undumped, enc_utf8);
6763         }
6764         if (*s == '{') { /* handle \u{...} form */
6765             s++;
6766             for (;;) {
6767                 if (s >= s_end) {
6768                     rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6769                 }
6770                 if (*s == '}') {
6771                     s++;
6772                     break;
6773                 }
6774                 if (ISSPACE(*s)) {
6775                     s++;
6776                     continue;
6777                 }
6778                 c = scan_hex(s, s_end-s, &hexlen);
6779                 if (hexlen == 0 || hexlen > 6) {
6780                     rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6781                 }
6782                 if (c > 0x10ffff) {
6783                     rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6784                 }
6785                 if (0xd800 <= c && c <= 0xdfff) {
6786                     rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6787                 }
6788                 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6789                 rb_str_cat(undumped, (char *)buf, codelen);
6790                 s += hexlen;
6791             }
6792         }
6793         else { /* handle \uXXXX form */
6794             c = scan_hex(s, 4, &hexlen);
6795             if (hexlen != 4) {
6796                 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6797             }
6798             if (0xd800 <= c && c <= 0xdfff) {
6799                 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6800             }
6801             codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6802             rb_str_cat(undumped, (char *)buf, codelen);
6803             s += hexlen;
6804         }
6805         break;
6806       case 'x':
6807         if (*utf8) {
6808             rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6809         }
6810         *binary = true;
6811         if (++s >= s_end) {
6812             rb_raise(rb_eRuntimeError, "invalid hex escape");
6813         }
6814         *buf = scan_hex(s, 2, &hexlen);
6815         if (hexlen != 2) {
6816             rb_raise(rb_eRuntimeError, "invalid hex escape");
6817         }
6818         rb_str_cat(undumped, (char *)buf, 1);
6819         s += hexlen;
6820         break;
6821       default:
6822         rb_str_cat(undumped, s-1, 2);
6823         s++;
6824     }
6825
6826     *ss = s;
6827 }
6828
6829 static VALUE rb_str_is_ascii_only_p(VALUE str);
6830
6831 /*
6832  *  call-seq:
6833  *    undump -> string
6834  *
6835  *  Returns an unescaped version of +self+:
6836  *
6837  *    s_orig = "\f\x00\xff\\\""    # => "\f\u0000\xFF\\\""
6838  *    s_dumped = s_orig.dump       # => "\"\\f\\x00\\xFF\\\\\\\"\""
6839  *    s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6840  *    s_undumped == s_orig         # => true
6841  *
6842  *  Related: String#dump (inverse of String#undump).
6843  *
6844  */
6845
6846 static VALUE
6847 str_undump(VALUE str)
6848 {
6849     const char *s = RSTRING_PTR(str);
6850     const char *s_end = RSTRING_END(str);
6851     rb_encoding *enc = rb_enc_get(str);
6852     VALUE undumped = rb_enc_str_new(s, 0L, enc);
6853     bool utf8 = false;
6854     bool binary = false;
6855     int w;
6856
6857     rb_must_asciicompat(str);
6858     if (rb_str_is_ascii_only_p(str) == Qfalse) {
6859         rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6860     }
6861     if (!str_null_check(str, &w)) {
6862         rb_raise(rb_eRuntimeError, "string contains null byte");
6863     }
6864     if (RSTRING_LEN(str) < 2) goto invalid_format;
6865     if (*s != '"') goto invalid_format;
6866
6867     /* strip '"' at the start */
6868     s++;
6869
6870     for (;;) {
6871         if (s >= s_end) {
6872             rb_raise(rb_eRuntimeError, "unterminated dumped string");
6873         }
6874
6875         if (*s == '"') {
6876             /* epilogue */
6877             s++;
6878             if (s == s_end) {
6879                 /* ascii compatible dumped string */
6880                 break;
6881             }
6882             else {
6883                 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6884                 static const char dup_suffix[] = ".dup";
6885                 const char *encname;
6886                 int encidx;
6887                 ptrdiff_t size;
6888
6889                 /* check separately for strings dumped by older versions */
6890                 size = sizeof(dup_suffix) - 1;
6891                 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6892
6893                 size = sizeof(force_encoding_suffix) - 1;
6894                 if (s_end - s <= size) goto invalid_format;
6895                 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6896                 s += size;
6897
6898                 if (utf8) {
6899                     rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6900                 }
6901
6902                 encname = s;
6903                 s = memchr(s, '"', s_end-s);
6904                 size = s - encname;
6905                 if (!s) goto invalid_format;
6906                 if (s_end - s != 2) goto invalid_format;
6907                 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6908
6909                 encidx = rb_enc_find_index2(encname, (long)size);
6910                 if (encidx < 0) {
6911                     rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6912                 }
6913                 rb_enc_associate_index(undumped, encidx);
6914             }
6915             break;
6916         }
6917
6918         if (*s == '\\') {
6919             s++;
6920             if (s >= s_end) {
6921                 rb_raise(rb_eRuntimeError, "invalid escape");
6922             }
6923             undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6924         }
6925         else {
6926             rb_str_cat(undumped, s++, 1);
6927         }
6928     }
6929
6930     return undumped;
6931 invalid_format:
6932     rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6933 }
6934
6935 static void
6936 rb_str_check_dummy_enc(rb_encoding *enc)
6937 {
6938     if (rb_enc_dummy_p(enc)) {
6939         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6940                  rb_enc_name(enc));
6941     }
6942 }
6943
6944 static rb_encoding *
6945 str_true_enc(VALUE str)
6946 {
6947     rb_encoding *enc = STR_ENC_GET(str);
6948     rb_str_check_dummy_enc(enc);
6949     return enc;
6950 }
6951
6952 static OnigCaseFoldType
6953 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6954 {
6955     if (argc==0)
6956         return flags;
6957     if (argc>2)
6958         rb_raise(rb_eArgError, "too many options");
6959     if (argv[0]==sym_turkic) {
6960         flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6961         if (argc==2) {
6962             if (argv[1]==sym_lithuanian)
6963                 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6964             else
6965                 rb_raise(rb_eArgError, "invalid second option");
6966         }
6967     }
6968     else if (argv[0]==sym_lithuanian) {
6969         flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6970         if (argc==2) {
6971             if (argv[1]==sym_turkic)
6972                 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6973             else
6974                 rb_raise(rb_eArgError, "invalid second option");
6975         }
6976     }
6977     else if (argc>1)
6978         rb_raise(rb_eArgError, "too many options");
6979     else if (argv[0]==sym_ascii)
6980         flags |= ONIGENC_CASE_ASCII_ONLY;
6981     else if (argv[0]==sym_fold) {
6982         if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6983             flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6984         else
6985             rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
6986     }
6987     else
6988         rb_raise(rb_eArgError, "invalid option");
6989     return flags;
6990 }
6991
6992 static inline bool
6993 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
6994 {
6995     if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
6996         return true;
6997     return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
6998 }
6999
/* 20 extra bytes should be long enough to absorb any kind of single character length increase */
7001 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7002 #ifndef CASEMAP_DEBUG
7003 # define CASEMAP_DEBUG 0
7004 #endif
7005
7006 struct mapping_buffer;
7007 typedef struct mapping_buffer {
7008     size_t capa;
7009     size_t used;
7010     struct mapping_buffer *next;
7011     OnigUChar space[FLEX_ARY_LEN];
7012 } mapping_buffer;
7013
7014 static void
7015 mapping_buffer_free(void *p)
7016 {
7017     mapping_buffer *previous_buffer;
7018     mapping_buffer *current_buffer = p;
7019     while (current_buffer) {
7020         previous_buffer = current_buffer;
7021         current_buffer  = current_buffer->next;
7022         ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7023     }
7024 }
7025
7026 static const rb_data_type_t mapping_buffer_type = {
7027     "mapping_buffer",
7028     {0, mapping_buffer_free,}
7029 };
7030
/*
 * Case-maps +source+ through the encoding's own case_map function and
 * returns the result as a new string carrying +source+'s encoding.
 *
 * +flags+ is handed to case_map by address; callers inspect it afterwards
 * (e.g. for ONIGENC_CASE_MODIFIED).
 *
 * Because the result may be longer than the input, output is collected
 * in a chain of mapping_buffer blocks and joined at the end.
 *
 * An empty +source+ yields a duplicate of it.
 * Raises ArgumentError ("input string invalid") when case_map returns a
 * negative value.
 */
static VALUE
rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
{
    VALUE target;

    const OnigUChar *source_current, *source_end;
    int target_length = 0;
    VALUE buffer_anchor;
    mapping_buffer *current_buffer = 0;
    mapping_buffer **pre_buffer;
    size_t buffer_count = 0;
    int buffer_length_or_invalid;

    if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);

    source_current = (OnigUChar*)RSTRING_PTR(source);
    source_end = (OnigUChar*)RSTRING_END(source);

    /* The chain head lives in a typed-data object whose free function is
     * mapping_buffer_free; presumably this lets the GC reclaim the buffers
     * if an exception unwinds out of this function. */
    buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
    /* pre_buffer always points at the slot where the next buffer is linked */
    pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
    /* case_map advances source_current; loop until all input is consumed */
    while (source_current < source_end) {
        /* increase multiplier using buffer count to converge quickly */
        size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
        if (CASEMAP_DEBUG) {
            fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
        }
        current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
        /* link the new buffer in before anything else can fail */
        *pre_buffer = current_buffer;
        pre_buffer = &current_buffer->next;
        current_buffer->next = NULL;
        current_buffer->capa = capa;
        buffer_length_or_invalid = enc->case_map(flags,
                                   &source_current, source_end,
                                   current_buffer->space,
                                   current_buffer->space+current_buffer->capa,
                                   enc);
        if (buffer_length_or_invalid < 0) {
            /* detach the chain from the anchor, free it now, then raise */
            current_buffer = DATA_PTR(buffer_anchor);
            DATA_PTR(buffer_anchor) = 0;
            mapping_buffer_free(current_buffer);
            rb_raise(rb_eArgError, "input string invalid");
        }
        target_length  += current_buffer->used = buffer_length_or_invalid;
    }
    if (CASEMAP_DEBUG) {
        fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
    }

    if (buffer_count==1) {
        /* single buffer: copy straight out of it */
        target = rb_str_new((const char*)current_buffer->space, target_length);
    }
    else {
        char *target_current;

        /* several buffers: allocate the result and concatenate them in order */
        target = rb_str_new(0, target_length);
        target_current = RSTRING_PTR(target);
        current_buffer = DATA_PTR(buffer_anchor);
        while (current_buffer) {
            memcpy(target_current, current_buffer->space, current_buffer->used);
            target_current += current_buffer->used;
            current_buffer  = current_buffer->next;
        }
    }
    /* detach the chain from the anchor first so it is not freed twice */
    current_buffer = DATA_PTR(buffer_anchor);
    DATA_PTR(buffer_anchor) = 0;
    mapping_buffer_free(current_buffer);

    /* TODO: check about string terminator character */
    str_enc_copy(target, source);
    /*ENC_CODERANGE_SET(mapped, cr);*/

    return target;
}
7104
7105 static VALUE
7106 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7107 {
7108     const OnigUChar *source_current, *source_end;
7109     OnigUChar *target_current, *target_end;
7110     long old_length = RSTRING_LEN(source);
7111     int length_or_invalid;
7112
7113     if (old_length == 0) return Qnil;
7114
7115     source_current = (OnigUChar*)RSTRING_PTR(source);
7116     source_end = (OnigUChar*)RSTRING_END(source);
7117     if (source == target) {
7118         target_current = (OnigUChar*)source_current;
7119         target_end = (OnigUChar*)source_end;
7120     }
7121     else {
7122         target_current = (OnigUChar*)RSTRING_PTR(target);
7123         target_end = (OnigUChar*)RSTRING_END(target);
7124     }
7125
7126     length_or_invalid = onigenc_ascii_only_case_map(flags,
7127                                &source_current, source_end,
7128                                target_current, target_end, enc);
7129     if (length_or_invalid < 0)
7130         rb_raise(rb_eArgError, "input string invalid");
7131     if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7132         fprintf(stderr, "problem with rb_str_ascii_casemap"
7133                 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7134         rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7135                  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7136     }
7137
7138     str_enc_copy(target, source);
7139
7140     return target;
7141 }
7142
7143 static bool
7144 upcase_single(VALUE str)
7145 {
7146     char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7147     bool modified = false;
7148
7149     while (s < send) {
7150         unsigned int c = *(unsigned char*)s;
7151
7152         if ('a' <= c && c <= 'z') {
7153             *s = 'A' + (c - 'a');
7154             modified = true;
7155         }
7156         s++;
7157     }
7158     return modified;
7159 }
7160
7161 /*
7162  *  call-seq:
7163  *    upcase!(*options) -> self or nil
7164  *
7165  *  Upcases the characters in +self+;
7166  *  returns +self+ if any changes were made, +nil+ otherwise:
7167  *
7168  *    s = 'Hello World!' # => "Hello World!"
7169  *    s.upcase!          # => "HELLO WORLD!"
7170  *    s                  # => "HELLO WORLD!"
7171  *    s.upcase!          # => nil
7172  *
7173  *  The casing may be affected by the given +options+;
7174  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7175  *
7176  *  Related: String#upcase, String#downcase, String#downcase!.
7177  *
7178  */
7179
7180 static VALUE
7181 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7182 {
7183     rb_encoding *enc;
7184     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7185
7186     flags = check_case_options(argc, argv, flags);
7187     str_modify_keep_cr(str);
7188     enc = str_true_enc(str);
7189     if (case_option_single_p(flags, enc, str)) {
7190         if (upcase_single(str))
7191             flags |= ONIGENC_CASE_MODIFIED;
7192     }
7193     else if (flags&ONIGENC_CASE_ASCII_ONLY)
7194         rb_str_ascii_casemap(str, str, &flags, enc);
7195     else
7196         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7197
7198     if (ONIGENC_CASE_MODIFIED&flags) return str;
7199     return Qnil;
7200 }
7201
7202
7203 /*
7204  *  call-seq:
7205  *    upcase(*options) -> string
7206  *
7207  *  Returns a string containing the upcased characters in +self+:
7208  *
7209  *     s = 'Hello World!' # => "Hello World!"
7210  *     s.upcase           # => "HELLO WORLD!"
7211  *
7212  *  The casing may be affected by the given +options+;
7213  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7214  *
7215  *  Related: String#upcase!, String#downcase, String#downcase!.
7216  *
7217  */
7218
7219 static VALUE
7220 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7221 {
7222     rb_encoding *enc;
7223     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7224     VALUE ret;
7225
7226     flags = check_case_options(argc, argv, flags);
7227     enc = str_true_enc(str);
7228     if (case_option_single_p(flags, enc, str)) {
7229         ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7230         str_enc_copy(ret, str);
7231         upcase_single(ret);
7232     }
7233     else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7234         ret = rb_str_new(0, RSTRING_LEN(str));
7235         rb_str_ascii_casemap(str, ret, &flags, enc);
7236     }
7237     else {
7238         ret = rb_str_casemap(str, &flags, enc);
7239     }
7240
7241     return ret;
7242 }
7243
7244 static bool
7245 downcase_single(VALUE str)
7246 {
7247     char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7248     bool modified = false;
7249
7250     while (s < send) {
7251         unsigned int c = *(unsigned char*)s;
7252
7253         if ('A' <= c && c <= 'Z') {
7254             *s = 'a' + (c - 'A');
7255             modified = true;
7256         }
7257         s++;
7258     }
7259
7260     return modified;
7261 }
7262
7263 /*
7264  *  call-seq:
7265  *    downcase!(*options) -> self or nil
7266  *
7267  *  Downcases the characters in +self+;
7268  *  returns +self+ if any changes were made, +nil+ otherwise:
7269  *
7270  *    s = 'Hello World!' # => "Hello World!"
7271  *    s.downcase!        # => "hello world!"
7272  *    s                  # => "hello world!"
7273  *    s.downcase!        # => nil
7274  *
7275  *  The casing may be affected by the given +options+;
7276  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7277  *
7278  *  Related: String#downcase, String#upcase, String#upcase!.
7279  *
7280  */
7281
7282 static VALUE
7283 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7284 {
7285     rb_encoding *enc;
7286     OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7287
7288     flags = check_case_options(argc, argv, flags);
7289     str_modify_keep_cr(str);
7290     enc = str_true_enc(str);
7291     if (case_option_single_p(flags, enc, str)) {
7292         if (downcase_single(str))
7293             flags |= ONIGENC_CASE_MODIFIED;
7294     }
7295     else if (flags&ONIGENC_CASE_ASCII_ONLY)
7296         rb_str_ascii_casemap(str, str, &flags, enc);
7297     else
7298         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7299
7300     if (ONIGENC_CASE_MODIFIED&flags) return str;
7301     return Qnil;
7302 }
7303
7304
7305 /*
7306  *  call-seq:
7307  *    downcase(*options) -> string
7308  *
7309  *  Returns a string containing the downcased characters in +self+:
7310  *
7311  *     s = 'Hello World!' # => "Hello World!"
7312  *     s.downcase         # => "hello world!"
7313  *
7314  *  The casing may be affected by the given +options+;
7315  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7316  *
7317  *  Related: String#downcase!, String#upcase, String#upcase!.
7318  *
7319  */
7320
7321 static VALUE
7322 rb_str_downcase(int argc, VALUE *argv, VALUE str)
7323 {
7324     rb_encoding *enc;
7325     OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7326     VALUE ret;
7327
7328     flags = check_case_options(argc, argv, flags);
7329     enc = str_true_enc(str);
7330     if (case_option_single_p(flags, enc, str)) {
7331         ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7332         str_enc_copy(ret, str);
7333         downcase_single(ret);
7334     }
7335     else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7336         ret = rb_str_new(0, RSTRING_LEN(str));
7337         rb_str_ascii_casemap(str, ret, &flags, enc);
7338     }
7339     else {
7340         ret = rb_str_casemap(str, &flags, enc);
7341     }
7342
7343     return ret;
7344 }
7345
7346
7347 /*
7348  *  call-seq:
7349  *    capitalize!(*options) -> self or nil
7350  *
7351  *  Upcases the first character in +self+;
7352  *  downcases the remaining characters;
7353  *  returns +self+ if any changes were made, +nil+ otherwise:
7354  *
7355  *    s = 'hello World!' # => "hello World!"
7356  *    s.capitalize!      # => "Hello world!"
7357  *    s                  # => "Hello world!"
7358  *    s.capitalize!      # => nil
7359  *
7360  *  The casing may be affected by the given +options+;
7361  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7362  *
7363  *  Related: String#capitalize.
7364  *
7365  */
7366
7367 static VALUE
7368 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7369 {
7370     rb_encoding *enc;
7371     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7372
7373     flags = check_case_options(argc, argv, flags);
7374     str_modify_keep_cr(str);
7375     enc = str_true_enc(str);
7376     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7377     if (flags&ONIGENC_CASE_ASCII_ONLY)
7378         rb_str_ascii_casemap(str, str, &flags, enc);
7379     else
7380         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7381
7382     if (ONIGENC_CASE_MODIFIED&flags) return str;
7383     return Qnil;
7384 }
7385
7386
7387 /*
7388  *  call-seq:
7389  *    capitalize(*options) -> string
7390  *
7391  *  Returns a string containing the characters in +self+;
7392  *  the first character is upcased;
7393  *  the remaining characters are downcased:
7394  *
7395  *     s = 'hello World!' # => "hello World!"
7396  *     s.capitalize       # => "Hello world!"
7397  *
7398  *  The casing may be affected by the given +options+;
7399  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7400  *
7401  *  Related: String#capitalize!.
7402  *
7403  */
7404
7405 static VALUE
7406 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7407 {
7408     rb_encoding *enc;
7409     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7410     VALUE ret;
7411
7412     flags = check_case_options(argc, argv, flags);
7413     enc = str_true_enc(str);
7414     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7415     if (flags&ONIGENC_CASE_ASCII_ONLY) {
7416         ret = rb_str_new(0, RSTRING_LEN(str));
7417         rb_str_ascii_casemap(str, ret, &flags, enc);
7418     }
7419     else {
7420         ret = rb_str_casemap(str, &flags, enc);
7421     }
7422     return ret;
7423 }
7424
7425
7426 /*
7427  *  call-seq:
7428  *    swapcase!(*options) -> self or nil
7429  *
7430  *  Upcases each lowercase character in +self+;
7431  *  downcases uppercase character;
7432  *  returns +self+ if any changes were made, +nil+ otherwise:
7433  *
7434  *    s = 'Hello World!' # => "Hello World!"
7435  *    s.swapcase!        # => "hELLO wORLD!"
7436  *    s                  # => "Hello World!"
7437  *    ''.swapcase!       # => nil
7438  *
7439  *  The casing may be affected by the given +options+;
7440  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7441  *
7442  *  Related: String#swapcase.
7443  *
7444  */
7445
7446 static VALUE
7447 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7448 {
7449     rb_encoding *enc;
7450     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7451
7452     flags = check_case_options(argc, argv, flags);
7453     str_modify_keep_cr(str);
7454     enc = str_true_enc(str);
7455     if (flags&ONIGENC_CASE_ASCII_ONLY)
7456         rb_str_ascii_casemap(str, str, &flags, enc);
7457     else
7458         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7459
7460     if (ONIGENC_CASE_MODIFIED&flags) return str;
7461     return Qnil;
7462 }
7463
7464
7465 /*
7466  *  call-seq:
7467  *    swapcase(*options) -> string
7468  *
7469  *  Returns a string containing the characters in +self+, with cases reversed;
7470  *  each uppercase character is downcased;
7471  *  each lowercase character is upcased:
7472  *
7473  *     s = 'Hello World!' # => "Hello World!"
7474  *     s.swapcase         # => "hELLO wORLD!"
7475  *
7476  *  The casing may be affected by the given +options+;
7477  *  see {Case Mapping}[doc/case_mapping_rdoc.html].
7478  *
7479  *  Related: String#swapcase!.
7480  *
7481  */
7482
7483 static VALUE
7484 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7485 {
7486     rb_encoding *enc;
7487     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7488     VALUE ret;
7489
7490     flags = check_case_options(argc, argv, flags);
7491     enc = str_true_enc(str);
7492     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7493     if (flags&ONIGENC_CASE_ASCII_ONLY) {
7494         ret = rb_str_new(0, RSTRING_LEN(str));
7495         rb_str_ascii_casemap(str, ret, &flags, enc);
7496     }
7497     else {
7498         ret = rb_str_casemap(str, &flags, enc);
7499     }
7500     return ret;
7501 }
7502
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;                /* nonzero while trnext() is expanding a "c1-c2" range */
    unsigned int now, max;  /* code point last produced; upper end of the current range */
    char *p, *pend;         /* read position in, and end of, the specification */
};
7510
/*
 * Produces the next code point described by the specification in +t+,
 * expanding "c1-c2" ranges one code point per call.
 *
 * Returns (unsigned int)-1 once the specification is exhausted.
 * Raises ArgumentError when a range's start is greater than its end.
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
      nextpart:
        if (!t->gen) {
            /* not inside a range: read the next item from the specification */
            if (t->p == t->pend) return -1;
            /* a backslash that is not the last character escapes what follows */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
                t->p += n;
            }
            t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
            t->p += n;
            /* a '-' followed by at least one more character starts a range */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
                t->p += n;
                if (t->p < t->pend) {
                    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
                    t->p += n;
                    if (t->now > c) {
                        /* only print the offending characters when both are ASCII */
                        if (t->now < 0x80 && c < 0x80) {
                            rb_raise(rb_eArgError,
                                     "invalid range \"%c-%c\" in string transliteration",
                                     t->now, c);
                        }
                        else {
                            rb_raise(rb_eArgError, "invalid range in string transliteration");
                        }
                        continue; /* not reached */
                    }
                    /* remember the upper bound; later calls walk up to it */
                    t->gen = 1;
                    t->max = c;
                }
            }
            return t->now;
        }
        else {
            /* inside a range: step to the next code point the encoding can
             * represent (ONIGENC_CODE_TO_MBCLEN > 0) */
            while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
                if (t->now == t->max) {
                    /* reached the end of the range without a usable code point */
                    t->gen = 0;
                    goto nextpart;
                }
            }
            if (t->now < t->max) {
                return t->now;
            }
            else {
                /* last element of the range; leave range mode */
                t->gen = 0;
                return t->max;
            }
        }
    }
}
7564
7565 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7566
7567 static VALUE
7568 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7569 {
7570     const unsigned int errc = -1;
7571     unsigned int trans[256];
7572     rb_encoding *enc, *e1, *e2;
7573     struct tr trsrc, trrepl;
7574     int cflag = 0;
7575     unsigned int c, c0, last = 0;
7576     int modify = 0, i, l;
7577     unsigned char *s, *send;
7578     VALUE hash = 0;
7579     int singlebyte = single_byte_optimizable(str);
7580     int termlen;
7581     int cr;
7582
7583 #define CHECK_IF_ASCII(c) \
7584     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7585            (cr = ENC_CODERANGE_VALID) : 0)
7586
7587     StringValue(src);
7588     StringValue(repl);
7589     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7590     if (RSTRING_LEN(repl) == 0) {
7591         return rb_str_delete_bang(1, &src, str);
7592     }
7593
7594     cr = ENC_CODERANGE(str);
7595     e1 = rb_enc_check(str, src);
7596     e2 = rb_enc_check(str, repl);
7597     if (e1 == e2) {
7598         enc = e1;
7599     }
7600     else {
7601         enc = rb_enc_check(src, repl);
7602     }
7603     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7604     if (RSTRING_LEN(src) > 1 &&
7605         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7606         trsrc.p + l < trsrc.pend) {
7607         cflag = 1;
7608         trsrc.p += l;
7609     }
7610     trrepl.p = RSTRING_PTR(repl);
7611     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7612     trsrc.gen = trrepl.gen = 0;
7613     trsrc.now = trrepl.now = 0;
7614     trsrc.max = trrepl.max = 0;
7615
7616     if (cflag) {
7617         for (i=0; i<256; i++) {
7618             trans[i] = 1;
7619         }
7620         while ((c = trnext(&trsrc, enc)) != errc) {
7621             if (c < 256) {
7622                 trans[c] = errc;
7623             }
7624             else {
7625                 if (!hash) hash = rb_hash_new();
7626                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7627             }
7628         }
7629         while ((c = trnext(&trrepl, enc)) != errc)
7630             /* retrieve last replacer */;
7631         last = trrepl.now;
7632         for (i=0; i<256; i++) {
7633             if (trans[i] != errc) {
7634                 trans[i] = last;
7635             }
7636         }
7637     }
7638     else {
7639         unsigned int r;
7640
7641         for (i=0; i<256; i++) {
7642             trans[i] = errc;
7643         }
7644         while ((c = trnext(&trsrc, enc)) != errc) {
7645             r = trnext(&trrepl, enc);
7646             if (r == errc) r = trrepl.now;
7647             if (c < 256) {
7648                 trans[c] = r;
7649                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7650             }
7651             else {
7652                 if (!hash) hash = rb_hash_new();
7653                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7654             }
7655         }
7656     }
7657
7658     if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7659         cr = ENC_CODERANGE_7BIT;
7660     str_modify_keep_cr(str);
7661     s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7662     termlen = rb_enc_mbminlen(enc);
7663     if (sflag) {
7664         int clen, tlen;
7665         long offset, max = RSTRING_LEN(str);
7666         unsigned int save = -1;
7667         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7668
7669         while (s < send) {
7670             int may_modify = 0;
7671
7672             c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7673             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7674
7675             s += clen;
7676             if (c < 256) {
7677                 c = trans[c];
7678             }
7679             else if (hash) {
7680                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7681                 if (NIL_P(tmp)) {
7682                     if (cflag) c = last;
7683                     else c = errc;
7684                 }
7685                 else if (cflag) c = errc;
7686                 else c = NUM2INT(tmp);
7687             }
7688             else {
7689                 c = errc;
7690             }
7691             if (c != (unsigned int)-1) {
7692                 if (save == c) {
7693                     CHECK_IF_ASCII(c);
7694                     continue;
7695                 }
7696                 save = c;
7697                 tlen = rb_enc_codelen(c, enc);
7698                 modify = 1;
7699             }
7700             else {
7701                 save = -1;
7702                 c = c0;
7703                 if (enc != e1) may_modify = 1;
7704             }
7705             if ((offset = t - buf) + tlen > max) {
7706                 size_t MAYBE_UNUSED(old) = max + termlen;
7707                 max = offset + tlen + (send - s);
7708                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7709                 t = buf + offset;
7710             }
7711             rb_enc_mbcput(c, t, enc);
7712             if (may_modify && memcmp(s, t, tlen) != 0) {
7713                 modify = 1;
7714             }
7715             CHECK_IF_ASCII(c);
7716             t += tlen;
7717         }
7718         if (!STR_EMBED_P(str)) {
7719             ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7720         }
7721         TERM_FILL((char *)t, termlen);
7722         RSTRING(str)->as.heap.ptr = (char *)buf;
7723         RSTRING(str)->as.heap.len = t - buf;
7724         STR_SET_NOEMBED(str);
7725         RSTRING(str)->as.heap.aux.capa = max;
7726     }
7727     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7728         while (s < send) {
7729             c = (unsigned char)*s;
7730             if (trans[c] != errc) {
7731                 if (!cflag) {
7732                     c = trans[c];
7733                     *s = c;
7734                     modify = 1;
7735                 }
7736                 else {
7737                     *s = last;
7738                     modify = 1;
7739                 }
7740             }
7741             CHECK_IF_ASCII(c);
7742             s++;
7743         }
7744     }
7745     else {
7746         int clen, tlen;
7747         long offset, max = (long)((send - s) * 1.2);
7748         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7749
7750         while (s < send) {
7751             int may_modify = 0;
7752             c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7753             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7754
7755             if (c < 256) {
7756                 c = trans[c];
7757             }
7758             else if (hash) {
7759                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7760                 if (NIL_P(tmp)) {
7761                     if (cflag) c = last;
7762                     else c = errc;
7763                 }
7764                 else if (cflag) c = errc;
7765                 else c = NUM2INT(tmp);
7766             }
7767             else {
7768                 c = cflag ? last : errc;
7769             }
7770             if (c != errc) {
7771                 tlen = rb_enc_codelen(c, enc);
7772                 modify = 1;
7773             }
7774             else {
7775                 c = c0;
7776                 if (enc != e1) may_modify = 1;
7777             }
7778             if ((offset = t - buf) + tlen > max) {
7779                 size_t MAYBE_UNUSED(old) = max + termlen;
7780                 max = offset + tlen + (long)((send - s) * 1.2);
7781                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7782                 t = buf + offset;
7783             }
7784             if (s != t) {
7785                 rb_enc_mbcput(c, t, enc);
7786                 if (may_modify && memcmp(s, t, tlen) != 0) {
7787                     modify = 1;
7788                 }
7789             }
7790             CHECK_IF_ASCII(c);
7791             s += clen;
7792             t += tlen;
7793         }
7794         if (!STR_EMBED_P(str)) {
7795             ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7796         }
7797         TERM_FILL((char *)t, termlen);
7798         RSTRING(str)->as.heap.ptr = (char *)buf;
7799         RSTRING(str)->as.heap.len = t - buf;
7800         STR_SET_NOEMBED(str);
7801         RSTRING(str)->as.heap.aux.capa = max;
7802     }
7803
7804     if (modify) {
7805         if (cr != ENC_CODERANGE_BROKEN)
7806             ENC_CODERANGE_SET(str, cr);
7807         rb_enc_associate(str, enc);
7808         return str;
7809     }
7810     return Qnil;
7811 }
7812
7813
/*
 *  call-seq:
 *     str.tr!(from_str, to_str)   -> str or nil
 *
 *  Translates <i>str</i> in place, using the same rules as
 *  String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
 *  were made.
 */

static VALUE
rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
{
    /* last argument 0: translate without squeezing repeated replacements */
    return tr_trans(str, src, repl, 0);
}
7828
7829
7830 /*
7831  *  call-seq:
7832  *     str.tr(from_str, to_str)   => new_str
7833  *
7834  *  Returns a copy of +str+ with the characters in +from_str+ replaced by the
7835  *  corresponding characters in +to_str+.  If +to_str+ is shorter than
7836  *  +from_str+, it is padded with its last character in order to maintain the
7837  *  correspondence.
7838  *
7839  *     "hello".tr('el', 'ip')      #=> "hippo"
7840  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
7841  *     "hello".tr('aeiou', 'AA*')  #=> "hAll*"
7842  *
7843  *  Both strings may use the <code>c1-c2</code> notation to denote ranges of
7844  *  characters, and +from_str+ may start with a <code>^</code>, which denotes
7845  *  all characters except those listed.
7846  *
7847  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
7848  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
7849  *
7850  *  The backslash character <code>\\</code> can be used to escape
7851  *  <code>^</code> or <code>-</code> and is otherwise ignored unless it
7852  *  appears at the end of a range or the end of the +from_str+ or +to_str+:
7853  *
7854  *     "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7855  *     "hello-world".tr("a\\-eo", "*")   #=> "h*ll**w*rld"
7856  *
7857  *     "hello\r\nworld".tr("\r", "")   #=> "hello\nworld"
7858  *     "hello\r\nworld".tr("\\r", "")  #=> "hello\r\nwold"
7859  *     "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7860  *
7861  *     "X['\\b']".tr("X\\", "")   #=> "['b']"
7862  *     "X['\\b']".tr("X-\\]", "") #=> "'b'"
7863  */
7864
7865 static VALUE
7866 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7867 {
7868     str = str_duplicate(rb_cString, str);
7869     tr_trans(str, src, repl, 0);
7870     return str;
7871 }
7872
/* number of code points covered by the flat lookup table */
#define TR_TABLE_MAX (UCHAR_MAX+1)
/* one extra slot, stable[TR_TABLE_MAX], holds a flag for larger code points */
#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds the character specification +str+ into the selection tables.
 *
 * +stable+ has one entry per code point below TR_TABLE_MAX.  On the
 * +first+ call it is initialized to all ones; each call then ANDs it
 * with the set described by +str+, so several calls yield the
 * intersection of their specifications.  A leading '^' (when +str+ is
 * longer than one byte) negates the specification.
 *
 * Code points >= TR_TABLE_MAX are recorded in hashes: *tablep for listed
 * characters, *ctablep for characters listed by negated specifications.
 * stable[TR_TABLE_MAX] is set to the negation flag on the first call and
 * cleared by any later non-negated specification; tr_find() uses it as
 * the answer for large code points found in neither hash.
 */
static void
tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
               VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
{
    const unsigned int errc = -1;
    char buf[TR_TABLE_MAX];
    struct tr tr;
    unsigned int c;
    VALUE table = 0, ptable = 0;
    int i, l, cflag = 0;

    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
    tr.gen = tr.now = tr.max = 0;

    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
        cflag = 1;
        tr.p += l;
    }
    if (first) {
        for (i=0; i<TR_TABLE_MAX; i++) {
            stable[i] = 1;
        }
        stable[TR_TABLE_MAX] = cflag;
    }
    else if (stable[TR_TABLE_MAX] && !cflag) {
        stable[TR_TABLE_MAX] = 0;
    }
    /* buf holds this specification alone: everything selected when
     * negated, nothing selected otherwise */
    for (i=0; i<TR_TABLE_MAX; i++) {
        buf[i] = cflag;
    }

    while ((c = trnext(&tr, enc)) != errc) {
        if (c < TR_TABLE_MAX) {
            buf[(unsigned char)c] = !cflag;
        }
        else {
            VALUE key = UINT2NUM(c);

            /* create (or reuse) the hash for this call lazily, on the
             * first large code point */
            if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
                if (cflag) {
                    ptable = *ctablep;
                    table = ptable ? ptable : rb_hash_new();
                    *ctablep = table;
                }
                else {
                    table = rb_hash_new();
                    ptable = *tablep;
                    *tablep = table;
                }
            }
            /* with a previous table, keep the key only if it fits that
             * table (present for plain, absent for negated) */
            if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
                rb_hash_aset(table, key, Qtrue);
            }
        }
    }
    /* intersect this specification with what earlier calls selected */
    for (i=0; i<TR_TABLE_MAX; i++) {
        stable[i] = stable[i] && buf[i];
    }
    /* a plain specification without large code points selects none of them */
    if (!table && !cflag) {
        *tablep = 0;
    }
}
7937
7938
7939 static int
7940 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7941 {
7942     if (c < TR_TABLE_MAX) {
7943         return table[c] != 0;
7944     }
7945     else {
7946         VALUE v = UINT2NUM(c);
7947
7948         if (del) {
7949             if (!NIL_P(rb_hash_lookup(del, v)) &&
7950                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7951                 return TRUE;
7952             }
7953         }
7954         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7955             return FALSE;
7956         }
7957         return table[TR_TABLE_MAX] ? TRUE : FALSE;
7958     }
7959 }
7960
/*
 *  call-seq:
 *     str.delete!([other_str]+)   -> str or nil
 *
 *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
 *  <code>nil</code> if <i>str</i> was not modified.
 */

static VALUE
rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
{
    char squeez[TR_TABLE_SIZE];
    rb_encoding *enc = 0;
    char *s, *send, *t;
    VALUE del = 0, nodel = 0;
    int modify = 0;
    int i, ascompat, cr;

    /* note: an empty receiver returns before the argument count is checked */
    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
    rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
    /* combine all specifications; only the first call initializes the table */
    for (i=0; i<argc; i++) {
        VALUE s = argv[i];

        StringValue(s);
        enc = rb_enc_check(str, s);
        tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
    }

    str_modify_keep_cr(str);
    ascompat = rb_enc_asciicompat(enc);
    /* s reads, t writes; kept characters are compacted towards the front */
    s = t = RSTRING_PTR(str);
    send = RSTRING_END(str);
    /* recompute the code range from the characters that are kept */
    cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
    while (s < send) {
        unsigned int c;
        int clen;

        /* fast path for ASCII bytes in ASCII-compatible encodings */
        if (ascompat && (c = *(unsigned char*)s) < 0x80) {
            if (squeez[c]) {
                modify = 1;
            }
            else {
                if (t != s) *t = c;
                t++;
            }
            s++;
        }
        else {
            c = rb_enc_codepoint_len(s, send, &clen, enc);

            if (tr_find(c, squeez, del, nodel)) {
                modify = 1;
            }
            else {
                if (t != s) rb_enc_mbcput(c, t, enc);
                t += clen;
                /* a kept character from this path means the result is not 7bit */
                if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
            }
            s += clen;
        }
    }
    TERM_FILL(t, TERM_LEN(str));
    STR_SET_LEN(str, t - RSTRING_PTR(str));
    ENC_CODERANGE_SET(str, cr);

    if (modify) return str;
    return Qnil;
}
8029
8030
8031 /*
8032  *  call-seq:
8033  *     str.delete([other_str]+)   -> new_str
8034  *
8035  *  Returns a copy of <i>str</i> with all characters in the intersection of its
8036  *  arguments deleted. Uses the same rules for building the set of characters as
8037  *  String#count.
8038  *
8039  *     "hello".delete "l","lo"        #=> "heo"
8040  *     "hello".delete "lo"            #=> "he"
8041  *     "hello".delete "aeiou", "^e"   #=> "hell"
8042  *     "hello".delete "ej-m"          #=> "ho"
8043  */
8044
8045 static VALUE
8046 rb_str_delete(int argc, VALUE *argv, VALUE str)
8047 {
8048     str = str_duplicate(rb_cString, str);
8049     rb_str_delete_bang(argc, argv, str);
8050     return str;
8051 }
8052
8053
8054 /*
8055  *  call-seq:
8056  *     str.squeeze!([other_str]*)   -> str or nil
8057  *
8058  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
8059  *  <code>nil</code> if no changes were made.
8060  */
8061
8062 static VALUE
8063 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8064 {
8065     char squeez[TR_TABLE_SIZE];
8066     rb_encoding *enc = 0;
8067     VALUE del = 0, nodel = 0;
8068     unsigned char *s, *send, *t;
8069     int i, modify = 0;
8070     int ascompat, singlebyte = single_byte_optimizable(str);
8071     unsigned int save;
8072
8073     if (argc == 0) {
8074         enc = STR_ENC_GET(str);
8075     }
8076     else {
8077         for (i=0; i<argc; i++) {
8078             VALUE s = argv[i];
8079
8080             StringValue(s);
8081             enc = rb_enc_check(str, s);
8082             if (singlebyte && !single_byte_optimizable(s))
8083                 singlebyte = 0;
8084             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8085         }
8086     }
8087
8088     str_modify_keep_cr(str);
8089     s = t = (unsigned char *)RSTRING_PTR(str);
8090     if (!s || RSTRING_LEN(str) == 0) return Qnil;
8091     send = (unsigned char *)RSTRING_END(str);
8092     save = -1;
8093     ascompat = rb_enc_asciicompat(enc);
8094
8095     if (singlebyte) {
8096         while (s < send) {
8097             unsigned int c = *s++;
8098             if (c != save || (argc > 0 && !squeez[c])) {
8099                 *t++ = save = c;
8100             }
8101         }
8102     }
8103     else {
8104         while (s < send) {
8105             unsigned int c;
8106             int clen;
8107
8108             if (ascompat && (c = *s) < 0x80) {
8109                 if (c != save || (argc > 0 && !squeez[c])) {
8110                     *t++ = save = c;
8111                 }
8112                 s++;
8113             }
8114             else {
8115                 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8116
8117                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8118                     if (t != s) rb_enc_mbcput(c, t, enc);
8119                     save = c;
8120                     t += clen;
8121                 }
8122                 s += clen;
8123             }
8124         }
8125     }
8126
8127     TERM_FILL((char *)t, TERM_LEN(str));
8128     if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8129         STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8130         modify = 1;
8131     }
8132
8133     if (modify) return str;
8134     return Qnil;
8135 }
8136
8137
8138 /*
8139  *  call-seq:
8140  *     str.squeeze([other_str]*)    -> new_str
8141  *
8142  *  Builds a set of characters from the <i>other_str</i> parameter(s)
8143  *  using the procedure described for String#count. Returns a new
8144  *  string where runs of the same character that occur in this set are
8145  *  replaced by a single character. If no arguments are given, all
8146  *  runs of identical characters are replaced by a single character.
8147  *
8148  *     "yellow moon".squeeze                  #=> "yelow mon"
8149  *     "  now   is  the".squeeze(" ")         #=> " now is the"
8150  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
8151  */
8152
8153 static VALUE
8154 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8155 {
8156     str = str_duplicate(rb_cString, str);
8157     rb_str_squeeze_bang(argc, argv, str);
8158     return str;
8159 }
8160
8161
8162 /*
8163  *  call-seq:
8164  *     str.tr_s!(from_str, to_str)   -> str or nil
8165  *
8166  *  Performs String#tr_s processing on <i>str</i> in place,
8167  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
8168  */
8169
static VALUE
rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
{
    /*
     * Pure delegate: tr_trans() does the work on +str+ itself and its
     * result is returned unchanged.
     * NOTE(review): the trailing 1 presumably selects the "squeeze"
     * flavour of tr_trans() -- confirm against its definition.
     */
    return tr_trans(str, src, repl, 1);
}
8175
8176
8177 /*
8178  *  call-seq:
8179  *     str.tr_s(from_str, to_str)   -> new_str
8180  *
8181  *  Processes a copy of <i>str</i> as described under String#tr, then
8182  *  removes duplicate characters in regions that were affected by the
8183  *  translation.
8184  *
8185  *     "hello".tr_s('l', 'r')     #=> "hero"
8186  *     "hello".tr_s('el', '*')    #=> "h*o"
8187  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
8188  */
8189
8190 static VALUE
8191 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8192 {
8193     str = str_duplicate(rb_cString, str);
8194     tr_trans(str, src, repl, 1);
8195     return str;
8196 }
8197
8198
8199 /*
8200  *  call-seq:
8201  *     str.count([other_str]+)   -> integer
8202  *
8203  *  Each +other_str+ parameter defines a set of characters to count.  The
8204  *  intersection of these sets defines the characters to count in +str+.  Any
8205  *  +other_str+ that starts with a caret <code>^</code> is negated.  The
8206  *  sequence <code>c1-c2</code> means all characters between c1 and c2.  The
8207  *  backslash character <code>\\</code> can be used to escape <code>^</code> or
8208  *  <code>-</code> and is otherwise ignored unless it appears at the end of a
8209  *  sequence or the end of a +other_str+.
8210  *
8211  *     a = "hello world"
8212  *     a.count "lo"                   #=> 5
8213  *     a.count "lo", "o"              #=> 2
8214  *     a.count "hello", "^l"          #=> 4
8215  *     a.count "ej-m"                 #=> 4
8216  *
8217  *     "hello^world".count "\\^aeiou" #=> 4
8218  *     "hello-world".count "a\\-eo"   #=> 4
8219  *
8220  *     c = "hello world\\r\\n"
8221  *     c.count "\\"                   #=> 2
8222  *     c.count "\\A"                  #=> 0
8223  *     c.count "X-\\w"                #=> 3
8224  */
8225
8226 static VALUE
8227 rb_str_count(int argc, VALUE *argv, VALUE str)
8228 {
8229     char table[TR_TABLE_SIZE];
8230     rb_encoding *enc = 0;
8231     VALUE del = 0, nodel = 0, tstr;
8232     char *s, *send;
8233     int i;
8234     int ascompat;
8235     size_t n = 0;
8236
8237     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8238
8239     tstr = argv[0];
8240     StringValue(tstr);
8241     enc = rb_enc_check(str, tstr);
8242     if (argc == 1) {
8243         const char *ptstr;
8244         if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8245             (ptstr = RSTRING_PTR(tstr),
8246              ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8247             !is_broken_string(str)) {
8248             int clen;
8249             unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8250
8251             s = RSTRING_PTR(str);
8252             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8253             send = RSTRING_END(str);
8254             while (s < send) {
8255                 if (*(unsigned char*)s++ == c) n++;
8256             }
8257             return SIZET2NUM(n);
8258         }
8259     }
8260
8261     tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8262     for (i=1; i<argc; i++) {
8263         tstr = argv[i];
8264         StringValue(tstr);
8265         enc = rb_enc_check(str, tstr);
8266         tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8267     }
8268
8269     s = RSTRING_PTR(str);
8270     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8271     send = RSTRING_END(str);
8272     ascompat = rb_enc_asciicompat(enc);
8273     while (s < send) {
8274         unsigned int c;
8275
8276         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8277             if (table[c]) {
8278                 n++;
8279             }
8280             s++;
8281         }
8282         else {
8283             int clen;
8284             c = rb_enc_codepoint_len(s, send, &clen, enc);
8285             if (tr_find(c, table, del, nodel)) {
8286                 n++;
8287             }
8288             s += clen;
8289         }
8290     }
8291
8292     return SIZET2NUM(n);
8293 }
8294
8295 static VALUE
8296 rb_fs_check(VALUE val)
8297 {
8298     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8299         val = rb_check_string_type(val);
8300         if (NIL_P(val)) return 0;
8301     }
8302     return val;
8303 }
8304
/*
 * Byte-indexed lookup table: 1 for the six ASCII whitespace bytes
 * (HT, LF, VT, FF, CR and space), 0 for every other value, including
 * all bytes >= 0x80.  Entries not listed are zero-initialised.
 */
static const char isspacetable[256] = {
    [0x09] = 1,     /* horizontal tab  */
    [0x0a] = 1,     /* line feed       */
    [0x0b] = 1,     /* vertical tab    */
    [0x0c] = 1,     /* form feed       */
    [0x0d] = 1,     /* carriage return */
    [0x20] = 1,     /* space           */
};

/* The cast keeps the index inside the table whatever the type of c. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8325
8326 static long
8327 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8328 {
8329     if (empty_count >= 0 && len == 0) {
8330         return empty_count + 1;
8331     }
8332     if (empty_count > 0) {
8333         /* make different substrings */
8334         if (result) {
8335             do {
8336                 rb_ary_push(result, str_new_empty_String(str));
8337             } while (--empty_count > 0);
8338         }
8339         else {
8340             do {
8341                 rb_yield(str_new_empty_String(str));
8342             } while (--empty_count > 0);
8343         }
8344     }
8345     str = rb_str_subseq(str, beg, len);
8346     if (result) {
8347         rb_ary_push(result, str);
8348     }
8349     else {
8350         rb_yield(str);
8351     }
8352     return empty_count;
8353 }
8354
/*
 * Strategy chosen by rb_str_split_m() for a given separator:
 *   SPLIT_TYPE_AWK    - split on runs of whitespace
 *   SPLIT_TYPE_STRING - split on a literal byte sequence (rb_memsearch)
 *   SPLIT_TYPE_REGEXP - split where a Regexp matches (rb_reg_search)
 *   SPLIT_TYPE_CHARS  - split into individual characters
 */
typedef enum {
    SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
} split_type_t;
8358
8359 static split_type_t
8360 literal_split_pattern(VALUE spat, split_type_t default_type)
8361 {
8362     rb_encoding *enc = STR_ENC_GET(spat);
8363     const char *ptr;
8364     long len;
8365     RSTRING_GETMEM(spat, ptr, len);
8366     if (len == 0) {
8367         /* Special case - split into chars */
8368         return SPLIT_TYPE_CHARS;
8369     }
8370     else if (rb_enc_asciicompat(enc)) {
8371         if (len == 1 && ptr[0] == ' ') {
8372             return SPLIT_TYPE_AWK;
8373         }
8374     }
8375     else {
8376         int l;
8377         if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8378             return SPLIT_TYPE_AWK;
8379         }
8380     }
8381     return default_type;
8382 }
8383
8384 /*
8385  *  call-seq:
8386  *     str.split(pattern=nil, [limit])                -> an_array
8387  *     str.split(pattern=nil, [limit]) {|sub| block } -> str
8388  *
8389  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
8390  *  of these substrings.
8391  *
8392  *  If <i>pattern</i> is a String, then its contents are used as
8393  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8394  *  space, <i>str</i> is split on whitespace, with leading and trailing
8395  *  whitespace and runs of contiguous whitespace characters ignored.
8396  *
8397  *  If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8398  *  pattern matches. Whenever the pattern matches a zero-length string,
8399  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
8400  *  groups, the respective matches will be returned in the array as well.
8401  *
8402  *  If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8403  *  If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8404  *  split on whitespace as if ' ' were specified.
8405  *
8406  *  If the <i>limit</i> parameter is omitted, trailing null fields are
8407  *  suppressed. If <i>limit</i> is a positive number, at most that number
8408  *  of split substrings will be returned (captured groups will be returned
8409  *  as well, but are not counted towards the limit).
8410  *  If <i>limit</i> is <code>1</code>, the entire
8411  *  string is returned as the only entry in an array. If negative, there is no
8412  *  limit to the number of fields returned, and trailing null fields are not
8413  *  suppressed.
8414  *
8415  *  When the input +str+ is empty an empty Array is returned as the string is
8416  *  considered to have no fields to split.
8417  *
8418  *     " now's  the time ".split       #=> ["now's", "the", "time"]
8419  *     " now's  the time ".split(' ')  #=> ["now's", "the", "time"]
8420  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
8421  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8422  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
8423  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
8424  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
8425  *
8426  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
8427  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
8428  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
8429  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
8430  *
8431  *     "1:2:3".split(/(:)()()/, 2)     #=> ["1", ":", "", "", "2:3"]
8432  *
8433  *     "".split(',', -1)               #=> []
8434  *
8435  *  If a block is given, invoke the block with each split substring.
8436  *
8437  */
8438
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    /*
     * Implementation notes
     *
     * result      - tested as a C boolean throughout: false when a block
     *               was given (fields are yielded, +str+ is returned),
     *               true when an array is being built.
     * limit/lim   - the limit argument as VALUE and as int; +limit+ is
     *               reset to nil when lim <= 0 so that "!NIL_P(limit)"
     *               means "a positive limit is in force".
     * i           - number of fields accounted for so far when a limit
     *               is in force.
     * empty_count - passed through split_string(): -1 keeps empty fields,
     *               a value >= 0 defers them so trailing ones vanish.
     * beg         - byte offset where the not-yet-emitted remainder starts.
     */
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    split_type_t split_type;
    long beg, end, i = 0, empty_count = -1;
    int lim = 0;
    VALUE result, tmp;

    result = rb_block_given_p() ? Qfalse : Qnil;
    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            /* limit 1: the whole string is the only field (none if empty) */
            if (RSTRING_LEN(str) == 0)
                return result ? rb_ary_new2(0) : str;
            tmp = str_duplicate(rb_cString, str);
            if (!result) {
                rb_yield(tmp);
                return str;
            }
            return rb_ary_new3(1, tmp);
        }
        i = 1;
    }
    /* no limit argument (or limit 0): trailing empty fields are suppressed */
    if (NIL_P(limit) && !lim) empty_count = 0;

    enc = STR_ENC_GET(str);
    split_type = SPLIT_TYPE_REGEXP;
    if (!NIL_P(spat)) {
        /* NOTE(review): get_pat_quoted() is defined elsewhere; the switch
         * below expects it to hand back a String or a Regexp. */
        spat = get_pat_quoted(spat, 0);
    }
    else if (NIL_P(spat = rb_fs)) {
        /* no pattern and $; unset: whitespace splitting */
        split_type = SPLIT_TYPE_AWK;
    }
    else if (!(spat = rb_fs_check(spat))) {
        rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
    }
    else {
        rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
    }
    if (split_type != SPLIT_TYPE_AWK) {
        switch (BUILTIN_TYPE(spat)) {
          case T_REGEXP:
            rb_reg_options(spat); /* check if uninitialized */
            tmp = RREGEXP_SRC(spat);
            split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
            if (split_type == SPLIT_TYPE_AWK) {
                /* a Regexp whose source is a single space splits on that
                 * literal space, not awk-style on whitespace runs */
                spat = tmp;
                split_type = SPLIT_TYPE_STRING;
            }
            break;

          case T_STRING:
            mustnot_broken(spat);
            split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
            break;

          default:
            UNREACHABLE_RETURN(Qnil);
        }
    }

    /* emit str[beg, len] and keep the deferred-empty counter up to date */
#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))

    if (result) result = rb_ary_new();
    beg = 0;
    char *ptr = RSTRING_PTR(str);
    char *eptr = RSTRING_END(str);
    if (split_type == SPLIT_TYPE_AWK) {
        /*
         * Whitespace splitting.  +skip+ is set while scanning the
         * whitespace that precedes a field; +beg+ is the field start and
         * +end+ one past its last non-space character seen so far.
         */
        char *bptr = ptr;
        int skip = 1;
        unsigned int c;

        end = beg;
        if (is_ascii_string(str)) {
            /* byte-wise scan using the local whitespace table */
            while (ptr < eptr) {
                c = (unsigned char)*ptr++;
                if (skip) {
                    if (ascii_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        /* limit reached: the rest is emitted as one field below */
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (ascii_isspace(c)) {
                    SPLIT_STR(beg, end-beg);
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
        else {
            /* same state machine, advancing one encoded character at a time */
            while (ptr < eptr) {
                int n;

                c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
                ptr += n;
                if (skip) {
                    if (rb_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (rb_isspace(c)) {
                    SPLIT_STR(beg, end-beg);
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else if (split_type == SPLIT_TYPE_STRING) {
        /* literal separator: +end+ is the match offset relative to +ptr+ */
        char *str_start = ptr;
        char *substr_start = ptr;
        char *sptr = RSTRING_PTR(spat);
        long slen = RSTRING_LEN(spat);

        mustnot_broken(str);
        enc = rb_enc_check(str, spat);
        while (ptr < eptr &&
               (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
            /* Check we are at the start of a char */
            char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
            if (t != ptr + end) {
                /* hit is not on a character head: resume the search from t */
                ptr = t;
                continue;
            }
            SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
            ptr += end + slen;
            substr_start = ptr;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - str_start;
    }
    else if (split_type == SPLIT_TYPE_CHARS) {
        /* empty separator: every character becomes its own field */
        char *str_start = ptr;
        int n;

        mustnot_broken(str);
        enc = rb_enc_get(str);
        while (ptr < eptr &&
               (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
            SPLIT_STR(ptr - str_start, n);
            ptr += n;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - str_start;
    }
    else {
        /*
         * Regexp separator.  +start+ is where the next search begins and
         * +last_null+ records that the previous match at this position
         * was zero-width and has been stepped over by one character.
         */
        long len = RSTRING_LEN(str);
        long start = beg;
        long idx;
        int last_null = 0;
        struct re_registers *regs;
        VALUE match = 0;

        /* the increment clause releases the busy mark set below and
         * re-installs the match as the backref after each iteration */
        for (; rb_reg_search(spat, str, start, 0) >= 0;
             (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
            match = rb_backref_get();
            /* block form: mark the match busy while fields are yielded */
            if (!result) rb_match_busy(match);
            regs = RMATCH_REGS(match);
            end = BEG(0);
            if (start == end && BEG(0) == END(0)) {
                /* zero-width match exactly at the search position */
                if (!ptr) {
                    SPLIT_STR(0, 0);
                    break;
                }
                else if (last_null == 1) {
                    /* second empty match in a row: emit one character */
                    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
                    beg = start;
                }
                else {
                    /* first empty match: retry one character further on */
                    if (start == len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                SPLIT_STR(beg, end-beg);
                beg = start = END(0);
            }
            last_null = 0;

            /* captured groups that took part in the match are emitted too */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        if (match) rb_match_unbusy(match);
    }
    /* emit the remainder: always under a positive or negative limit,
     * otherwise only when something is actually left */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        SPLIT_STR(beg, RSTRING_LEN(str)-beg);
    }

    return result ? result : str;
}
8657
8658 VALUE
8659 rb_str_split(VALUE str, const char *sep0)
8660 {
8661     VALUE sep;
8662
8663     StringValue(str);
8664     sep = rb_str_new_cstr(sep0);
8665     return rb_str_split_m(1, &sep, str);
8666 }
8667
/*
 * Evaluates to a new array with capacity +size+ when no block is given,
 * and to 0 when one is.  The +m+ argument is not used by the expansion.
 */
#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8669
8670 static inline int
8671 enumerator_element(VALUE ary, VALUE e)
8672 {
8673     if (ary) {
8674         rb_ary_push(ary, e);
8675         return 0;
8676     }
8677     else {
8678         rb_yield(e);
8679         return 1;
8680     }
8681 }
8682
8683 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8684
8685 static const char *
8686 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8687 {
8688     const char *prev = rb_enc_prev_char(p, e, e, enc);
8689     if (rb_enc_is_newline(prev, e, enc)) {
8690         e = prev;
8691         prev = rb_enc_prev_char(p, e, e, enc);
8692         if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8693             e = prev;
8694     }
8695     return e;
8696 }
8697
8698 static VALUE
8699 get_rs(void)
8700 {
8701     VALUE rs = rb_rs;
8702     if (!NIL_P(rs) &&
8703         (!RB_TYPE_P(rs, T_STRING) ||
8704          RSTRING_LEN(rs) != 1 ||
8705          RSTRING_PTR(rs)[0] != '\n')) {
8706         rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8707     }
8708     return rs;
8709 }
8710
8711 #define rb_rs get_rs()
8712
/*
 * Shared engine behind each_line and lines: cuts +str+ into records at
 * the separator and hands each record to ENUM_ELEM(), which pushes onto
 * +ary+ when it is set and yields otherwise.
 *
 * argc/argv carry an optional separator plus an optional +chomp:+
 * keyword.  Returns +ary+ when collecting, the original receiver when
 * yielding.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    /* no positional separator: use the global one (via the rb_rs macro) */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* from here on chomp is a plain C truth value */
        chomp = (chomp != Qundef && RTEST(chomp));
    }

    if (NIL_P(rs)) {
        /* nil separator: the whole string is the one and only record */
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* iterate over a frozen version; +orig+ keeps the receiver for the
     * return value */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        /*
         * subptr - start of the current paragraph, NULL while skipping
         *          the newlines between paragraphs
         * subend - scan position / end of the current paragraph
         * eol    - position just after the most recent newline inside a
         *          paragraph; meeting another newline exactly there
         *          means the paragraph is over
         * rslen  - scratch: length of the character at subend (with a
         *          leading CR folded in); it is zeroed after each step,
         *          so it is only non-zero after the `break` below
         */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            do {
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) eol = subend;
                }
                else {
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? 0 : rslen));
            if (ENUM_ELEM(ary, line)) {
                /* only after a yield; NOTE(review): str_mod_check() is
                 * defined elsewhere -- presumably it raises if the
                 * buffer changed, confirm */
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* separator is exactly one minimum-width newline character:
         * chomp then goes through chomp_newline() */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* default separator with a non-ASCII-compatible receiver: search for
     * the separator re-encoded into the receiver's encoding */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* ignore a byte-level hit that does not start on a character head */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator: the next record start */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* trailing record that is not followed by a separator found above */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
8847
8848 /*
8849  *  call-seq:
8850  *     str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8851  *     str.each_line(separator=$/, chomp: false)                   -> an_enumerator
8852  *
8853  *  Splits <i>str</i> using the supplied parameter as the record
8854  *  separator (<code>$/</code> by default), passing each substring in
8855  *  turn to the supplied block.  If a zero-length record separator is
8856  *  supplied, the string is split into paragraphs delimited by
8857  *  multiple successive newlines.
8858  *
8859  *  If +chomp+ is +true+, +separator+ will be removed from the end of each
8860  *  line.
8861  *
8862  *  If no block is given, an enumerator is returned instead.
8863  *
8864  *     "hello\nworld".each_line {|s| p s}
8865  *     # prints:
8866  *     #   "hello\n"
8867  *     #   "world"
8868  *
8869  *     "hello\nworld".each_line('l') {|s| p s}
8870  *     # prints:
8871  *     #   "hel"
8872  *     #   "l"
8873  *     #   "o\nworl"
8874  *     #   "d"
8875  *
8876  *     "hello\n\n\nworld".each_line('') {|s| p s}
8877  *     # prints
8878  *     #   "hello\n\n"
8879  *     #   "world"
8880  *
8881  *     "hello\nworld".each_line(chomp: true) {|s| p s}
8882  *     # prints:
8883  *     #   "hello"
8884  *     #   "world"
8885  *
8886  *     "hello\nworld".each_line('l', chomp: true) {|s| p s}
8887  *     # prints:
8888  *     #   "he"
8889  *     #   ""
8890  *     #   "o\nwor"
8891  *     #   "d"
8892  *
8893  */
8894
static VALUE
rb_str_each_line(int argc, VALUE *argv, VALUE str)
{
    /* This macro can return from the function by itself (the no-block
     * case); the trailing 0 means no size function is supplied. */
    RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
    /* ary == 0: records are yielded rather than collected */
    return rb_str_enumerate_lines(argc, argv, str, 0);
}
8901
8902 /*
8903  *  call-seq:
8904  *     str.lines(separator=$/, chomp: false)  -> an_array
8905  *
8906  *  Returns an array of lines in <i>str</i> split using the supplied
8907  *  record separator (<code>$/</code> by default).  This is a
8908  *  shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8909  *
8910  *  If +chomp+ is +true+, +separator+ will be removed from the end of each
8911  *  line.
8912  *
8913  *     "hello\nworld\n".lines              #=> ["hello\n", "world\n"]
8914  *     "hello  world".lines(' ')           #=> ["hello ", " ", "world"]
8915  *     "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8916  *
8917  *  If a block is given, which is a deprecated form, works the same as
8918  *  <code>each_line</code>.
8919  */
8920
8921 static VALUE
8922 rb_str_lines(int argc, VALUE *argv, VALUE str)
8923 {
8924     VALUE ary = WANTARRAY("lines", 0);
8925     return rb_str_enumerate_lines(argc, argv, str, ary);
8926 }
8927
8928 static VALUE
8929 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8930 {
8931     return LONG2FIX(RSTRING_LEN(str));
8932 }
8933
8934 static VALUE
8935 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8936 {
8937     long i;
8938
8939     for (i=0; i<RSTRING_LEN(str); i++) {
8940         ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
8941     }
8942     if (ary)
8943         return ary;
8944     else
8945         return str;
8946 }
8947
8948 /*
8949  *  call-seq:
8950  *     str.each_byte {|integer| block }    -> str
8951  *     str.each_byte                      -> an_enumerator
8952  *
8953  *  Passes each byte in <i>str</i> to the given block, or returns an
8954  *  enumerator if no block is given.
8955  *
8956  *     "hello".each_byte {|c| print c, ' ' }
8957  *
8958  *  <em>produces:</em>
8959  *
8960  *     104 101 108 108 111
8961  */
8962
8963 static VALUE
8964 rb_str_each_byte(VALUE str)
8965 {
8966     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8967     return rb_str_enumerate_bytes(str, 0);
8968 }
8969
8970 /*
8971  *  call-seq:
8972  *     str.bytes    -> an_array
8973  *
8974  *  Returns an array of bytes in <i>str</i>.  This is a shorthand for
8975  *  <code>str.each_byte.to_a</code>.
8976  *
8977  *  If a block is given, which is a deprecated form, works the same as
8978  *  <code>each_byte</code>.
8979  */
8980
8981 static VALUE
8982 rb_str_bytes(VALUE str)
8983 {
8984     VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
8985     return rb_str_enumerate_bytes(str, ary);
8986 }
8987
8988 static VALUE
8989 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
8990 {
8991     return rb_str_length(str);
8992 }
8993
8994 static VALUE
8995 rb_str_enumerate_chars(VALUE str, VALUE ary)
8996 {
8997     VALUE orig = str;
8998     long i, len, n;
8999     const char *ptr;
9000     rb_encoding *enc;
9001
9002     str = rb_str_new_frozen(str);
9003     ptr = RSTRING_PTR(str);
9004     len = RSTRING_LEN(str);
9005     enc = rb_enc_get(str);
9006
9007     if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
9008         for (i = 0; i < len; i += n) {
9009             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9010             ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9011         }
9012     }
9013     else {
9014         for (i = 0; i < len; i += n) {
9015             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9016             ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9017         }
9018     }
9019     RB_GC_GUARD(str);
9020     if (ary)
9021         return ary;
9022     else
9023         return orig;
9024 }
9025
9026 /*
9027  *  call-seq:
9028  *     str.each_char {|cstr| block }    -> str
9029  *     str.each_char                    -> an_enumerator
9030  *
9031  *  Passes each character in <i>str</i> to the given block, or returns
9032  *  an enumerator if no block is given.
9033  *
9034  *     "hello".each_char {|c| print c, ' ' }
9035  *
9036  *  <em>produces:</em>
9037  *
9038  *     h e l l o
9039  */
9040
9041 static VALUE
9042 rb_str_each_char(VALUE str)
9043 {
9044     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9045     return rb_str_enumerate_chars(str, 0);
9046 }
9047
9048 /*
9049  *  call-seq:
9050  *     str.chars    -> an_array
9051  *
9052  *  Returns an array of characters in <i>str</i>.  This is a shorthand
9053  *  for <code>str.each_char.to_a</code>.
9054  *
9055  *  If a block is given, which is a deprecated form, works the same as
9056  *  <code>each_char</code>.
9057  */
9058
9059 static VALUE
9060 rb_str_chars(VALUE str)
9061 {
9062     VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9063     return rb_str_enumerate_chars(str, ary);
9064 }
9065
9066 static VALUE
9067 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9068 {
9069     VALUE orig = str;
9070     int n;
9071     unsigned int c;
9072     const char *ptr, *end;
9073     rb_encoding *enc;
9074
9075     if (single_byte_optimizable(str))
9076         return rb_str_enumerate_bytes(str, ary);
9077
9078     str = rb_str_new_frozen(str);
9079     ptr = RSTRING_PTR(str);
9080     end = RSTRING_END(str);
9081     enc = STR_ENC_GET(str);
9082
9083     while (ptr < end) {
9084         c = rb_enc_codepoint_len(ptr, end, &n, enc);
9085         ENUM_ELEM(ary, UINT2NUM(c));
9086         ptr += n;
9087     }
9088     RB_GC_GUARD(str);
9089     if (ary)
9090         return ary;
9091     else
9092         return orig;
9093 }
9094
9095 /*
9096  *  call-seq:
9097  *     str.each_codepoint {|integer| block }    -> str
9098  *     str.each_codepoint                       -> an_enumerator
9099  *
9100  *  Passes the Integer ordinal of each character in <i>str</i>,
9101  *  also known as a <i>codepoint</i> when applied to Unicode strings to the
9102  *  given block.  For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9103  *  values are directly derived from the binary representation
9104  *  of each character.
9105  *
9106  *  If no block is given, an enumerator is returned instead.
9107  *
9108  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
9109  *
9110  *  <em>produces:</em>
9111  *
9112  *     104 101 108 108 111 1593
9113  */
9114
9115 static VALUE
9116 rb_str_each_codepoint(VALUE str)
9117 {
9118     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9119     return rb_str_enumerate_codepoints(str, 0);
9120 }
9121
9122 /*
9123  *  call-seq:
9124  *     str.codepoints   -> an_array
9125  *
9126  *  Returns an array of the Integer ordinals of the
9127  *  characters in <i>str</i>.  This is a shorthand for
9128  *  <code>str.each_codepoint.to_a</code>.
9129  *
9130  *  If a block is given, which is a deprecated form, works the same as
9131  *  <code>each_codepoint</code>.
9132  */
9133
9134 static VALUE
9135 rb_str_codepoints(VALUE str)
9136 {
9137     VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9138     return rb_str_enumerate_codepoints(str, ary);
9139 }
9140
9141 static regex_t *
9142 get_reg_grapheme_cluster(rb_encoding *enc)
9143 {
9144     int encidx = rb_enc_to_index(enc);
9145     regex_t *reg_grapheme_cluster = NULL;
9146     static regex_t *reg_grapheme_cluster_utf8 = NULL;
9147
9148     /* synchronize */
9149     if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9150         reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9151     }
9152     if (!reg_grapheme_cluster) {
9153         const OnigUChar source_ascii[] = "\\X";
9154         OnigErrorInfo einfo;
9155         const OnigUChar *source = source_ascii;
9156         size_t source_len = sizeof(source_ascii) - 1;
9157         switch (encidx) {
9158 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9159 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9160 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9161 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9162 #define CASE_UTF(e) \
9163           case ENCINDEX_UTF_##e: { \
9164             static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9165             source = source_UTF_##e; \
9166             source_len = sizeof(source_UTF_##e); \
9167             break; \
9168           }
9169             CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9170 #undef CASE_UTF
9171 #undef CHARS_16BE
9172 #undef CHARS_16LE
9173 #undef CHARS_32BE
9174 #undef CHARS_32LE
9175         }
9176         int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9177                          ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9178         if (r) {
9179             UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9180             onig_error_code_to_str(message, r, &einfo);
9181             rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9182         }
9183         if (encidx == rb_utf8_encindex()) {
9184             reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9185         }
9186     }
9187     return reg_grapheme_cluster;
9188 }
9189
9190 static VALUE
9191 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9192 {
9193     size_t grapheme_cluster_count = 0;
9194     regex_t *reg_grapheme_cluster = NULL;
9195     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9196     const char *ptr, *end;
9197
9198     if (!rb_enc_unicode_p(enc)) {
9199         return rb_str_length(str);
9200     }
9201
9202     reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9203     ptr = RSTRING_PTR(str);
9204     end = RSTRING_END(str);
9205
9206     while (ptr < end) {
9207         OnigPosition len = onig_match(reg_grapheme_cluster,
9208                                       (const OnigUChar *)ptr, (const OnigUChar *)end,
9209                                       (const OnigUChar *)ptr, NULL, 0);
9210         if (len <= 0) break;
9211         grapheme_cluster_count++;
9212         ptr += len;
9213     }
9214
9215     return SIZET2NUM(grapheme_cluster_count);
9216 }
9217
9218 static VALUE
9219 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9220 {
9221     VALUE orig = str;
9222     regex_t *reg_grapheme_cluster = NULL;
9223     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9224     const char *ptr0, *ptr, *end;
9225
9226     if (!rb_enc_unicode_p(enc)) {
9227         return rb_str_enumerate_chars(str, ary);
9228     }
9229
9230     if (!ary) str = rb_str_new_frozen(str);
9231     reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9232     ptr0 = ptr = RSTRING_PTR(str);
9233     end = RSTRING_END(str);
9234
9235     while (ptr < end) {
9236         OnigPosition len = onig_match(reg_grapheme_cluster,
9237                                       (const OnigUChar *)ptr, (const OnigUChar *)end,
9238                                       (const OnigUChar *)ptr, NULL, 0);
9239         if (len <= 0) break;
9240         ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9241         ptr += len;
9242     }
9243     RB_GC_GUARD(str);
9244     if (ary)
9245         return ary;
9246     else
9247         return orig;
9248 }
9249
9250 /*
9251  *  call-seq:
9252  *     str.each_grapheme_cluster {|cstr| block }    -> str
9253  *     str.each_grapheme_cluster                    -> an_enumerator
9254  *
9255  *  Passes each grapheme cluster in <i>str</i> to the given block, or returns
9256  *  an enumerator if no block is given.
9257  *  Unlike String#each_char, this enumerates by grapheme clusters defined by
9258  *  Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9259  *
9260  *     "a\u0300".each_char.to_a.size #=> 2
9261  *     "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9262  *
9263  */
9264
9265 static VALUE
9266 rb_str_each_grapheme_cluster(VALUE str)
9267 {
9268     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9269     return rb_str_enumerate_grapheme_clusters(str, 0);
9270 }
9271
9272 /*
9273  *  call-seq:
9274  *     str.grapheme_clusters   -> an_array
9275  *
9276  *  Returns an array of grapheme clusters in <i>str</i>.  This is a shorthand
9277  *  for <code>str.each_grapheme_cluster.to_a</code>.
9278  *
9279  *  If a block is given, which is a deprecated form, works the same as
9280  *  <code>each_grapheme_cluster</code>.
9281  */
9282
9283 static VALUE
9284 rb_str_grapheme_clusters(VALUE str)
9285 {
9286     VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9287     return rb_str_enumerate_grapheme_clusters(str, ary);
9288 }
9289
9290 static long
9291 chopped_length(VALUE str)
9292 {
9293     rb_encoding *enc = STR_ENC_GET(str);
9294     const char *p, *p2, *beg, *end;
9295
9296     beg = RSTRING_PTR(str);
9297     end = beg + RSTRING_LEN(str);
9298     if (beg >= end) return 0;
9299     p = rb_enc_prev_char(beg, end, end, enc);
9300     if (!p) return 0;
9301     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9302         p2 = rb_enc_prev_char(beg, p, end, enc);
9303         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9304     }
9305     return p - beg;
9306 }
9307
9308 /*
9309  *  call-seq:
9310  *     str.chop!   -> str or nil
9311  *
9312  *  Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9313  *  <code>nil</code> if <i>str</i> is the empty string.  See also
9314  *  String#chomp!.
9315  */
9316
9317 static VALUE
9318 rb_str_chop_bang(VALUE str)
9319 {
9320     str_modify_keep_cr(str);
9321     if (RSTRING_LEN(str) > 0) {
9322         long len;
9323         len = chopped_length(str);
9324         STR_SET_LEN(str, len);
9325         TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9326         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9327             ENC_CODERANGE_CLEAR(str);
9328         }
9329         return str;
9330     }
9331     return Qnil;
9332 }
9333
9334
9335 /*
9336  *  call-seq:
9337  *     str.chop   -> new_str
9338  *
9339  *  Returns a new String with the last character removed.  If the
9340  *  string ends with <code>\r\n</code>, both characters are
9341  *  removed. Applying <code>chop</code> to an empty string returns an
9342  *  empty string. String#chomp is often a safer alternative, as it
9343  *  leaves the string unchanged if it doesn't end in a record
9344  *  separator.
9345  *
9346  *     "string\r\n".chop   #=> "string"
9347  *     "string\n\r".chop   #=> "string\n"
9348  *     "string\n".chop     #=> "string"
9349  *     "string".chop       #=> "strin"
9350  *     "x".chop.chop       #=> ""
9351  */
9352
9353 static VALUE
9354 rb_str_chop(VALUE str)
9355 {
9356     return rb_str_subseq(str, 0, chopped_length(str));
9357 }
9358
9359 static long
9360 smart_chomp(VALUE str, const char *e, const char *p)
9361 {
9362     rb_encoding *enc = rb_enc_get(str);
9363     if (rb_enc_mbminlen(enc) > 1) {
9364         const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9365         if (rb_enc_is_newline(pp, e, enc)) {
9366             e = pp;
9367         }
9368         pp = e - rb_enc_mbminlen(enc);
9369         if (pp >= p) {
9370             pp = rb_enc_left_char_head(p, pp, e, enc);
9371             if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9372                 e = pp;
9373             }
9374         }
9375     }
9376     else {
9377         switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9378           case '\n':
9379             if (--e > p && *(e-1) == '\r') {
9380                 --e;
9381             }
9382             break;
9383           case '\r':
9384             --e;
9385             break;
9386         }
9387     }
9388     return e - p;
9389 }
9390
9391 static long
9392 chompped_length(VALUE str, VALUE rs)
9393 {
9394     rb_encoding *enc;
9395     int newline;
9396     char *pp, *e, *rsptr;
9397     long rslen;
9398     char *const p = RSTRING_PTR(str);
9399     long len = RSTRING_LEN(str);
9400
9401     if (len == 0) return 0;
9402     e = p + len;
9403     if (rs == rb_default_rs) {
9404         return smart_chomp(str, e, p);
9405     }
9406
9407     enc = rb_enc_get(str);
9408     RSTRING_GETMEM(rs, rsptr, rslen);
9409     if (rslen == 0) {
9410         if (rb_enc_mbminlen(enc) > 1) {
9411             while (e > p) {
9412                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9413                 if (!rb_enc_is_newline(pp, e, enc)) break;
9414                 e = pp;
9415                 pp -= rb_enc_mbminlen(enc);
9416                 if (pp >= p) {
9417                     pp = rb_enc_left_char_head(p, pp, e, enc);
9418                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9419                         e = pp;
9420                     }
9421                 }
9422             }
9423         }
9424         else {
9425             while (e > p && *(e-1) == '\n') {
9426                 --e;
9427                 if (e > p && *(e-1) == '\r')
9428                     --e;
9429             }
9430         }
9431         return e - p;
9432     }
9433     if (rslen > len) return len;
9434
9435     enc = rb_enc_get(rs);
9436     newline = rsptr[rslen-1];
9437     if (rslen == rb_enc_mbminlen(enc)) {
9438         if (rslen == 1) {
9439             if (newline == '\n')
9440                 return smart_chomp(str, e, p);
9441         }
9442         else {
9443             if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9444                 return smart_chomp(str, e, p);
9445         }
9446     }
9447
9448     enc = rb_enc_check(str, rs);
9449     if (is_broken_string(rs)) {
9450         return len;
9451     }
9452     pp = e - rslen;
9453     if (p[len-1] == newline &&
9454         (rslen <= 1 ||
9455          memcmp(rsptr, pp, rslen) == 0)) {
9456         if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9457             return len - rslen;
9458         RB_GC_GUARD(rs);
9459     }
9460     return len;
9461 }
9462
9463 /*!
9464  * Returns the separator for arguments of rb_str_chomp.
9465  *
9466  * @return returns rb_ps ($/) as default, the default value of rb_ps ($/) is "\n".
9467  */
9468 static VALUE
9469 chomp_rs(int argc, const VALUE *argv)
9470 {
9471     rb_check_arity(argc, 0, 1);
9472     if (argc > 0) {
9473         VALUE rs = argv[0];
9474         if (!NIL_P(rs)) StringValue(rs);
9475         return rs;
9476     }
9477     else {
9478         return rb_rs;
9479     }
9480 }
9481
9482 VALUE
9483 rb_str_chomp_string(VALUE str, VALUE rs)
9484 {
9485     long olen = RSTRING_LEN(str);
9486     long len = chompped_length(str, rs);
9487     if (len >= olen) return Qnil;
9488     str_modify_keep_cr(str);
9489     STR_SET_LEN(str, len);
9490     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9491     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9492         ENC_CODERANGE_CLEAR(str);
9493     }
9494     return str;
9495 }
9496
9497 /*
9498  *  call-seq:
9499  *     str.chomp!(separator=$/)   -> str or nil
9500  *
9501  *  Modifies <i>str</i> in place as described for String#chomp,
9502  *  returning <i>str</i>, or <code>nil</code> if no modifications were
9503  *  made.
9504  */
9505
9506 static VALUE
9507 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9508 {
9509     VALUE rs;
9510     str_modifiable(str);
9511     if (RSTRING_LEN(str) == 0) return Qnil;
9512     rs = chomp_rs(argc, argv);
9513     if (NIL_P(rs)) return Qnil;
9514     return rb_str_chomp_string(str, rs);
9515 }
9516
9517
9518 /*
9519  *  call-seq:
9520  *     str.chomp(separator=$/)   -> new_str
9521  *
9522  *  Returns a new String with the given record separator removed
9523  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
9524  *  changed from the default Ruby record separator, then <code>chomp</code> also
9525  *  removes carriage return characters (that is, it will remove <code>\n</code>,
9526  *  <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9527  *  it will remove all trailing newlines from the string.
9528  *
9529  *     "hello".chomp                #=> "hello"
9530  *     "hello\n".chomp              #=> "hello"
9531  *     "hello\r\n".chomp            #=> "hello"
9532  *     "hello\n\r".chomp            #=> "hello\n"
9533  *     "hello\r".chomp              #=> "hello"
9534  *     "hello \n there".chomp       #=> "hello \n there"
9535  *     "hello".chomp("llo")         #=> "he"
9536  *     "hello\r\n\r\n".chomp('')    #=> "hello"
9537  *     "hello\r\n\r\r\n".chomp('')  #=> "hello\r\n\r"
9538  */
9539
9540 static VALUE
9541 rb_str_chomp(int argc, VALUE *argv, VALUE str)
9542 {
9543     VALUE rs = chomp_rs(argc, argv);
9544     if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9545     return rb_str_subseq(str, 0, chompped_length(str, rs));
9546 }
9547
9548 static long
9549 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9550 {
9551     const char *const start = s;
9552
9553     if (!s || s >= e) return 0;
9554
9555     /* remove spaces at head */
9556     if (single_byte_optimizable(str)) {
9557         while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9558     }
9559     else {
9560         while (s < e) {
9561             int n;
9562             unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9563
9564             if (cc && !rb_isspace(cc)) break;
9565             s += n;
9566         }
9567     }
9568     return s - start;
9569 }
9570
9571 /*
9572  *  call-seq:
9573  *     str.lstrip!   -> self or nil
9574  *
9575  *  Removes leading whitespace from the receiver.
9576  *  Returns the altered receiver, or +nil+ if no change was made.
9577  *  See also String#rstrip! and String#strip!.
9578  *
9579  *  Refer to String#strip for the definition of whitespace.
9580  *
9581  *     "  hello  ".lstrip!  #=> "hello  "
9582  *     "hello  ".lstrip!    #=> nil
9583  *     "hello".lstrip!      #=> nil
9584  */
9585
9586 static VALUE
9587 rb_str_lstrip_bang(VALUE str)
9588 {
9589     rb_encoding *enc;
9590     char *start, *s;
9591     long olen, loffset;
9592
9593     str_modify_keep_cr(str);
9594     enc = STR_ENC_GET(str);
9595     RSTRING_GETMEM(str, start, olen);
9596     loffset = lstrip_offset(str, start, start+olen, enc);
9597     if (loffset > 0) {
9598         long len = olen-loffset;
9599         s = start + loffset;
9600         memmove(start, s, len);
9601         STR_SET_LEN(str, len);
9602         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9603         return str;
9604     }
9605     return Qnil;
9606 }
9607
9608
9609 /*
9610  *  call-seq:
9611  *     str.lstrip   -> new_str
9612  *
9613  *  Returns a copy of the receiver with leading whitespace removed.
9614  *  See also String#rstrip and String#strip.
9615  *
9616  *  Refer to String#strip for the definition of whitespace.
9617  *
9618  *     "  hello  ".lstrip   #=> "hello  "
9619  *     "hello".lstrip       #=> "hello"
9620  */
9621
9622 static VALUE
9623 rb_str_lstrip(VALUE str)
9624 {
9625     char *start;
9626     long len, loffset;
9627     RSTRING_GETMEM(str, start, len);
9628     loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9629     if (loffset <= 0) return str_duplicate(rb_cString, str);
9630     return rb_str_subseq(str, loffset, len - loffset);
9631 }
9632
9633 static long
9634 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9635 {
9636     const char *t;
9637
9638     rb_str_check_dummy_enc(enc);
9639     if (!s || s >= e) return 0;
9640     t = e;
9641
9642     /* remove trailing spaces or '\0's */
9643     if (single_byte_optimizable(str)) {
9644         unsigned char c;
9645         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9646     }
9647     else {
9648         char *tp;
9649
9650         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9651             unsigned int c = rb_enc_codepoint(tp, e, enc);
9652             if (c && !rb_isspace(c)) break;
9653             t = tp;
9654         }
9655     }
9656     return e - t;
9657 }
9658
9659 /*
9660  *  call-seq:
9661  *     str.rstrip!   -> self or nil
9662  *
9663  *  Removes trailing whitespace from the receiver.
9664  *  Returns the altered receiver, or +nil+ if no change was made.
9665  *  See also String#lstrip! and String#strip!.
9666  *
9667  *  Refer to String#strip for the definition of whitespace.
9668  *
9669  *     "  hello  ".rstrip!  #=> "  hello"
9670  *     "  hello".rstrip!    #=> nil
9671  *     "hello".rstrip!      #=> nil
9672  */
9673
9674 static VALUE
9675 rb_str_rstrip_bang(VALUE str)
9676 {
9677     rb_encoding *enc;
9678     char *start;
9679     long olen, roffset;
9680
9681     str_modify_keep_cr(str);
9682     enc = STR_ENC_GET(str);
9683     RSTRING_GETMEM(str, start, olen);
9684     roffset = rstrip_offset(str, start, start+olen, enc);
9685     if (roffset > 0) {
9686         long len = olen - roffset;
9687
9688         STR_SET_LEN(str, len);
9689         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9690         return str;
9691     }
9692     return Qnil;
9693 }
9694
9695
9696 /*
9697  *  call-seq:
9698  *     str.rstrip   -> new_str
9699  *
9700  *  Returns a copy of the receiver with trailing whitespace removed.
9701  *  See also String#lstrip and String#strip.
9702  *
9703  *  Refer to String#strip for the definition of whitespace.
9704  *
9705  *     "  hello  ".rstrip   #=> "  hello"
9706  *     "hello".rstrip       #=> "hello"
9707  */
9708
9709 static VALUE
9710 rb_str_rstrip(VALUE str)
9711 {
9712     rb_encoding *enc;
9713     char *start;
9714     long olen, roffset;
9715
9716     enc = STR_ENC_GET(str);
9717     RSTRING_GETMEM(str, start, olen);
9718     roffset = rstrip_offset(str, start, start+olen, enc);
9719
9720     if (roffset <= 0) return str_duplicate(rb_cString, str);
9721     return rb_str_subseq(str, 0, olen-roffset);
9722 }
9723
9724
9725 /*
9726  *  call-seq:
9727  *     str.strip!   -> self or nil
9728  *
9729  *  Removes leading and trailing whitespace from the receiver.
9730  *  Returns the altered receiver, or +nil+ if there was no change.
9731  *
9732  *  Refer to String#strip for the definition of whitespace.
9733  *
9734  *     "  hello  ".strip!  #=> "hello"
9735  *     "hello".strip!      #=> nil
9736  */
9737
9738 static VALUE
9739 rb_str_strip_bang(VALUE str)
9740 {
9741     char *start;
9742     long olen, loffset, roffset;
9743     rb_encoding *enc;
9744
9745     str_modify_keep_cr(str);
9746     enc = STR_ENC_GET(str);
9747     RSTRING_GETMEM(str, start, olen);
9748     loffset = lstrip_offset(str, start, start+olen, enc);
9749     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9750
9751     if (loffset > 0 || roffset > 0) {
9752         long len = olen-roffset;
9753         if (loffset > 0) {
9754             len -= loffset;
9755             memmove(start, start + loffset, len);
9756         }
9757         STR_SET_LEN(str, len);
9758         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9759         return str;
9760     }
9761     return Qnil;
9762 }
9763
9764
9765 /*
9766  *  call-seq:
9767  *     str.strip   -> new_str
9768  *
9769  *  Returns a copy of the receiver with leading and trailing whitespace removed.
9770  *
9771  *  Whitespace is defined as any of the following characters:
9772  *  null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9773  *
9774  *     "    hello    ".strip   #=> "hello"
9775  *     "\tgoodbye\r\n".strip   #=> "goodbye"
9776  *     "\x00\t\n\v\f\r ".strip #=> ""
9777  *     "hello".strip           #=> "hello"
9778  */
9779
9780 static VALUE
9781 rb_str_strip(VALUE str)
9782 {
9783     char *start;
9784     long olen, loffset, roffset;
9785     rb_encoding *enc = STR_ENC_GET(str);
9786
9787     RSTRING_GETMEM(str, start, olen);
9788     loffset = lstrip_offset(str, start, start+olen, enc);
9789     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9790
9791     if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9792     return rb_str_subseq(str, loffset, olen-loffset-roffset);
9793 }
9794
9795 static VALUE
9796 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9797 {
9798     VALUE result, match;
9799     struct re_registers *regs;
9800     int i;
9801     long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9802     if (pos >= 0) {
9803         if (BUILTIN_TYPE(pat) == T_STRING) {
9804             regs = NULL;
9805             end = pos + RSTRING_LEN(pat);
9806         }
9807         else {
9808             match = rb_backref_get();
9809             regs = RMATCH_REGS(match);
9810             pos = BEG(0);
9811             end = END(0);
9812         }
9813         if (pos == end) {
9814             rb_encoding *enc = STR_ENC_GET(str);
9815             /*
9816              * Always consume at least one character of the input string
9817              */
9818             if (RSTRING_LEN(str) > end)
9819                 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9820                                                   RSTRING_END(str), enc);
9821             else
9822                 *start = end + 1;
9823         }
9824         else {
9825             *start = end;
9826         }
9827         if (!regs || regs->num_regs == 1) {
9828             result = rb_str_subseq(str, pos, end - pos);
9829             return result;
9830         }
9831         result = rb_ary_new2(regs->num_regs);
9832         for (i=1; i < regs->num_regs; i++) {
9833             VALUE s = Qnil;
9834             if (BEG(i) >= 0) {
9835                 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9836             }
9837             rb_ary_push(result, s);
9838         }
9839
9840         return result;
9841     }
9842     return Qnil;
9843 }
9844
9845
9846 /*
9847  *  call-seq:
9848  *     str.scan(pattern)                         -> array
9849  *     str.scan(pattern) {|match, ...| block }   -> str
9850  *
9851  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
9852  *  Regexp or a String). For each match, a result is
9853  *  generated and either added to the result array or passed to the block. If
9854  *  the pattern contains no groups, each individual result consists of the
9855  *  matched string, <code>$&</code>.  If the pattern contains groups, each
9856  *  individual result is itself an array containing one entry per group.
9857  *
9858  *     a = "cruel world"
9859  *     a.scan(/\w+/)        #=> ["cruel", "world"]
9860  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
9861  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
9862  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
9863  *
9864  *  And the block form:
9865  *
9866  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
9867  *     print "\n"
9868  *     a.scan(/(.)(.)/) {|x,y| print y, x }
9869  *     print "\n"
9870  *
9871  *  <em>produces:</em>
9872  *
9873  *     <<cruel>> <<world>>
9874  *     rceu lowlr
9875  */
9876
9877 static VALUE
9878 rb_str_scan(VALUE str, VALUE pat)
9879 {
9880     VALUE result;
9881     long start = 0;
9882     long last = -1, prev = 0;
9883     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9884
9885     pat = get_pat_quoted(pat, 1);
9886     mustnot_broken(str);
9887     if (!rb_block_given_p()) {
9888         VALUE ary = rb_ary_new();
9889
9890         while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9891             last = prev;
9892             prev = start;
9893             rb_ary_push(ary, result);
9894         }
9895         if (last >= 0) rb_pat_search(pat, str, last, 1);
9896         else rb_backref_set(Qnil);
9897         return ary;
9898     }
9899
9900     while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9901         last = prev;
9902         prev = start;
9903         rb_yield(result);
9904         str_mod_check(str, p, len);
9905     }
9906     if (last >= 0) rb_pat_search(pat, str, last, 1);
9907     return str;
9908 }
9909
9910
9911 /*
9912  *  call-seq:
9913  *     str.hex   -> integer
9914  *
9915  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
9916  *  (with an optional sign and an optional <code>0x</code>) and returns the
9917  *  corresponding number. Zero is returned on error.
9918  *
9919  *     "0x0a".hex     #=> 10
9920  *     "-1234".hex    #=> -4660
9921  *     "0".hex        #=> 0
9922  *     "wombat".hex   #=> 0
9923  */
9924
9925 static VALUE
9926 rb_str_hex(VALUE str)
9927 {
9928     return rb_str_to_inum(str, 16, FALSE);
9929 }
9930
9931
9932 /*
9933  *  call-seq:
9934  *     str.oct   -> integer
9935  *
9936  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
9937  *  optional sign) and returns the corresponding number.  Returns 0 if the
9938  *  conversion fails.
9939  *
9940  *     "123".oct       #=> 83
9941  *     "-377".oct      #=> -255
9942  *     "bad".oct       #=> 0
9943  *     "0377bad".oct   #=> 255
9944  *
9945  *  If +str+ starts with <code>0</code>, radix indicators are honored.
9946  *  See Kernel#Integer.
9947  */
9948
9949 static VALUE
9950 rb_str_oct(VALUE str)
9951 {
9952     return rb_str_to_inum(str, -8, FALSE);
9953 }
9954
9955 #ifndef HAVE_CRYPT_R
9956 # include "ruby/thread_native.h"
9957 # include "ruby/atomic.h"
9958
9959 static struct {
9960     rb_atomic_t initialized;
9961     rb_nativethread_lock_t lock;
9962 } crypt_mutex;
9963
9964 static void
9965 crypt_mutex_destroy(void)
9966 {
9967     RUBY_ASSERT_ALWAYS(crypt_mutex.initialized == 1);
9968     rb_nativethread_lock_destroy(&crypt_mutex.lock);
9969     crypt_mutex.initialized = 0;
9970 }
9971
9972 static void
9973 crypt_mutex_initialize(void)
9974 {
9975     rb_atomic_t i;
9976     while ((i = RUBY_ATOMIC_CAS(crypt_mutex.initialized, 0, 2)) == 2);
9977     switch (i) {
9978       case 0:
9979         rb_nativethread_lock_initialize(&crypt_mutex.lock);
9980         atexit(crypt_mutex_destroy);
9981         RUBY_ASSERT(crypt_mutex.initialized == 2);
9982         RUBY_ATOMIC_CAS(crypt_mutex.initialized, 2, 1);
9983         break;
9984       case 1:
9985         break;
9986       default:
9987         rb_bug("crypt_mutex.initialized: %d->%d", i, crypt_mutex.initialized);
9988     }
9989 }
9990 #endif
9991
9992 /*
9993  *  call-seq:
9994  *     str.crypt(salt_str)   -> new_str
9995  *
9996  *  Returns the string generated by calling <code>crypt(3)</code>
9997  *  standard library function with <code>str</code> and
9998  *  <code>salt_str</code>, in this order, as its arguments.  Please do
9999  *  not use this method any longer.  It is legacy; provided only for
10000  *  backward compatibility with ruby scripts in earlier days.  It is
10001  *  bad to use in contemporary programs for several reasons:
10002  *
10003  *  * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10004  *    run.  The generated string lacks data portability.
10005  *
10006  *  * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10007  *    (i.e. silently ends up in unexpected results).
10008  *
10009  *  * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10010  *    thread safe.
10011  *
10012  *  * So-called "traditional" usage of <code>crypt(3)</code> is very
10013  *    very very weak.  According to its manpage, Linux's traditional
10014  *    <code>crypt(3)</code> output has only 2**56 variations; too
10015  *    easy to brute force today.  And this is the default behaviour.
10016  *
10017  *  * In order to make things robust some OSes implement so-called
10018  *    "modular" usage. To go through, you have to do a complex
10019  *    build-up of the <code>salt_str</code> parameter, by hand.
10020  *    Failure in generation of a proper salt string tends not to
10021  *    yield any errors; typos in parameters are normally not
10022  *    detectable.
10023  *
10024  *    * For instance, in the following example, the second invocation
10025  *      of String#crypt is wrong; it has a typo in "round=" (lacks
10026  *      "s").  However the call does not fail and something unexpected
10027  *      is generated.
10028  *
10029  *         "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10030  *         "foo".crypt("$5$round=1000$salt$")  # Typo not detected
10031  *
10032  *  * Even in the "modular" mode, some hash functions are considered
10033  *    archaic and no longer recommended at all; for instance module
10034  *    <code>$1$</code> is officially abandoned by its author: see
10035  *    http://phk.freebsd.dk/sagas/md5crypt_eol/ .  For another
10036  *    instance module <code>$3$</code> is considered completely
10037  *    broken: see the manpage of FreeBSD.
10038  *
 *  * On some OSes such as Mac OS, there is no modular mode. Yet, as
 *    written above, <code>crypt(3)</code> on Mac OS never fails.
 *    This means that even if you build up a proper salt string, it
 *    generates a traditional DES hash anyway, and there is no way
 *    for you to be aware of it.
10044  *
10045  *        "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10046  *
10047  *  If for some reason you cannot migrate to other secure contemporary
10048  *  password hashing algorithms, install the string-crypt gem and
10049  *  <code>require 'string/crypt'</code> to continue using it.
10050  */
10051
10052 static VALUE
10053 rb_str_crypt(VALUE str, VALUE salt)
10054 {
10055 #ifdef HAVE_CRYPT_R
10056     VALUE databuf;
10057     struct crypt_data *data;
10058 #   define CRYPT_END() ALLOCV_END(databuf)
10059 #else
10060     extern char *crypt(const char *, const char *);
10061 #   define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10062 #endif
10063     VALUE result;
10064     const char *s, *saltp;
10065     char *res;
10066 #ifdef BROKEN_CRYPT
10067     char salt_8bit_clean[3];
10068 #endif
10069
10070     StringValue(salt);
10071     mustnot_wchar(str);
10072     mustnot_wchar(salt);
10073     s = StringValueCStr(str);
10074     saltp = RSTRING_PTR(salt);
10075     if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10076         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10077     }
10078
10079 #ifdef BROKEN_CRYPT
10080     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10081         salt_8bit_clean[0] = saltp[0] & 0x7f;
10082         salt_8bit_clean[1] = saltp[1] & 0x7f;
10083         salt_8bit_clean[2] = '\0';
10084         saltp = salt_8bit_clean;
10085     }
10086 #endif
10087 #ifdef HAVE_CRYPT_R
10088     data = ALLOCV(databuf, sizeof(struct crypt_data));
10089 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10090     data->initialized = 0;
10091 # endif
10092     res = crypt_r(s, saltp, data);
10093 #else
10094     crypt_mutex_initialize();
10095     rb_nativethread_lock_lock(&crypt_mutex.lock);
10096     res = crypt(s, saltp);
10097 #endif
10098     if (!res) {
10099         int err = errno;
10100         CRYPT_END();
10101         rb_syserr_fail(err, "crypt");
10102     }
10103     result = rb_str_new_cstr(res);
10104     CRYPT_END();
10105     return result;
10106 }
10107
10108
10109 /*
10110  *  call-seq:
10111  *     str.ord   -> integer
10112  *
10113  *  Returns the Integer ordinal of a one-character string.
10114  *
10115  *     "a".ord         #=> 97
10116  */
10117
10118 static VALUE
10119 rb_str_ord(VALUE s)
10120 {
10121     unsigned int c;
10122
10123     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10124     return UINT2NUM(c);
10125 }
10126 /*
10127  *  call-seq:
10128  *     str.sum(n=16)   -> integer
10129  *
10130  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10131  *  where <em>n</em> is the optional Integer parameter, defaulting
 *  to 16. The result is simply the sum of the binary value of each byte in
 *  <i>str</i> modulo <code>2**n</code> (that is, masked with
 *  <code>2**n - 1</code>). This is not a particularly good checksum.
10135  */
10136
10137 static VALUE
10138 rb_str_sum(int argc, VALUE *argv, VALUE str)
10139 {
10140     int bits = 16;
10141     char *ptr, *p, *pend;
10142     long len;
10143     VALUE sum = INT2FIX(0);
10144     unsigned long sum0 = 0;
10145
10146     if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10147         bits = 0;
10148     }
10149     ptr = p = RSTRING_PTR(str);
10150     len = RSTRING_LEN(str);
10151     pend = p + len;
10152
10153     while (p < pend) {
10154         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10155             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10156             str_mod_check(str, ptr, len);
10157             sum0 = 0;
10158         }
10159         sum0 += (unsigned char)*p;
10160         p++;
10161     }
10162
10163     if (bits == 0) {
10164         if (sum0) {
10165             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10166         }
10167     }
10168     else {
10169         if (sum == INT2FIX(0)) {
10170             if (bits < (int)sizeof(long)*CHAR_BIT) {
10171                 sum0 &= (((unsigned long)1)<<bits)-1;
10172             }
10173             sum = LONG2FIX(sum0);
10174         }
10175         else {
10176             VALUE mod;
10177
10178             if (sum0) {
10179                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10180             }
10181
10182             mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10183             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10184             sum = rb_funcall(sum, '&', 1, mod);
10185         }
10186     }
10187     return sum;
10188 }
10189
10190 static VALUE
10191 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10192 {
10193     rb_encoding *enc;
10194     VALUE w;
10195     long width, len, flen = 1, fclen = 1;
10196     VALUE res;
10197     char *p;
10198     const char *f = " ";
10199     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10200     VALUE pad;
10201     int singlebyte = 1, cr;
10202     int termlen;
10203
10204     rb_scan_args(argc, argv, "11", &w, &pad);
10205     enc = STR_ENC_GET(str);
10206     termlen = rb_enc_mbminlen(enc);
10207     width = NUM2LONG(w);
10208     if (argc == 2) {
10209         StringValue(pad);
10210         enc = rb_enc_check(str, pad);
10211         f = RSTRING_PTR(pad);
10212         flen = RSTRING_LEN(pad);
10213         fclen = str_strlen(pad, enc); /* rb_enc_check */
10214         singlebyte = single_byte_optimizable(pad);
10215         if (flen == 0 || fclen == 0) {
10216             rb_raise(rb_eArgError, "zero width padding");
10217         }
10218     }
10219     len = str_strlen(str, enc); /* rb_enc_check */
10220     if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10221     n = width - len;
10222     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10223     rlen = n - llen;
10224     cr = ENC_CODERANGE(str);
10225     if (flen > 1) {
10226        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10227        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10228     }
10229     size = RSTRING_LEN(str);
10230     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10231        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10232        (len += llen2 + rlen2) >= LONG_MAX - size) {
10233        rb_raise(rb_eArgError, "argument too big");
10234     }
10235     len += size;
10236     res = str_new0(rb_cString, 0, len, termlen);
10237     p = RSTRING_PTR(res);
10238     if (flen <= 1) {
10239        memset(p, *f, llen);
10240        p += llen;
10241     }
10242     else {
10243        while (llen >= fclen) {
10244             memcpy(p,f,flen);
10245             p += flen;
10246             llen -= fclen;
10247         }
10248        if (llen > 0) {
10249            memcpy(p, f, llen2);
10250            p += llen2;
10251         }
10252     }
10253     memcpy(p, RSTRING_PTR(str), size);
10254     p += size;
10255     if (flen <= 1) {
10256        memset(p, *f, rlen);
10257        p += rlen;
10258     }
10259     else {
10260        while (rlen >= fclen) {
10261             memcpy(p,f,flen);
10262             p += flen;
10263             rlen -= fclen;
10264         }
10265        if (rlen > 0) {
10266            memcpy(p, f, rlen2);
10267            p += rlen2;
10268         }
10269     }
10270     TERM_FILL(p, termlen);
10271     STR_SET_LEN(res, p-RSTRING_PTR(res));
10272     rb_enc_associate(res, enc);
10273     if (argc == 2)
10274         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10275     if (cr != ENC_CODERANGE_BROKEN)
10276         ENC_CODERANGE_SET(res, cr);
10277
10278     RB_GC_GUARD(pad);
10279     return res;
10280 }
10281
10282
10283 /*
10284  *  call-seq:
10285  *     str.ljust(integer, padstr=' ')   -> new_str
10286  *
10287  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10288  *  String of length <i>integer</i> with <i>str</i> left justified
10289  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10290  *
10291  *     "hello".ljust(4)            #=> "hello"
10292  *     "hello".ljust(20)           #=> "hello               "
10293  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
10294  */
10295
10296 static VALUE
10297 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10298 {
10299     return rb_str_justify(argc, argv, str, 'l');
10300 }
10301
10302
10303 /*
10304  *  call-seq:
10305  *     str.rjust(integer, padstr=' ')   -> new_str
10306  *
10307  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10308  *  String of length <i>integer</i> with <i>str</i> right justified
10309  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10310  *
10311  *     "hello".rjust(4)            #=> "hello"
10312  *     "hello".rjust(20)           #=> "               hello"
10313  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
10314  */
10315
10316 static VALUE
10317 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10318 {
10319     return rb_str_justify(argc, argv, str, 'r');
10320 }
10321
10322
10323 /*
10324  *  call-seq:
10325  *     str.center(width, padstr=' ')   -> new_str
10326  *
10327  *  Centers +str+ in +width+.  If +width+ is greater than the length of +str+,
10328  *  returns a new String of length +width+ with +str+ centered and padded with
10329  *  +padstr+; otherwise, returns +str+.
10330  *
10331  *     "hello".center(4)         #=> "hello"
10332  *     "hello".center(20)        #=> "       hello        "
10333  *     "hello".center(20, '123') #=> "1231231hello12312312"
10334  */
10335
10336 static VALUE
10337 rb_str_center(int argc, VALUE *argv, VALUE str)
10338 {
10339     return rb_str_justify(argc, argv, str, 'c');
10340 }
10341
10342 /*
10343  *  call-seq:
10344  *     str.partition(sep)              -> [head, sep, tail]
10345  *     str.partition(regexp)           -> [head, match, tail]
10346  *
10347  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10348  *  and returns the part before it, the match, and the part
10349  *  after it.
 *  If it is not found, returns <i>str</i> and two empty strings.
10351  *
10352  *     "hello".partition("l")         #=> ["he", "l", "lo"]
10353  *     "hello".partition("x")         #=> ["hello", "", ""]
10354  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
10355  */
10356
10357 static VALUE
10358 rb_str_partition(VALUE str, VALUE sep)
10359 {
10360     long pos;
10361
10362     sep = get_pat_quoted(sep, 0);
10363     if (RB_TYPE_P(sep, T_REGEXP)) {
10364         if (rb_reg_search(sep, str, 0, 0) < 0) {
10365             goto failed;
10366         }
10367         VALUE match = rb_backref_get();
10368         struct re_registers *regs = RMATCH_REGS(match);
10369
10370         pos = BEG(0);
10371         sep = rb_str_subseq(str, pos, END(0) - pos);
10372     }
10373     else {
10374         pos = rb_str_index(str, sep, 0);
10375         if (pos < 0) goto failed;
10376     }
10377     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10378                           sep,
10379                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
10380                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10381
10382   failed:
10383     return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10384 }
10385
10386 /*
10387  *  call-seq:
10388  *     str.rpartition(sep)             -> [head, sep, tail]
10389  *     str.rpartition(regexp)          -> [head, match, tail]
10390  *
10391  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10392  *  of the string, and returns the part before it, the match, and the part
10393  *  after it.
10394  *  If it is not found, returns two empty strings and <i>str</i>.
10395  *
10396  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
10397  *     "hello".rpartition("x")         #=> ["", "", "hello"]
10398  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
10399  *
 *  Matching from the end means that the match starts at the last possible
 *  position; it is not necessarily the last of the longest matches.
10402  *
10403  *     "hello".rpartition(/l+/)        #=> ["hel", "l", "o"]
10404  *
 *  To partition at the last longest match, combine the pattern with a
 *  negative lookbehind.
10407  *
10408  *     "hello".rpartition(/(?<!l)l+/)  #=> ["he", "ll", "o"]
10409  *
 *  Or use String#partition with a negative lookahead.
10411  *
10412  *     "hello".partition(/l+(?!.*l)/)  #=> ["he", "ll", "o"]
10413  */
10414
10415 static VALUE
10416 rb_str_rpartition(VALUE str, VALUE sep)
10417 {
10418     long pos = RSTRING_LEN(str);
10419
10420     sep = get_pat_quoted(sep, 0);
10421     if (RB_TYPE_P(sep, T_REGEXP)) {
10422         if (rb_reg_search(sep, str, pos, 1) < 0) {
10423             goto failed;
10424         }
10425         VALUE match = rb_backref_get();
10426         struct re_registers *regs = RMATCH_REGS(match);
10427
10428         pos = BEG(0);
10429         sep = rb_str_subseq(str, pos, END(0) - pos);
10430     }
10431     else {
10432         pos = rb_str_sublen(str, pos);
10433         pos = rb_str_rindex(str, sep, pos);
10434         if (pos < 0) {
10435             goto failed;
10436         }
10437         pos = rb_str_offset(str, pos);
10438     }
10439
10440     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10441                           sep,
10442                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
10443                                         RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10444   failed:
10445     return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10446 }
10447
10448 /*
10449  *  call-seq:
10450  *     str.start_with?([prefixes]+)   -> true or false
10451  *
10452  *  Returns true if +str+ starts with one of the +prefixes+ given.
10453  *  Each of the +prefixes+ should be a String or a Regexp.
10454  *
10455  *    "hello".start_with?("hell")               #=> true
10456  *    "hello".start_with?(/H/i)                 #=> true
10457  *
10458  *    # returns true if one of the prefixes matches.
10459  *    "hello".start_with?("heaven", "hell")     #=> true
10460  *    "hello".start_with?("heaven", "paradise") #=> false
10461  */
10462
10463 static VALUE
10464 rb_str_start_with(int argc, VALUE *argv, VALUE str)
10465 {
10466     int i;
10467
10468     for (i=0; i<argc; i++) {
10469         VALUE tmp = argv[i];
10470         if (RB_TYPE_P(tmp, T_REGEXP)) {
10471             if (rb_reg_start_with_p(tmp, str))
10472                 return Qtrue;
10473         }
10474         else {
10475             StringValue(tmp);
10476             rb_enc_check(str, tmp);
10477             if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10478             if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10479                 return Qtrue;
10480         }
10481     }
10482     return Qfalse;
10483 }
10484
10485 /*
10486  *  call-seq:
10487  *     str.end_with?([suffixes]+)   -> true or false
10488  *
10489  *  Returns true if +str+ ends with one of the +suffixes+ given.
10490  *
10491  *    "hello".end_with?("ello")               #=> true
10492  *
10493  *    # returns true if one of the +suffixes+ matches.
10494  *    "hello".end_with?("heaven", "ello")     #=> true
10495  *    "hello".end_with?("heaven", "paradise") #=> false
10496  */
10497
10498 static VALUE
10499 rb_str_end_with(int argc, VALUE *argv, VALUE str)
10500 {
10501     int i;
10502     char *p, *s, *e;
10503     rb_encoding *enc;
10504
10505     for (i=0; i<argc; i++) {
10506         VALUE tmp = argv[i];
10507         long slen, tlen;
10508         StringValue(tmp);
10509         enc = rb_enc_check(str, tmp);
10510         if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10511         if ((slen = RSTRING_LEN(str)) < tlen) continue;
10512         p = RSTRING_PTR(str);
10513         e = p + slen;
10514         s = e - tlen;
10515         if (rb_enc_left_char_head(p, s, e, enc) != s)
10516             continue;
10517         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10518             return Qtrue;
10519     }
10520     return Qfalse;
10521 }
10522
10523 /*!
10524  * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10525  * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10526  *
10527  * @param str the target
10528  * @param prefix the prefix
10529  * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10530  * @retval Positive-Integer otherwise
10531  */
10532 static long
10533 deleted_prefix_length(VALUE str, VALUE prefix)
10534 {
10535     char *strptr, *prefixptr;
10536     long olen, prefixlen;
10537
10538     StringValue(prefix);
10539     if (is_broken_string(prefix)) return 0;
10540     rb_enc_check(str, prefix);
10541
10542     /* return 0 if not start with prefix */
10543     prefixlen = RSTRING_LEN(prefix);
10544     if (prefixlen <= 0) return 0;
10545     olen = RSTRING_LEN(str);
10546     if (olen < prefixlen) return 0;
10547     strptr = RSTRING_PTR(str);
10548     prefixptr = RSTRING_PTR(prefix);
10549     if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10550
10551     return prefixlen;
10552 }
10553
10554 /*
10555  *  call-seq:
10556  *     str.delete_prefix!(prefix) -> self or nil
10557  *
10558  *  Deletes leading <code>prefix</code> from <i>str</i>, returning
10559  *  <code>nil</code> if no change was made.
10560  *
10561  *     "hello".delete_prefix!("hel") #=> "lo"
10562  *     "hello".delete_prefix!("llo") #=> nil
10563  */
10564
10565 static VALUE
10566 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10567 {
10568     long prefixlen;
10569     str_modify_keep_cr(str);
10570
10571     prefixlen = deleted_prefix_length(str, prefix);
10572     if (prefixlen <= 0) return Qnil;
10573
10574     return rb_str_drop_bytes(str, prefixlen);
10575 }
10576
10577 /*
10578  *  call-seq:
10579  *     str.delete_prefix(prefix) -> new_str
10580  *
10581  *  Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10582  *
10583  *     "hello".delete_prefix("hel") #=> "lo"
10584  *     "hello".delete_prefix("llo") #=> "hello"
10585  */
10586
10587 static VALUE
10588 rb_str_delete_prefix(VALUE str, VALUE prefix)
10589 {
10590     long prefixlen;
10591
10592     prefixlen = deleted_prefix_length(str, prefix);
10593     if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10594
10595     return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10596 }
10597
10598 /*!
10599  * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10600  * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10601  *
10602  * @param str the target
10603  * @param suffix the suffix
10604  * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10605  * @retval Positive-Integer otherwise
10606  */
10607 static long
10608 deleted_suffix_length(VALUE str, VALUE suffix)
10609 {
10610     char *strptr, *suffixptr, *s;
10611     long olen, suffixlen;
10612     rb_encoding *enc;
10613
10614     StringValue(suffix);
10615     if (is_broken_string(suffix)) return 0;
10616     enc = rb_enc_check(str, suffix);
10617
10618     /* return 0 if not start with suffix */
10619     suffixlen = RSTRING_LEN(suffix);
10620     if (suffixlen <= 0) return 0;
10621     olen = RSTRING_LEN(str);
10622     if (olen < suffixlen) return 0;
10623     strptr = RSTRING_PTR(str);
10624     suffixptr = RSTRING_PTR(suffix);
10625     s = strptr + olen - suffixlen;
10626     if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10627     if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10628
10629     return suffixlen;
10630 }
10631
10632 /*
10633  *  call-seq:
10634  *     str.delete_suffix!(suffix) -> self or nil
10635  *
10636  *  Deletes trailing <code>suffix</code> from <i>str</i>, returning
10637  *  <code>nil</code> if no change was made.
10638  *
10639  *     "hello".delete_suffix!("llo") #=> "he"
10640  *     "hello".delete_suffix!("hel") #=> nil
10641  */
10642
10643 static VALUE
10644 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10645 {
10646     long olen, suffixlen, len;
10647     str_modifiable(str);
10648
10649     suffixlen = deleted_suffix_length(str, suffix);
10650     if (suffixlen <= 0) return Qnil;
10651
10652     olen = RSTRING_LEN(str);
10653     str_modify_keep_cr(str);
10654     len = olen - suffixlen;
10655     STR_SET_LEN(str, len);
10656     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10657     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10658         ENC_CODERANGE_CLEAR(str);
10659     }
10660     return str;
10661 }
10662
10663 /*
10664  *  call-seq:
10665  *     str.delete_suffix(suffix) -> new_str
10666  *
10667  *  Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10668  *
10669  *     "hello".delete_suffix("llo") #=> "he"
10670  *     "hello".delete_suffix("hel") #=> "hello"
10671  */
10672
10673 static VALUE
10674 rb_str_delete_suffix(VALUE str, VALUE suffix)
10675 {
10676     long suffixlen;
10677
10678     suffixlen = deleted_suffix_length(str, suffix);
10679     if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10680
10681     return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10682 }
10683
10684 void
10685 rb_str_setter(VALUE val, ID id, VALUE *var)
10686 {
10687     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10688         rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10689     }
10690     *var = val;
10691 }
10692
10693 static void
10694 rb_fs_setter(VALUE val, ID id, VALUE *var)
10695 {
10696     val = rb_fs_check(val);
10697     if (!val) {
10698         rb_raise(rb_eTypeError,
10699                  "value of %"PRIsVALUE" must be String or Regexp",
10700                  rb_id2str(id));
10701     }
10702     if (!NIL_P(val)) {
10703         rb_warn_deprecated("`$;'", NULL);
10704     }
10705     *var = val;
10706 }
10707
10708
10709 /*
10710  *  call-seq:
10711  *     str.force_encoding(encoding)   -> str
10712  *
10713  *  Changes the encoding to +encoding+ and returns self.
10714  */
10715
10716 static VALUE
10717 rb_str_force_encoding(VALUE str, VALUE enc)
10718 {
10719     str_modifiable(str);
10720     rb_enc_associate(str, rb_to_encoding(enc));
10721     ENC_CODERANGE_CLEAR(str);
10722     return str;
10723 }
10724
10725 /*
10726  *  call-seq:
10727  *     str.b   -> str
10728  *
10729  *  Returns a copied string whose encoding is ASCII-8BIT.
10730  */
10731
10732 static VALUE
10733 rb_str_b(VALUE str)
10734 {
10735     VALUE str2;
10736     if (FL_TEST(str, STR_NOEMBED)) {
10737         str2 = str_alloc_heap(rb_cString);
10738     }
10739     else {
10740         str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10741     }
10742     str_replace_shared_without_enc(str2, str);
10743     ENC_CODERANGE_CLEAR(str2);
10744     return str2;
10745 }
10746
10747 /*
10748  *  call-seq:
10749  *     str.valid_encoding?  -> true or false
10750  *
10751  *  Returns true for a string which is encoded correctly.
10752  *
10753  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
10754  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
10755  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
10756  */
10757
10758 static VALUE
10759 rb_str_valid_encoding_p(VALUE str)
10760 {
10761     int cr = rb_enc_str_coderange(str);
10762
10763     return RBOOL(cr != ENC_CODERANGE_BROKEN);
10764 }
10765
10766 /*
10767  *  call-seq:
10768  *     str.ascii_only?  -> true or false
10769  *
10770  *  Returns true for a string which has only ASCII characters.
10771  *
10772  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
10773  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
10774  */
10775
10776 static VALUE
10777 rb_str_is_ascii_only_p(VALUE str)
10778 {
10779     int cr = rb_enc_str_coderange(str);
10780
10781     return RBOOL(cr == ENC_CODERANGE_7BIT);
10782 }
10783
10784 VALUE
10785 rb_str_ellipsize(VALUE str, long len)
10786 {
10787     static const char ellipsis[] = "...";
10788     const long ellipsislen = sizeof(ellipsis) - 1;
10789     rb_encoding *const enc = rb_enc_get(str);
10790     const long blen = RSTRING_LEN(str);
10791     const char *const p = RSTRING_PTR(str), *e = p + blen;
10792     VALUE estr, ret = 0;
10793
10794     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10795     if (len * rb_enc_mbminlen(enc) >= blen ||
10796         (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10797         ret = str;
10798     }
10799     else if (len <= ellipsislen ||
10800              !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10801         if (rb_enc_asciicompat(enc)) {
10802             ret = rb_str_new(ellipsis, len);
10803             rb_enc_associate(ret, enc);
10804         }
10805         else {
10806             estr = rb_usascii_str_new(ellipsis, len);
10807             ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10808         }
10809     }
10810     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10811         rb_str_cat(ret, ellipsis, ellipsislen);
10812     }
10813     else {
10814         estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10815                              rb_enc_from_encoding(enc), 0, Qnil);
10816         rb_str_append(ret, estr);
10817     }
10818     return ret;
10819 }
10820
10821 static VALUE
10822 str_compat_and_valid(VALUE str, rb_encoding *enc)
10823 {
10824     int cr;
10825     str = StringValue(str);
10826     cr = rb_enc_str_coderange(str);
10827     if (cr == ENC_CODERANGE_BROKEN) {
10828         rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10829     }
10830     else {
10831         rb_encoding *e = STR_ENC_GET(str);
10832         if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10833             rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10834                      rb_enc_name(enc), rb_enc_name(e));
10835         }
10836     }
10837     return str;
10838 }
10839
10840 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10841
10842 VALUE
10843 rb_str_scrub(VALUE str, VALUE repl)
10844 {
10845     rb_encoding *enc = STR_ENC_GET(str);
10846     return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10847 }
10848
10849 VALUE
10850 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10851 {
10852     int cr = ENC_CODERANGE_UNKNOWN;
10853     if (enc == STR_ENC_GET(str)) {
10854         /* cached coderange makes sense only when enc equals the
10855          * actual encoding of str */
10856         cr = ENC_CODERANGE(str);
10857     }
10858     return enc_str_scrub(enc, str, repl, cr);
10859 }
10860
/*
 * Shared implementation behind rb_str_scrub() and rb_enc_str_scrub().
 *
 * Scans the bytes of +str+ as text in +enc+ and builds a new string in
 * which every invalid or truncated byte sequence has been replaced.
 *
 * enc  - encoding used to interpret the bytes of +str+.
 * str  - string to scan; its bytes are only read here, although its
 *        recorded coderange may be set when the scan finds nothing to
 *        replace.
 * repl - replacement string, or nil to use the block (when one is given)
 *        or a built-in default replacement.
 * cr   - coderange of +str+ under +enc+ as supplied by the caller.
 *
 * Returns Qnil when no new string is produced (+cr+ is already clean,
 * +enc+ is a dummy encoding, or the scan reaches the end without finding
 * anything to replace); otherwise the newly built string, tagged with
 * +enc+ and the coderange computed during the scan.
 *
 * Raises ArgumentError when both a block and +repl+ are given.  Validation
 * of replacements is delegated to str_compat_and_valid().
 */
10861 static VALUE
10862 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10863 {
10864     int encidx;
10865     VALUE buf = Qnil;
10866     const char *rep, *p, *e, *p1, *sp;
10867     long replen = -1;
10868     long slen;
10869
          /* replen doubles as a mode flag: it stays -1 until a replacement is
           * chosen, and is set to 0 here to mean "the block supplies it". */
10870     if (rb_block_given_p()) {
10871         if (!NIL_P(repl))
10872             rb_raise(rb_eArgError, "both of block and replacement given");
10873         replen = 0;
10874     }
10875
          /* Nothing to do when the supplied coderange is already clean; note
           * that this returns before +repl+ is validated. */
10876     if (ENC_CODERANGE_CLEAN_P(cr))
10877         return Qnil;
10878
10879     if (!NIL_P(repl)) {
10880         repl = str_compat_and_valid(repl, enc);
10881     }
10882
          /* Dummy encodings are never scanned. */
10883     if (rb_enc_dummy_p(enc)) {
10884         return Qnil;
10885     }
10886     encidx = rb_enc_to_index(enc);
10887
          /* Points rep/replen at a static byte array initialized from the
           * literal; the array is sized sizeof(str)-1, so it carries no
           * terminating NUL. */
10888 #define DEFAULT_REPLACE_CHAR(str) do { \
10889         static const char replace[sizeof(str)-1] = str; \
10890         rep = replace; replen = (int)sizeof(replace); \
10891     } while (0)
10892
          /* p  : scan position; e: end of the bytes.
           * p1 : start of the run that has not yet been copied into buf.
           * sp/slen: original pointer and length, handed to str_mod_check()
           *          after each yield (presumably to detect that the block
           *          modified str -- confirm against str_mod_check). */
10893     slen = RSTRING_LEN(str);
10894     p = RSTRING_PTR(str);
10895     e = RSTRING_END(str);
10896     p1 = p;
10897     sp = p;
10898
10899     if (rb_enc_asciicompat(enc)) {
10900         int rep7bit_p;
              /* Choose the replacement: block (rep == NULL), explicit +repl+,
               * or a default that depends on whether enc is UTF-8. */
10901         if (!replen) {
10902             rep = NULL;
10903             rep7bit_p = FALSE;
10904         }
10905         else if (!NIL_P(repl)) {
10906             rep = RSTRING_PTR(repl);
10907             replen = RSTRING_LEN(repl);
10908             rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10909         }
10910         else if (encidx == rb_utf8_encindex()) {
10911             DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10912             rep7bit_p = FALSE;
10913         }
10914         else {
10915             DEFAULT_REPLACE_CHAR("?");
10916             rep7bit_p = TRUE;
10917         }
              /* cr is recomputed for the result: it starts at 7BIT and is
               * raised to VALID once a multibyte character or a non-7bit
               * replacement is seen. */
10918         cr = ENC_CODERANGE_7BIT;
10919
              /* Jump over the leading run reported by search_nonascii()
               * (presumably pure ASCII; NULL is treated as "none left"). */
10920         p = search_nonascii(p, e);
10921         if (!p) {
10922             p = e;
10923         }
10924         while (p < e) {
10925             int ret = rb_enc_precise_mbclen(p, e, enc);
10926             if (MBCLEN_NEEDMORE_P(ret)) {
                      /* incomplete character at the end; handled after the loop */
10927                 break;
10928             }
10929             else if (MBCLEN_CHARFOUND_P(ret)) {
10930                 cr = ENC_CODERANGE_VALID;
10931                 p += MBCLEN_CHARFOUND_LEN(ret);
10932             }
10933             else if (MBCLEN_INVALID_P(ret)) {
10934                 /*
10935                  * p1~p: valid ascii/multibyte chars
10936                  * p ~e: invalid bytes + unknown bytes
10937                  */
10938                 long clen = rb_enc_mbmaxlen(enc);
10939                 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10940                 if (p > p1) {
10941                     rb_str_buf_cat(buf, p1, p - p1);
10942                 }
10943
                      /* Determine how many bytes (clen) to treat as one invalid
                       * unit: never more than remain, a single byte when at most
                       * two candidates are available, otherwise shrink the
                       * candidate length until the prefix is reported as
                       * "needs more" rather than "invalid" (or until 1). */
10944                 if (e - p < clen) clen = e - p;
10945                 if (clen <= 2) {
10946                     clen = 1;
10947                 }
10948                 else {
10949                     const char *q = p;
10950                     clen--;
10951                     for (; clen > 1; clen--) {
10952                         ret = rb_enc_precise_mbclen(q, q + clen, enc);
10953                         if (MBCLEN_NEEDMORE_P(ret)) break;
10954                         if (MBCLEN_INVALID_P(ret)) continue;
10955                         UNREACHABLE;
10956                     }
10957                 }
10958                 if (rep) {
10959                     rb_str_buf_cat(buf, rep, replen);
10960                     if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10961                 }
10962                 else {
                          /* Block form: yield the invalid bytes, then validate
                           * the block's result before appending it. */
10963                     repl = rb_yield(rb_enc_str_new(p, clen, enc));
10964                     str_mod_check(str, sp, slen);
10965                     repl = str_compat_and_valid(repl, enc);
10966                     rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10967                     if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10968                         cr = ENC_CODERANGE_VALID;
10969                 }
                      /* Resume after the replaced unit and skip ahead again. */
10970                 p += clen;
10971                 p1 = p;
10972                 p = search_nonascii(p, e);
10973                 if (!p) {
10974                     p = e;
10975                     break;
10976                 }
10977             }
10978             else {
10979                 UNREACHABLE;
10980             }
10981         }
              /* buf is still nil when nothing was replaced inside the loop.  If
               * the scan also reached the end, record the computed coderange
               * on str and report that no new string is needed. */
10982         if (NIL_P(buf)) {
10983             if (p == e) {
10984                 ENC_CODERANGE_SET(str, cr);
10985                 return Qnil;
10986             }
10987             buf = rb_str_buf_new(RSTRING_LEN(str));
10988         }
              /* Copy the trailing run that has not been copied yet. */
10989         if (p1 < p) {
10990             rb_str_buf_cat(buf, p1, p - p1);
10991         }
              /* Bytes left after a "needs more" break are replaced as one unit. */
10992         if (p < e) {
10993             if (rep) {
10994                 rb_str_buf_cat(buf, rep, replen);
10995                 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10996             }
10997             else {
10998                 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
10999                 str_mod_check(str, sp, slen);
11000                 repl = str_compat_and_valid(repl, enc);
11001                 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11002                 if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11003                     cr = ENC_CODERANGE_VALID;
11004             }
11005         }
11006     }
11007     else {
11008         /* ASCII incompatible */
11009         long mbminlen = rb_enc_mbminlen(enc);
              /* Choose the replacement: block, explicit +repl+, or a default
               * selected by encoding index. */
11010         if (!replen) {
11011             rep = NULL;
11012         }
11013         else if (!NIL_P(repl)) {
11014             rep = RSTRING_PTR(repl);
11015             replen = RSTRING_LEN(repl);
11016         }
11017         else if (encidx == ENCINDEX_UTF_16BE) {
11018             DEFAULT_REPLACE_CHAR("\xFF\xFD");
11019         }
11020         else if (encidx == ENCINDEX_UTF_16LE) {
11021             DEFAULT_REPLACE_CHAR("\xFD\xFF");
11022         }
11023         else if (encidx == ENCINDEX_UTF_32BE) {
11024             DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11025         }
11026         else if (encidx == ENCINDEX_UTF_32LE) {
11027             DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11028         }
11029         else {
11030             DEFAULT_REPLACE_CHAR("?");
11031         }
11032
              /* Same scan as the ASCII-compatible branch, but without the
               * search_nonascii() skip and stepping in units of mbminlen. */
11033         while (p < e) {
11034             int ret = rb_enc_precise_mbclen(p, e, enc);
11035             if (MBCLEN_NEEDMORE_P(ret)) {
11036                 break;
11037             }
11038             else if (MBCLEN_CHARFOUND_P(ret)) {
11039                 p += MBCLEN_CHARFOUND_LEN(ret);
11040             }
11041             else if (MBCLEN_INVALID_P(ret)) {
11042                 const char *q = p;
11043                 long clen = rb_enc_mbmaxlen(enc);
11044                 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11045                 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11046
                      /* Size of the invalid unit: one minimum-length character
                       * when at most two fit, otherwise shrink by mbminlen until
                       * the prefix is reported as "needs more" (or down to
                       * mbminlen). */
11047                 if (e - p < clen) clen = e - p;
11048                 if (clen <= mbminlen * 2) {
11049                     clen = mbminlen;
11050                 }
11051                 else {
11052                     clen -= mbminlen;
11053                     for (; clen > mbminlen; clen-=mbminlen) {
11054                         ret = rb_enc_precise_mbclen(q, q + clen, enc);
11055                         if (MBCLEN_NEEDMORE_P(ret)) break;
11056                         if (MBCLEN_INVALID_P(ret)) continue;
11057                         UNREACHABLE;
11058                     }
11059                 }
11060                 if (rep) {
11061                     rb_str_buf_cat(buf, rep, replen);
11062                 }
11063                 else {
11064                     repl = rb_yield(rb_enc_str_new(p, clen, enc));
11065                     str_mod_check(str, sp, slen);
11066                     repl = str_compat_and_valid(repl, enc);
11067                     rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11068                 }
11069                 p += clen;
11070                 p1 = p;
11071             }
11072             else {
11073                 UNREACHABLE;
11074             }
11075         }
              /* Nothing replaced and the end reached: record VALID on str and
               * report that no new string is needed. */
11076         if (NIL_P(buf)) {
11077             if (p == e) {
11078                 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
11079                 return Qnil;
11080             }
11081             buf = rb_str_buf_new(RSTRING_LEN(str));
11082         }
11083         if (p1 < p) {
11084             rb_str_buf_cat(buf, p1, p - p1);
11085         }
              /* Bytes left after a "needs more" break are replaced as one unit. */
11086         if (p < e) {
11087             if (rep) {
11088                 rb_str_buf_cat(buf, rep, replen);
11089             }
11090             else {
11091                 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11092                 str_mod_check(str, sp, slen);
11093                 repl = str_compat_and_valid(repl, enc);
11094                 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11095             }
11096         }
              /* In this branch the result is always tagged VALID. */
11097         cr = ENC_CODERANGE_VALID;
11098     }
          /* Tag the result with the scan encoding and the computed coderange. */
11099     ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11100     return buf;
11101 }
11102
11103 /*
11104  *  call-seq:
11105  *    str.scrub -> new_str
11106  *    str.scrub(repl) -> new_str
11107  *    str.scrub{|bytes|} -> new_str
11108  *
11109  *  If the string contains invalid byte sequences, replaces the invalid bytes with the given
11110  *  replacement character; otherwise returns a copy of the string.
11111  *  If block is given, replace invalid bytes with returned value of the block.
11112  *
11113  *     "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11114  *     "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11115  *     "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11116  */
11117 static VALUE
11118 str_scrub(int argc, VALUE *argv, VALUE str)
11119 {
11120     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11121     VALUE new = rb_str_scrub(str, repl);
11122     return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11123 }
11124
11125 /*
11126  *  call-seq:
11127  *    str.scrub! -> str
11128  *    str.scrub!(repl) -> str
11129  *    str.scrub!{|bytes|} -> str
11130  *
11131  *  If the string contains invalid byte sequences, replaces the invalid bytes in place with the
11132  *  given replacement character. Returns self in either case.
11133  *  If block is given, replace invalid bytes with returned value of the block.
11134  *
11135  *     "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11136  *     "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11137  *     "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11138  */
11139 static VALUE
11140 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11141 {
11142     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11143     VALUE new = rb_str_scrub(str, repl);
11144     if (!NIL_P(new)) rb_str_replace(str, new);
11145     return str;
11146 }
11147
11148 static ID id_normalize;
11149 static ID id_normalized_p;
11150 static VALUE mUnicodeNormalize;
11151
11152 static VALUE
11153 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11154 {
11155     static int UnicodeNormalizeRequired = 0;
11156     VALUE argv2[2];
11157
11158     if (!UnicodeNormalizeRequired) {
11159         rb_require("unicode_normalize/normalize.rb");
11160         UnicodeNormalizeRequired = 1;
11161     }
11162     argv2[0] = str;
11163     if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11164     return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11165 }
11166
11167 /*
11168  *  call-seq:
11169  *    str.unicode_normalize(form=:nfc)
11170  *
11171  *  Unicode Normalization---Returns a normalized form of +str+,
11172  *  using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11173  *  The normalization form used is determined by +form+, which can
11174  *  be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11175  *  The default is +:nfc+.
11176  *
11177  *  If the string is not in a Unicode Encoding, then an Exception is raised.
11178  *  In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11179  *  and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11180  *  Anything other than UTF-8 is implemented by converting to UTF-8,
11181  *  which makes it slower than UTF-8.
11182  *
11183  *    "a\u0300".unicode_normalize        #=> "\u00E0"
11184  *    "a\u0300".unicode_normalize(:nfc)  #=> "\u00E0"
11185  *    "\u00E0".unicode_normalize(:nfd)   #=> "a\u0300"
11186  *    "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11187  *                                       #=> Encoding::CompatibilityError raised
11188  */
11189 static VALUE
11190 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11191 {
11192     return unicode_normalize_common(argc, argv, str, id_normalize);
11193 }
11194
11195 /*
11196  *  call-seq:
11197  *    str.unicode_normalize!(form=:nfc)
11198  *
11199  *  Destructive version of String#unicode_normalize, doing Unicode
11200  *  normalization in place.
11201  */
11202 static VALUE
11203 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11204 {
11205     return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11206 }
11207
11208 /*  call-seq:
11209  *    str.unicode_normalized?(form=:nfc)
11210  *
11211  *  Checks whether +str+ is in Unicode normalization form +form+,
11212  *  which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11213  *  The default is +:nfc+.
11214  *
11215  *  If the string is not in a Unicode Encoding, then an Exception is raised.
11216  *  For details, see String#unicode_normalize.
11217  *
11218  *    "a\u0300".unicode_normalized?        #=> false
11219  *    "a\u0300".unicode_normalized?(:nfd)  #=> true
11220  *    "\u00E0".unicode_normalized?         #=> true
11221  *    "\u00E0".unicode_normalized?(:nfd)   #=> false
11222  *    "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11223  *                                         #=> Encoding::CompatibilityError raised
11224  */
11225 static VALUE
11226 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11227 {
11228     return unicode_normalize_common(argc, argv, str, id_normalized_p);
11229 }
11230
11231 /**********************************************************************
11232  * Document-class: Symbol
11233  *
11234  * Symbol objects represent named identifiers inside the Ruby interpreter.
11235  *
11236  * You can create a \Symbol object explicitly with:
11237  *
11238  * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11239  *
11240  * The same Symbol object will be
11241  * created for a given name or string for the duration of a program's
11242  * execution, regardless of the context or meaning of that name. Thus
11243  * if <code>Fred</code> is a constant in one context, a method in
11244  * another, and a class in a third, the Symbol <code>:Fred</code>
11245  * will be the same object in all three contexts.
11246  *
11247  *     module One
11248  *       class Fred
11249  *       end
11250  *       $f1 = :Fred
11251  *     end
11252  *     module Two
11253  *       Fred = 1
11254  *       $f2 = :Fred
11255  *     end
11256  *     def Fred()
11257  *     end
11258  *     $f3 = :Fred
11259  *     $f1.object_id   #=> 2514190
11260  *     $f2.object_id   #=> 2514190
11261  *     $f3.object_id   #=> 2514190
11262  *
11263  * Constant, method, and variable names are returned as symbols:
11264  *
11265  *     module One
11266  *       Two = 2
11267  *       def three; 3 end
11268  *       @four = 4
11269  *       @@five = 5
11270  *       $six = 6
11271  *     end
11272  *     seven = 7
11273  *
11274  *     One.constants
11275  *     # => [:Two]
11276  *     One.instance_methods(true)
11277  *     # => [:three]
11278  *     One.instance_variables
11279  *     # => [:@four]
11280  *     One.class_variables
11281  *     # => [:@@five]
11282  *     global_variables.grep(/six/)
11283  *     # => [:$six]
11284  *     local_variables
11285  *     # => [:seven]
11286  *
11287  * Symbol objects are different from String objects in that
11288  * Symbol objects represent identifiers, while String objects
11289  * represent text or data.
11290  *
11291  * == What's Here
11292  *
11293  * First, what's elsewhere. \Class \Symbol:
11294  *
11295  * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11296  * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11297  *
11298  * Here, class \Symbol provides methods that are useful for:
11299  *
11300  * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11301  * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11302  * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11303  *
11304  * === Methods for Querying
11305  *
11306  * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11307  * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11308  *                          in symbol that matches a given Regexp
11309  *                          or other object; returns +nil+ if no match is found.
11310  * - #[], #slice :: Returns a substring of symbol
11311  *                  determined by a given index, start/length, or range, or string.
11312  * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11313  * - #encoding:: Returns the Encoding object that represents the encoding
11314  *               of symbol.
11315  * - #end_with?:: Returns +true+ if symbol ends with
11316  *                any of the given strings.
11317  * - #match:: Returns a MatchData object if symbol
11318  *            matches a given Regexp; +nil+ otherwise.
11319  * - #match?:: Returns +true+ if symbol
11320  *             matches a given Regexp; +false+ otherwise.
11321  * - #length, #size:: Returns the number of characters in symbol.
11322  * - #start_with?:: Returns +true+ if symbol starts with
11323  *                  any of the given strings.
11324  *
11325  * === Methods for Comparing
11326  *
11327  * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given symbol is smaller than, equal to, or larger than symbol.
11328  * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11329  *                                  has the same content and encoding.
11330  * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
11331  *              symbol is smaller than, equal to, or larger than symbol.
11332  * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11333  *               after Unicode case folding; +false+ otherwise.
11334  *
11335  * === Methods for Converting
11336  *
11337  * - #capitalize:: Returns symbol with the first character upcased
11338  *                 and all other characters downcased.
11339  * - #downcase:: Returns symbol with all characters downcased.
11340  * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11341  * - #name:: Returns the frozen string corresponding to symbol.
11342  * - #succ, #next:: Returns the symbol that is the successor to symbol.
11343  * - #swapcase:: Returns symbol with all upcase characters downcased
11344  *               and all downcase characters upcased.
11345  * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11346  * - #to_s, #id2name:: Returns the string corresponding to +self+.
11347  * - #to_sym, #intern:: Returns +self+.
11348  * - #upcase:: Returns symbol with all characters upcased.
11349  *
11350  */
11351
11352
11353 /*
11354  *  call-seq:
11355  *     sym == obj   -> true or false
11356  *
11357  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11358  *  symbol, returns <code>true</code>.
11359  */
11360
11361 #define sym_equal rb_obj_equal
11362
11363 static int
11364 sym_printable(const char *s, const char *send, rb_encoding *enc)
11365 {
11366     while (s < send) {
11367         int n;
11368         int c = rb_enc_precise_mbclen(s, send, enc);
11369
11370         if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11371         n = MBCLEN_CHARFOUND_LEN(c);
11372         c = rb_enc_mbc_to_codepoint(s, send, enc);
11373         if (!rb_enc_isprint(c, enc)) return FALSE;
11374         s += n;
11375     }
11376     return TRUE;
11377 }
11378
11379 int
11380 rb_str_symname_p(VALUE sym)
11381 {
11382     rb_encoding *enc;
11383     const char *ptr;
11384     long len;
11385     rb_encoding *resenc = rb_default_internal_encoding();
11386
11387     if (resenc == NULL) resenc = rb_default_external_encoding();
11388     enc = STR_ENC_GET(sym);
11389     ptr = RSTRING_PTR(sym);
11390     len = RSTRING_LEN(sym);
11391     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11392         !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11393         return FALSE;
11394     }
11395     return TRUE;
11396 }
11397
11398 VALUE
11399 rb_str_quote_unprintable(VALUE str)
11400 {
11401     rb_encoding *enc;
11402     const char *ptr;
11403     long len;
11404     rb_encoding *resenc;
11405
11406     Check_Type(str, T_STRING);
11407     resenc = rb_default_internal_encoding();
11408     if (resenc == NULL) resenc = rb_default_external_encoding();
11409     enc = STR_ENC_GET(str);
11410     ptr = RSTRING_PTR(str);
11411     len = RSTRING_LEN(str);
11412     if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11413         !sym_printable(ptr, ptr + len, enc)) {
11414         return rb_str_escape(str);
11415     }
11416     return str;
11417 }
11418
11419 MJIT_FUNC_EXPORTED VALUE
11420 rb_id_quote_unprintable(ID id)
11421 {
11422     VALUE str = rb_id2str(id);
11423     if (!rb_str_symname_p(str)) {
11424         return rb_str_escape(str);
11425     }
11426     return str;
11427 }
11428
11429 /*
11430  *  call-seq:
11431  *     sym.inspect    -> string
11432  *
11433  *  Returns the representation of <i>sym</i> as a symbol literal.
11434  *
11435  *     :fred.inspect   #=> ":fred"
11436  */
11437
11438 static VALUE
11439 sym_inspect(VALUE sym)
11440 {
11441     VALUE str = rb_sym2str(sym);
11442     const char *ptr;
11443     long len;
11444     char *dest;
11445
11446     if (!rb_str_symname_p(str)) {
11447         str = rb_str_inspect(str);
11448         len = RSTRING_LEN(str);
11449         rb_str_resize(str, len + 1);
11450         dest = RSTRING_PTR(str);
11451         memmove(dest + 1, dest, len);
11452     }
11453     else {
11454         rb_encoding *enc = STR_ENC_GET(str);
11455         RSTRING_GETMEM(str, ptr, len);
11456         str = rb_enc_str_new(0, len + 1, enc);
11457         dest = RSTRING_PTR(str);
11458         memcpy(dest + 1, ptr, len);
11459     }
11460     dest[0] = ':';
11461     return str;
11462 }
11463
11464 #if 0 /* for RDoc */
11465 /*
11466  *  call-seq:
11467  *     sym.name   -> string
11468  *
11469  *  Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11470  *  returned string is frozen.
11471  *
11472  *     :fred.name         #=> "fred"
11473  *     :fred.name.frozen? #=> true
11474  *     :fred.to_s         #=> "fred"
11475  *     :fred.to_s.frozen? #=> false
11476  */
11477 VALUE
11478 rb_sym2str(VALUE sym)
11479 {
11480
11481 }
11482 #endif
11483
11484
11485 /*
11486  *  call-seq:
11487  *     sym.id2name   -> string
11488  *     sym.to_s      -> string
11489  *
11490  *  Returns the name or string corresponding to <i>sym</i>.
11491  *
11492  *     :fred.id2name   #=> "fred"
11493  *     :ginger.to_s    #=> "ginger"
11494  *
11495  *  Note that this string is not frozen (unlike the symbol itself).
11496  *  To get a frozen string, use #name.
11497  */
11498
11499
11500 VALUE
11501 rb_sym_to_s(VALUE sym)
11502 {
11503     return str_new_shared(rb_cString, rb_sym2str(sym));
11504 }
11505
11506
11507 /*
11508  * call-seq:
11509  *   sym.to_sym   -> sym
11510  *   sym.intern   -> sym
11511  *
11512  * In general, <code>to_sym</code> returns the Symbol corresponding
11513  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
11514  * in this case.
11515  */
11516
11517 static VALUE
11518 sym_to_sym(VALUE sym)
11519 {
11520     return sym;
11521 }
11522
11523 MJIT_FUNC_EXPORTED VALUE
11524 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11525 {
11526     VALUE obj;
11527
11528     if (argc < 1) {
11529         rb_raise(rb_eArgError, "no receiver given");
11530     }
11531     obj = argv[0];
11532     return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11533 }
11534
11535 #if 0
11536 /*
11537  * call-seq:
11538  *   sym.to_proc
11539  *
11540  * Returns a _Proc_ object which responds to the given method by _sym_.
11541  *
11542  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
11543  */
11544
11545 VALUE
11546 rb_sym_to_proc(VALUE sym)
11547 {
11548 }
11549 #endif
11550
11551 /*
11552  * call-seq:
11553  *
11554  *   sym.succ
11555  *
11556  * Same as <code>sym.to_s.succ.intern</code>.
11557  */
11558
11559 static VALUE
11560 sym_succ(VALUE sym)
11561 {
11562     return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11563 }
11564
11565 /*
11566  * call-seq:
11567  *
11568  *   symbol <=> other_symbol       -> -1, 0, +1, or nil
11569  *
11570  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11571  * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11572  * less than, equal to, or greater than +other_symbol+.
11573  *
11574  * +nil+ is returned if the two values are incomparable.
11575  *
11576  * See String#<=> for more information.
11577  */
11578
11579 static VALUE
11580 sym_cmp(VALUE sym, VALUE other)
11581 {
11582     if (!SYMBOL_P(other)) {
11583         return Qnil;
11584     }
11585     return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11586 }
11587
11588 /*
11589  *  call-seq:
11590  *    casecmp(other_symbol) -> -1, 0, 1, or nil
11591  *
11592  *  Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11593  *
11594  *    :aBcDeF.casecmp(:abcde)   # => 1
11595  *    :aBcDeF.casecmp(:abcdef)  # => 0
11596  *    :aBcDeF.casecmp(:abcdefg) # => -1
11597  *    :abcdef.casecmp(:ABCDEF)  # => 0
11598  *
11599  *  Returns +nil+ if the two symbols have incompatible encodings,
11600  *  or if +other_symbol+ is not a symbol:
11601  *
11602  *    sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11603  *    other_sym = :"\u{c4 d6 dc}"
11604  *    sym.casecmp(other_sym) # => nil
11605  *    :foo.casecmp(2)        # => nil
11606  *
11607  *  Currently, case-insensitivity only works on characters A-Z/a-z,
11608  *  not all of Unicode. This is different from Symbol#casecmp?.
11609  *
11610  *  Related: Symbol#casecmp?.
11611  *
11612  */
11613
11614 static VALUE
11615 sym_casecmp(VALUE sym, VALUE other)
11616 {
11617     if (!SYMBOL_P(other)) {
11618         return Qnil;
11619     }
11620     return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11621 }
11622
11623 /*
11624  *  call-seq:
11625  *    casecmp?(other_symbol) -> true, false, or nil
11626  *
11627  *  Returns +true+ if +sym+ and +other_symbol+ are equal after
11628  *  Unicode case folding, +false+ if they are not equal:
11629  *
11630  *    :aBcDeF.casecmp?(:abcde)                  # => false
11631  *    :aBcDeF.casecmp?(:abcdef)                 # => true
11632  *    :aBcDeF.casecmp?(:abcdefg)                # => false
11633  *    :abcdef.casecmp?(:ABCDEF)                 # => true
11634  *    :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11635  *
11636  *  Returns +nil+ if the two symbols have incompatible encodings,
11637  *  or if +other_symbol+ is not a symbol:
11638  *
11639  *    sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11640  *    other_sym = :"\u{c4 d6 dc}"
11641  *    sym.casecmp?(other_sym) # => nil
11642  *    :foo.casecmp?(2)        # => nil
11643  *
11644  *  See {Case Mapping}[doc/case_mapping_rdoc.html].
11645  *
11646  *  Related: Symbol#casecmp.
11647  *
11648  */
11649
11650 static VALUE
11651 sym_casecmp_p(VALUE sym, VALUE other)
11652 {
11653     if (!SYMBOL_P(other)) {
11654         return Qnil;
11655     }
11656     return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11657 }
11658
11659 /*
11660  * call-seq:
11661  *   sym =~ obj   -> integer or nil
11662  *
11663  * Returns <code>sym.to_s =~ obj</code>.
11664  */
11665
11666 static VALUE
11667 sym_match(VALUE sym, VALUE other)
11668 {
11669     return rb_str_match(rb_sym2str(sym), other);
11670 }
11671
11672 /*
11673  * call-seq:
11674  *   sym.match(pattern)        -> matchdata or nil
11675  *   sym.match(pattern, pos)   -> matchdata or nil
11676  *
11677  * Returns <code>sym.to_s.match</code>.
11678  */
11679
11680 static VALUE
11681 sym_match_m(int argc, VALUE *argv, VALUE sym)
11682 {
11683     return rb_str_match_m(argc, argv, rb_sym2str(sym));
11684 }
11685
11686 /*
11687  * call-seq:
11688  *   sym.match?(pattern)        -> true or false
11689  *   sym.match?(pattern, pos)   -> true or false
11690  *
11691  * Returns <code>sym.to_s.match?</code>.
11692  */
11693
11694 static VALUE
11695 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11696 {
11697     return rb_str_match_m_p(argc, argv, sym);
11698 }
11699
11700 /*
11701  * call-seq:
11702  *   sym[idx]      -> char
11703  *   sym[b, n]     -> string
11704  *   sym.slice(idx)      -> char
11705  *   sym.slice(b, n)     -> string
11706  *
11707  * Returns <code>sym.to_s[]</code>.
11708  */
11709
11710 static VALUE
11711 sym_aref(int argc, VALUE *argv, VALUE sym)
11712 {
11713     return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11714 }
11715
11716 /*
11717  * call-seq:
11718  *   sym.length   -> integer
11719  *   sym.size     -> integer
11720  *
11721  * Same as <code>sym.to_s.length</code>.
11722  */
11723
11724 static VALUE
11725 sym_length(VALUE sym)
11726 {
11727     return rb_str_length(rb_sym2str(sym));
11728 }
11729
11730 /*
11731  * call-seq:
11732  *   sym.empty?   -> true or false
11733  *
11734  * Returns whether _sym_ is :"" or not.
11735  */
11736
11737 static VALUE
11738 sym_empty(VALUE sym)
11739 {
11740     return rb_str_empty(rb_sym2str(sym));
11741 }
11742
11743 /*
11744  *  call-seq:
11745  *    upcase(*options) -> symbol
11746  *
11747  *  Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11748  *
11749  *  See String#upcase.
11750  *
11751  */
11752
11753 static VALUE
11754 sym_upcase(int argc, VALUE *argv, VALUE sym)
11755 {
11756     return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11757 }
11758
11759 /*
11760  *  call-seq:
11761  *    downcase(*options) -> symbol
11762  *
11763  *  Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11764  *
11765  *  See String#downcase.
11766  *
11767  *  Related: Symbol#upcase.
11768  *
11769  */
11770
11771 static VALUE
11772 sym_downcase(int argc, VALUE *argv, VALUE sym)
11773 {
11774     return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11775 }
11776
11777 /*
11778  *  call-seq:
11779  *    capitalize(*options) -> symbol
11780  *
11781  *  Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11782  *
11783  *  See String#capitalize.
11784  *
11785  */
11786
11787 static VALUE
11788 sym_capitalize(int argc, VALUE *argv, VALUE sym)
11789 {
11790     return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11791 }
11792
11793 /*
11794  *  call-seq:
11795  *    swapcase(*options) -> symbol
11796  *
11797  *  Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11798  *
11799  *  See String#swapcase.
11800  *
11801  */
11802
11803 static VALUE
11804 sym_swapcase(int argc, VALUE *argv, VALUE sym)
11805 {
11806     return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11807 }
11808
11809 /*
11810  *  call-seq:
11811  *     sym.start_with?([prefixes]+)   -> true or false
11812  *
11813  *  Returns true if +sym+ starts with one of the +prefixes+ given.
11814  *  Each of the +prefixes+ should be a String or a Regexp.
11815  *
11816  *    :hello.start_with?("hell")               #=> true
11817  *    :hello.start_with?(/H/i)                 #=> true
11818  *
11819  *    # returns true if one of the prefixes matches.
11820  *    :hello.start_with?("heaven", "hell")     #=> true
11821  *    :hello.start_with?("heaven", "paradise") #=> false
11822  */
11823
11824 static VALUE
11825 sym_start_with(int argc, VALUE *argv, VALUE sym)
11826 {
11827     return rb_str_start_with(argc, argv, rb_sym2str(sym));
11828 }
11829
11830 /*
11831  *  call-seq:
11832  *     sym.end_with?([suffixes]+)   -> true or false
11833  *
11834  *  Returns true if +sym+ ends with one of the +suffixes+ given.
11835  *
11836  *    :hello.end_with?("ello")               #=> true
11837  *
11838  *    # returns true if one of the +suffixes+ matches.
11839  *    :hello.end_with?("heaven", "ello")     #=> true
11840  *    :hello.end_with?("heaven", "paradise") #=> false
11841  */
11842
11843 static VALUE
11844 sym_end_with(int argc, VALUE *argv, VALUE sym)
11845 {
11846     return rb_str_end_with(argc, argv, rb_sym2str(sym));
11847 }
11848
11849 /*
11850  * call-seq:
11851  *   sym.encoding   -> encoding
11852  *
11853  * Returns the Encoding object that represents the encoding of _sym_.
11854  */
11855
11856 static VALUE
11857 sym_encoding(VALUE sym)
11858 {
11859     return rb_obj_encoding(rb_sym2str(sym));
11860 }
11861
11862 static VALUE
11863 string_for_symbol(VALUE name)
11864 {
11865     if (!RB_TYPE_P(name, T_STRING)) {
11866         VALUE tmp = rb_check_string_type(name);
11867         if (NIL_P(tmp)) {
11868             rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11869                      name);
11870         }
11871         name = tmp;
11872     }
11873     return name;
11874 }
11875
11876 ID
11877 rb_to_id(VALUE name)
11878 {
11879     if (SYMBOL_P(name)) {
11880         return SYM2ID(name);
11881     }
11882     name = string_for_symbol(name);
11883     return rb_intern_str(name);
11884 }
11885
11886 VALUE
11887 rb_to_symbol(VALUE name)
11888 {
11889     if (SYMBOL_P(name)) {
11890         return name;
11891     }
11892     name = string_for_symbol(name);
11893     return rb_str_intern(name);
11894 }
11895
11896 /*
11897  *  call-seq:
11898  *     Symbol.all_symbols    => array
11899  *
11900  *  Returns an array of all the symbols currently in Ruby's symbol
11901  *  table.
11902  *
11903  *     Symbol.all_symbols.size    #=> 903
11904  *     Symbol.all_symbols[1,20]   #=> [:floor, :ARGV, :Binding, :symlink,
11905  *                                     :chown, :EOFError, :$;, :String,
11906  *                                     :LOCK_SH, :"setuid?", :$<,
11907  *                                     :default_proc, :compact, :extend,
11908  *                                     :Tms, :getwd, :$=, :ThreadGroup,
11909  *                                     :wait2, :$>]
11910  */
11911
11912 static VALUE
11913 sym_all_symbols(VALUE _)
11914 {
11915     return rb_sym_all_symbols();
11916 }
11917
11918 VALUE
11919 rb_str_to_interned_str(VALUE str)
11920 {
11921     return rb_fstring(str);
11922 }
11923
11924 VALUE
11925 rb_interned_str(const char *ptr, long len)
11926 {
11927     struct RString fake_str;
11928     return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11929 }
11930
11931 VALUE
11932 rb_interned_str_cstr(const char *ptr)
11933 {
11934     return rb_interned_str(ptr, strlen(ptr));
11935 }
11936
11937 VALUE
11938 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11939 {
11940     if (UNLIKELY(rb_enc_autoload_p(enc))) {
11941         rb_enc_autoload(enc);
11942     }
11943
11944     struct RString fake_str;
11945     return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11946 }
11947
11948 VALUE
11949 rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
11950 {
11951     return rb_enc_interned_str(ptr, strlen(ptr), enc);
11952 }
11953
11954 /*
11955  *  A \String object has an arbitrary sequence of bytes,
11956  *  typically representing text or binary data.
11957  *  A \String object may be created using String::new or as literals.
11958  *
11959  *  String objects differ from Symbol objects in that Symbol objects are
11960  *  designed to be used as identifiers, instead of text or data.
11961  *
11962  *  You can create a \String object explicitly with:
11963  *
11964  *  - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11965  *  - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11966  *
11967  *  You can convert certain objects to Strings with:
11968  *
11969  *  - \Method {String}[Kernel.html#method-i-String].
11970  *
11971  *  Some \String methods modify +self+.
11972  *  Typically, a method whose name ends with <tt>!</tt> modifies +self+
11973  *  and returns +self+;
11974  *  often a similarly named method (without the <tt>!</tt>)
11975  *  returns a new string.
11976  *
11977  *  In general, if both bang and non-bang versions of a method exist,
11978  *  the bang version mutates +self+ and the non-bang version does not.
11979  *  However, a method without a bang can also mutate, such as String#replace.
11980  *
11981  *  == Substitution Methods
11982  *
11983  *  These methods perform substitutions:
11984  *
11985  *  - String#sub: One substitution (or none); returns a new string.
11986  *  - String#sub!: One substitution (or none); returns +self+.
11987  *  - String#gsub: Zero or more substitutions; returns a new string.
11988  *  - String#gsub!: Zero or more substitutions; returns +self+.
11989  *
11990  *  Each of these methods takes:
11991  *
11992  *  - A first argument, +pattern+ (string or regexp),
11993  *    that specifies the substring(s) to be replaced.
11994  *
11995  *  - Either of these:
11996  *
11997  *    - A second argument, +replacement+ (string or hash),
11998  *      that determines the replacing string.
11999  *    - A block that will determine the replacing string.
12000  *
12001  *  The examples in this section mostly use methods String#sub and String#gsub;
12002  *  the principles illustrated apply to all four substitution methods.
12003  *
12004  *  <b>Argument +pattern+</b>
12005  *
12006  *  Argument +pattern+ is commonly a regular expression:
12007  *
12008  *    s = 'hello'
12009  *    s.sub(/[aeiou]/, '*')  # => "h*llo"
12010  *    s.gsub(/[aeiou]/, '*') # => "h*ll*"
12011  *    s.gsub(/[aeiou]/, '')  # => "hll"
12012  *    s.sub(/ell/, 'al')     # => "halo"
12013  *    s.gsub(/xyzzy/, '*')   # => "hello"
12014  *    'THX1138'.gsub(/\d+/, '00') # => "THX00"
12015  *
12016  *  When +pattern+ is a string, all its characters are treated
12017  *  as ordinary characters (not as regexp special characters):
12018  *
12019  *    'THX1138'.gsub('\d+', '00') # => "THX1138"
12020  *
12021  *  <b>\String +replacement+</b>
12022  *
12023  *  If +replacement+ is a string, that string will determine
12024  *  the replacing string that is to be substituted for the matched text.
12025  *
12026  *  Each of the examples above uses a simple string as the replacing string.
12027  *
12028  *  \String +replacement+ may contain back-references to the pattern's captures:
12029  *
12030  *  - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12031  *  - <tt>\k<name></tt> refers to the named capture +name+.
12032  *
12033  *  See rdoc-ref:regexp.rdoc for details.
12034  *
12035  *  Note that within the string +replacement+, a character combination
12036  *  such as <tt>$&</tt> is treated as ordinary text, and not as
12037  *  a special match variable.
12038  *  However, you may refer to some special match variables using these
12039  *  combinations:
12040  *
12041  *  - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12042  *    which contains the complete matched text.
12043  *  - <tt>\'</tt> corresponds to <tt>$'</tt>,
12044  *    which contains string after match.
12045  *  - <tt>\`</tt> corresponds to <tt>$`</tt>,
12046  *    which contains string before match.
12047  *  - <tt>\+</tt> corresponds to <tt>$+</tt>,
12048  *    which contains last capture group.
12049  *
12050  *  See rdoc-ref:regexp.rdoc for details.
12051  *
12052  *  Note that <tt>\\\\</tt> is interpreted as an escape, i.e., a single backslash.
12053  *
12054  *  Note also that a string literal consumes backslashes.
12055  *  See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12056  *
12057  *  A back-reference is typically preceded by an additional backslash.
12058  *  For example, if you want to write a back-reference <tt>\&</tt> in
12059  *  +replacement+ with a double-quoted string literal, you need to write
12060  *  <tt>"..\\\\&.."</tt>.
12061  *
12062  *  If you want to write a non-back-reference string <tt>\&</tt> in
12063  *  +replacement+, you need first to escape the backslash to prevent
12064  *  this method from interpreting it as a back-reference, and then you
12065  *  need to escape the backslashes again to prevent a string literal from
12066  *  consuming them: <tt>"..\\\\\\\\&.."</tt>.
12067  *
12068  *  You may want to use the block form to avoid a lot of backslashes.
12069  *
12070  *  <b>\Hash +replacement+</b>
12071  *
12072  *  If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12073  *  the replacing string is the value for that key:
12074  *
12075  *    h = {'foo' => 'bar', 'baz' => 'bat'}
12076  *    'food'.sub('foo', h) # => "bard"
12077  *
12078  *  Note that a symbol key does not match:
12079  *
12080  *    h = {foo: 'bar', baz: 'bat'}
12081  *    'food'.sub('foo', h) # => "d"
12082  *
12083  *  <b>Block</b>
12084  *
12085  *  In the block form, the current match string is passed to the block;
12086  *  the block's return value becomes the replacing string:
12087  *
12088  *    s = '@'
12089  *    '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12090  *
12091  *  Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12092  *  <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12093  *
12094  *
12095  *  == What's Here
12096  *
12097  *  First, what's elsewhere. \Class \String:
12098  *
12099  *  - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12100  *  - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12101  *
12102  *  Here, class \String provides methods that are useful for:
12103  *
12104  *  - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12105  *  - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12106  *  - {Querying}[#class-String-label-Methods+for+Querying]
12107  *  - {Comparing}[#class-String-label-Methods+for+Comparing]
12108  *  - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12109  *  - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12110  *  - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12111  *  - {Iterating}[#class-String-label-Methods+for+Iterating]
12112  *
12113  *  === Methods for Creating a \String
12114  *
12115  *  - ::new:: Returns a new string.
12116  *  - ::try_convert:: Returns a new string created from a given object.
12117  *
12118  *  === Methods for a Frozen/Unfrozen String
12119  *
12120  *  - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12121  *                                  +self+, if not frozen; +self.dup+ otherwise.
12122  *  - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12123  *                                  +self+, if already frozen; +self.freeze+ otherwise.
12124  *  - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12125  *
12126  *  === Methods for Querying
12127  *
12128  *  _Counts_
12129  *
12130  *  - #length, #size:: Returns the count of characters (not bytes).
12131  *  - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12132  *  - #bytesize:: Returns the count of bytes.
12133  *  - #count:: Returns the count of substrings matching given strings.
12134  *
12135  *  _Substrings_
12136  *
12137  *  - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12138  *                           returns +nil+ if no match is found.
12139  *  - #index:: Returns the index of the _first_ occurrence of a given substring;
12140  *             returns +nil+ if none found.
12141  *  - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12142  *              returns +nil+ if none found.
12143  *  - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12144  *  - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12145  *  - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12146  *  - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12147  *  - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12148  *
12149  *  _Encodings_
12150  *
12151  *  - #encoding:: Returns the Encoding object that represents the encoding of the string.
12152  *  - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12153  *  - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12154  *                       for its encoding.
12155  *  - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12156  *
12157  *  _Other_
12158  *
12159  *  - #sum:: Returns a basic checksum for the string: the sum of each byte.
12160  *  - #hash:: Returns the integer hash code.
12161  *
12162  *  === Methods for Comparing
12163  *
12164  *  - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12165  *  - #eql?:: Returns +true+ if the content is the same as the given other string.
12166  *  - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as +self+ is smaller than, equal to, or larger than a given other string.
12167  *  - #casecmp:: Ignoring case, returns -1, 0, or 1 as +self+
12168  *               is smaller than, equal to, or larger than a given other string.
12169  *  - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12170  *                +false+ otherwise.
12171  *
12172  *  === Methods for Modifying a \String
12173  *
12174  *  Each of these methods modifies +self+.
12175  *
12176  *  _Insertion_
12177  *
12178  *  - #insert:: Returns +self+ with a given string inserted at a given offset.
12179  *  - #<<:: Returns +self+ concatenated with a given string or integer.
12180  *
12181  *  _Substitution_
12182  *
12183  *  - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12184  *            returns +self+ if any changes, +nil+ otherwise.
12185  *  - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12186  *             returns +self+ if any changes, +nil+ otherwise.
12187  *  - #succ!, #next!:: Returns +self+ modified to become its own successor.
12188  *  - #replace:: Returns +self+ with its entire content replaced by a given string.
12189  *  - #reverse!:: Returns +self+ with its characters in reverse order.
12190  *  - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12191  *  - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12192  *           returns +self+ if any changes, +nil+ otherwise.
12193  *  - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12194  *             removing duplicates from the substrings that were modified;
12195  *             returns +self+ if any changes, +nil+ otherwise.
12196  *
12197  *  _Casing_
12198  *
12199  *  - #capitalize!:: Upcases the initial character and downcases all others;
12200  *                   returns +self+ if any changes, +nil+ otherwise.
12201  *  - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12202  *  - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12203  *  - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12204  *                 returns +self+ if any changes, +nil+ otherwise.
12205  *
12206  *  _Encoding_
12207  *
12208  *  - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12209  *  - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12210  *  - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12211  *  - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12212  *
12213  *  _Deletion_
12214  *
12215  *  - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12216  *  - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12217  *  - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12218  *  - #delete!:: Removes characters as determined by the intersection of substring arguments.
12219  *  - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12220  *  - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12221  *  - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12222  *  - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12223  *  - #chop!:: Removes trailing newline characters if found; otherwise removes the last character;
12224  *             returns +self+ if any changes, +nil+ otherwise.
12225  *
12226  *  === Methods for Converting to New \String
12227  *
12228  *  Each of these methods returns a new \String based on +self+,
12229  *  often just a modified copy of +self+.
12230  *
12231  *  _Extension_
12232  *
12233  *  - #*:: Returns the concatenation of multiple copies of +self+.
12234  *  - #+:: Returns the concatenation of +self+ and a given other string.
12235  *  - #center:: Returns a copy of +self+ centered between pad substring.
12236  *  - #concat:: Returns the concatenation of +self+ with given other strings.
12237  *  - #prepend:: Returns the concatenation of a given other string with +self+.
12238  *  - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12239  *  - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12240  *
12241  *  _Encoding_
12242  *
12243  *  - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12244  *  - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12245  *  - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12246  *  - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12247  *
12248  *  _Substitution_
12249  *
12250  *  - #dump:: Returns a copy of +self+ with all non-printing characters replaced by <tt>\xHH</tt> notation
12251  *            and all special characters escaped.
12252  *  - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12253  *              and all escaped characters unescaped.
12254  *  - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12255  *           replaced with a given replacement string.
12256  *  - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12257  *            replaced with a given replacement string.
12258  *  - #succ, #next:: Returns the string that is the successor to +self+.
12259  *  - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12260  *  - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12261  *  - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12262  *            removing duplicates from the substrings that were modified.
12263  *  - #%:: Returns the string resulting from formatting a given object into +self+.
12264  *
12265  *  _Casing_
12266  *
12267  *  - #capitalize:: Returns a copy of +self+ with the first character upcased
12268  *                  and all other characters downcased.
12269  *  - #downcase:: Returns a copy of +self+ with all characters downcased.
12270  *  - #upcase:: Returns a copy of +self+ with all characters upcased.
12271  *  - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12272  *                and all downcase characters upcased.
12273  *
12274  *  _Deletion_
12275  *
12276  *  - #delete:: Returns a copy of +self+ with characters removed.
12277  *  - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12278  *  - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12279  *  - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12280  *  - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12281  *  - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12282  *  - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12283  *  - #chop:: Returns a copy of +self+ with trailing newline characters or the last character removed.
12284  *  - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12285  *  - #[], #slice:: Returns a substring determined by a given index, start/length, range, or string.
12286  *  - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12287  *  - #chr:: Returns the first character.
12288  *
12289  *  _Duplication_
12290  *
12291  *  - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12292  *                     otherwise, returns +self+.
12293  *
12294  *  === Methods for Converting to Non-\String
12295  *
12296  *  Each of these methods converts the contents of +self+ to a non-\String.
12297  *
12298  *  <em>Characters, Bytes, and Clusters</em>
12299  *
12300  *  - #bytes:: Returns an array of the bytes in +self+.
12301  *  - #chars:: Returns an array of the characters in +self+.
12302  *  - #codepoints:: Returns an array of the integer ordinals in +self+.
12303  *  - #getbyte:: Returns an integer byte as determined by a given index.
12304  *  - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12305  *
12306  *  _Splitting_
12307  *
12308  *  - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12309  *  - #partition:: Returns a 3-element array determined by the first substring that matches
12310  *                 a given substring or regexp.
12311  *  - #rpartition:: Returns a 3-element array determined by the last substring that matches
12312  *                  a given substring or regexp.
12313  *  - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12314  *             or, if a block given, passes those substrings to the block.
12315  *
12316  *  _Matching_
12317  *
12318  *  - #scan:: Returns an array of substrings matching a given regexp or string, or,
12319  *            if a block given, passes each matching substring to the block.
12320  *  - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12321  *  - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12322  *
12323  *  _Numerics_
12324  *
12325  *  - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12326  *  - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12327  *  - #ord:: Returns the integer ordinal of the first character in +self+.
12328  *  - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12329  *  - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12330  *
12331  *  <em>Strings and Symbols</em>
12332  *
12333  *  - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12334  *  - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12335  *
12336  *  === Methods for Iterating
12337  *
12338  *  - #each_byte:: Calls the given block with each successive byte in +self+.
12339  *  - #each_char:: Calls the given block with each successive character in +self+.
12340  *  - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12341  *  - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12342  *  - #each_line:: Calls the given block with each successive line in +self+,
12343  *                 as determined by a given record separator.
12344  *  - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12345  */
12346
12347 void
12348 Init_String(void)
12349 {
12350     rb_cString  = rb_define_class("String", rb_cObject);
12351     assert(rb_vm_fstring_table());
12352     st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12353     rb_include_module(rb_cString, rb_mComparable);
12354     rb_define_alloc_func(rb_cString, empty_str_alloc);
12355     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12356     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12357     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12358     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12359     rb_define_method(rb_cString, "==", rb_str_equal, 1);
12360     rb_define_method(rb_cString, "===", rb_str_equal, 1);
12361     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12362     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12363     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12364     rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12365     rb_define_method(rb_cString, "+", rb_str_plus, 1);
12366     rb_define_method(rb_cString, "*", rb_str_times, 1);
12367     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12368     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12369     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12370     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12371     rb_define_method(rb_cString, "length", rb_str_length, 0);
12372     rb_define_method(rb_cString, "size", rb_str_length, 0);
12373     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12374     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12375     rb_define_method(rb_cString, "=~", rb_str_match, 1);
12376     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12377     rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12378     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
12379     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12380     rb_define_method(rb_cString, "next", rb_str_succ, 0);
12381     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12382     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12383     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12384     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12385     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12386     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12387     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12388     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12389     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12390     rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12391     rb_define_method(rb_cString, "scrub", str_scrub, -1);
12392     rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12393     rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12394     rb_define_method(rb_cString, "+@", str_uplus, 0);
12395     rb_define_method(rb_cString, "-@", str_uminus, 0);
12396
12397     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12398     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12399     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12400     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12401     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12402     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
12403     rb_define_method(rb_cString, "undump", str_undump, 0);
12404
12405     sym_ascii      = ID2SYM(rb_intern_const("ascii"));
12406     sym_turkic     = ID2SYM(rb_intern_const("turkic"));
12407     sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12408     sym_fold       = ID2SYM(rb_intern_const("fold"));
12409
12410     rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12411     rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12412     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12413     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12414
12415     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12416     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12417     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12418     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12419
12420     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12421     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12422     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12423     rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12424     rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12425     rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12426     rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12427     rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12428     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12429     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12430     rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12431     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12432     rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12433     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12434     rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12435     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12436     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12437
12438     rb_define_method(rb_cString, "include?", rb_str_include, 1);
12439     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12440     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12441
12442     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12443
12444     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12445     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12446     rb_define_method(rb_cString, "center", rb_str_center, -1);
12447
12448     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12449     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12450     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12451     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12452     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12453     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12454     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12455     rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12456     rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12457
12458     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12459     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12460     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12461     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12462     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12463     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12464     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12465     rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12466     rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12467
12468     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12469     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12470     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12471     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12472     rb_define_method(rb_cString, "count", rb_str_count, -1);
12473
12474     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12475     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12476     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12477     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12478
12479     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12480     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12481     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12482     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12483     rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12484
12485     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12486
12487     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12488     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12489
12490     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12491     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12492
12493     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12494     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12495     rb_define_method(rb_cString, "b", rb_str_b, 0);
12496     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12497     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12498
12499     /* define UnicodeNormalize module here so that we don't have to look it up */
12500     mUnicodeNormalize          = rb_define_module("UnicodeNormalize");
12501     id_normalize               = rb_intern_const("normalize");
12502     id_normalized_p            = rb_intern_const("normalized?");
12503
12504     rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12505     rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12506     rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12507
12508     rb_fs = Qnil;
12509     rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12510     rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12511     rb_gc_register_address(&rb_fs);
12512
12513     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12514     rb_include_module(rb_cSymbol, rb_mComparable);
12515     rb_undef_alloc_func(rb_cSymbol);
12516     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
12517     rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12518
12519     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12520     rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12521     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12522     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
12523     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12524     rb_define_method(rb_cSymbol, "name", rb_sym2str, 0);
12525     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
12526     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
12527     rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
12528     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12529     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12530
12531     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12532     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12533     rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12534     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12535
12536     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12537     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12538     rb_define_method(rb_cSymbol, "length", sym_length, 0);
12539     rb_define_method(rb_cSymbol, "size", sym_length, 0);
12540     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12541     rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12542     rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12543
12544     rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12545     rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12546     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12547     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12548
12549     rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12550     rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12551
12552     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12553 }