string.c

   1 /**********************************************************************
   2
   3   string.c -
   4
   5   $Author$
   6   created at: Mon Aug  9 17:12:58 JST 1993
   7
   8   Copyright (C) 1993-2007 Yukihiro Matsumoto
   9   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
  10   Copyright (C) 2000  Information-technology Promotion Agency, Japan
  11
  12 **********************************************************************/
  13
  14 #include "ruby/internal/config.h"
  15
  16 #include <ctype.h>
  17 #include <errno.h>
  18 #include <math.h>
  19
  20 #ifdef HAVE_UNISTD_H
  21 # include <unistd.h>
  22 #endif
  23
  24 #include "debug_counter.h"
  25 #include "encindex.h"
  26 #include "id.h"
  27 #include "internal.h"
  28 #include "internal/array.h"
  29 #include "internal/compar.h"
  30 #include "internal/compilers.h"
  31 #include "internal/encoding.h"
  32 #include "internal/error.h"
  33 #include "internal/gc.h"
  34 #include "internal/numeric.h"
  35 #include "internal/object.h"
  36 #include "internal/proc.h"
  37 #include "internal/re.h"
  38 #include "internal/sanitizers.h"
  39 #include "internal/string.h"
  40 #include "internal/transcode.h"
  41 #include "probes.h"
  42 #include "ruby/encoding.h"
  43 #include "ruby/re.h"
  44 #include "ruby/util.h"
  45 #include "ruby_assert.h"
  46 #include "vm_sync.h"
  47
  48 #if defined HAVE_CRYPT_R
  49 # if defined HAVE_CRYPT_H
  50 #  include <crypt.h>
  51 # endif
  52 #elif !defined HAVE_CRYPT
  53 # include "missing/crypt.h"
  54 # define HAVE_CRYPT_R 1
  55 #endif
  56
  /* Shorthand for entry `no` of the `beg` / `end` arrays reached through a
   * pointer named `regs`, which must be in scope wherever these are used. */
  57 #define BEG(no) (regs->beg[(no)])
  58 #define END(no) (regs->end[(no)])
  59
  /* Remove any macro definitions of these names (presumably supplied by the
   * included headers -- TODO confirm) so that the identifiers can be used as
   * ordinary function names in this file. */
  60 #undef rb_str_new
  61 #undef rb_usascii_str_new
  62 #undef rb_utf8_str_new
  63 #undef rb_enc_str_new
  64 #undef rb_str_new_cstr
  65 #undef rb_usascii_str_new_cstr
  66 #undef rb_utf8_str_new_cstr
  67 #undef rb_enc_str_new_cstr
  68 #undef rb_external_str_new_cstr
  69 #undef rb_locale_str_new_cstr
  70 #undef rb_str_dup_frozen
  71 #undef rb_str_buf_new_cstr
  72 #undef rb_str_buf_cat
  73 #undef rb_str_buf_cat2
  74 #undef rb_str_cat2
  75 #undef rb_str_cat_cstr
  76 #undef rb_fstring_cstr
  77
  /* Global VALUEs for the String and Symbol classes.  rb_cString is the class
   * handed to the string allocators in this file; where the two variables are
   * assigned is not shown in this part of the file. */
  78 VALUE rb_cString;
  79 VALUE rb_cSymbol;
  80
  81 /* Flags of RString
  82  *
  83  * 1:     RSTRING_NOEMBED
  84  *            The string is not embedded. When a string is embedded, the contents
  85  *            follow the header. When a string is not embedded, the contents are
  86  *            in a separately allocated buffer.
  87  * 2:     STR_SHARED (equal to ELTS_SHARED)
  88  *            The string is shared. The buffer this string points to is owned by
  89  *            another string (the shared root).
  90  * 3:     STR_CHILLED (will be frozen in a future version)
  91  *            The string appears frozen but can be mutated with a warning.
  92  * 5:     STR_SHARED_ROOT
  93  *            Other strings may point to the contents of this string. When this
  94  *            flag is set, STR_SHARED must not be set.
  95  * 6:     STR_BORROWED
  96  *            When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
  97  *            to be unshared by rb_str_tmp_frozen_release.
  98  * 7:     STR_TMPLOCK
  99  *            The pointer to the buffer is passed to a system call such as
 100  *            read(2). Any modification and realloc is prohibited.
 101  * 8-9:   ENC_CODERANGE
 102  *            Stores the coderange of the string.
 103  * 10-16: ENCODING
 104  *            Stores the encoding of the string.
 105  * 17:    RSTRING_FSTR
 106  *            The string is a fstring. The string is deduplicated in the fstring
 107  *            table.
 108  * 18:    STR_NOFREE
 109  *            Do not free this string's buffer when the string is reclaimed
 110  *            by the garbage collector. Used for when the string buffer is a C
 111  *            string literal.
 112  * 19:    STR_FAKESTR
 113  *            The string is not allocated or managed by the garbage collector.
 114  *            Typically, the string object header (struct RString) is temporarily
 115  *            allocated on C stack.
 116  */
 117
 /* NOTE(review): presumably an upper bound on the byte length of a single
  * character; none of its users appear in this part of the file -- confirm. */
 118 #define RUBY_MAX_CHAR_LEN 16
 /* Flag bits described in the "Flags of RString" comment above. */
 119 #define STR_SHARED_ROOT FL_USER5
 120 #define STR_BORROWED FL_USER6
 121 #define STR_TMPLOCK FL_USER7
 122 #define STR_NOFREE FL_USER18
 123 #define STR_FAKESTR FL_USER19
 124
 /* Set the NOEMBED flag on str and clear the SHARED, SHARED_ROOT and BORROWED
  * flags.  Only flags are touched; the heap fields are left to the caller. */
 125 #define STR_SET_NOEMBED(str) do {\
 126     FL_SET((str), STR_NOEMBED);\
 127     FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
 128 } while (0)
 /* Clear NOEMBED together with the SHARED and NOFREE flags, which marks str as
  * embedded.  Only flags are touched. */
 129 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
 130
 /* Store n in the length field of str.  Only that field is written: the
  * buffer contents and the terminator are left untouched. */
 131 #define STR_SET_LEN(str, n) do { \
 132     RSTRING(str)->len = (n); \
 133 } while (0)
 134
 /* Report whether the encoding index stored inline in str's flags is
  * ASCII-8BIT, UTF-8 or US-ASCII.  TERM_LEN relies on this to assume a
  * one-byte terminator without looking up the encoding. */
 135 static inline bool
 136 str_enc_fastpath(VALUE str)
 137 {
 138     const int encidx = ENCODING_GET_INLINED(str);
 139
 140     /* Nearly every string uses one of these three encodings. */
 141     return encidx == ENCINDEX_ASCII_8BIT ||
 142            encidx == ENCINDEX_UTF_8 ||
 143            encidx == ENCINDEX_US_ASCII;
 144 }
 148
 /* Terminator width in bytes for str: 1 for the fast-path encodings, otherwise
  * the minimum character length of the string's encoding. */
 149 #define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
 /* Write termlen zero bytes starting at ptr.  The first byte is always stored
  * directly; memset is reached only for terminators wider than one byte.
  * Both arguments are evaluated exactly once. */
 150 #define TERM_FILL(ptr, termlen) do {\
 151     char *const term_fill_ptr = (ptr);\
 152     const int term_fill_len = (termlen);\
 153     *term_fill_ptr = '\0';\
 154     if (UNLIKELY(term_fill_len > 1))\
 155         memset(term_fill_ptr, 0, term_fill_len);\
 156 } while (0)
 157
 /* RESIZE_CAPA(str, capacity): RESIZE_CAPA_TERM using the terminator width of
  * str's current encoding. */
 158 #define RESIZE_CAPA(str,capacity) do {\
 159     const int termlen = TERM_LEN(str);\
 160     RESIZE_CAPA_TERM(str,capacity,termlen);\
 161 } while (0)
 /* RESIZE_CAPA_TERM(str, capacity, termlen): make room for `capacity` bytes
  * plus a `termlen`-byte terminator.
  *
  * Embedded string: nothing happens while the slot is large enough; otherwise
  * the current bytes are copied into a newly allocated heap buffer and str is
  * switched to NOEMBED.  The terminator is not written here.
  * Heap string: the buffer is reallocated in place; str must not be shared
  * (asserted).
  *
  * `str`, `capacity` and `termlen` are evaluated more than once, so pass
  * side-effect-free expressions.  Every use of `capacity` and `termlen` is
  * parenthesized so that operator expressions expand as intended. */
 162 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
 163     if (STR_EMBED_P(str)) {\
 164         if (str_embed_capa(str) < (capacity) + (termlen)) {\
 165             char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
 166             const long tlen = RSTRING_LEN(str);\
 167             memcpy(tmp, RSTRING_PTR(str), tlen);\
 168             RSTRING(str)->as.heap.ptr = tmp;\
 169             RSTRING(str)->len = tlen;\
 170             STR_SET_NOEMBED(str);\
 171             RSTRING(str)->as.heap.aux.capa = (capacity);\
 172         }\
 173     }\
 174     else {\
 175         RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
 176         SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
 177                         (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
 178         RSTRING(str)->as.heap.aux.capa = (capacity);\
 179     }\
 180 } while (0)
 181
 /* Record shared_str as the owner of the buffer str points into: stores
  * shared_str in str's aux.shared field through RB_OBJ_WRITE, sets STR_SHARED
  * on str and STR_SHARED_ROOT on shared_str, and additionally sets
  * STR_BORROWED on shared_str when its class field is 0.  The two assertions
  * require str's pointer to lie within shared_str's bytes (end inclusive).
  * Nothing is done when str carries STR_FAKESTR. */
 182 #define STR_SET_SHARED(str, shared_str) do { \
 183     if (!FL_TEST(str, STR_FAKESTR)) { \
 184         RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
 185         RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
 186         RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
 187         FL_SET((str), STR_SHARED); \
 188         FL_SET((shared_str), STR_SHARED_ROOT); \
 189         if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
 190             FL_SET_RAW((shared_str), STR_BORROWED); \
 191     } \
 192 } while (0)
 193
 /* The heap buffer pointer field of str (meaningful for non-embedded strings). */
 194 #define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
 /* Heap capacity plus terminator width, as a size_t.  RESIZE_CAPA_TERM passes
  * this as the old size to SIZED_REALLOC_N. */
 195 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
 196 /* TODO: include the terminator size in capa. */
 197
 /* Encoding object of str, obtained through get_encoding(). */
 198 #define STR_ENC_GET(str) get_encoding(str)
 199
 /* SHARABLE_SUBSTRING_P(beg, len, end): with the default setting
  * (SHARABLE_MIDDLE_SUBSTRING == 0) it is true only when beg + len == end,
  * i.e. the range runs up to `end`; with a non-zero setting it is always 1. */
 200 #if !defined SHARABLE_MIDDLE_SUBSTRING
 201 # define SHARABLE_MIDDLE_SUBSTRING 0
 202 #endif
 203 #if !SHARABLE_MIDDLE_SUBSTRING
 204 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
 205 #else
 206 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
 207 #endif
 208
 209
 /* Number of bytes the GC slot of str offers for embedded contents: the slot
  * size minus the offset of the embedded array within struct RString. */
 210 static inline long
 211 str_embed_capa(VALUE str)
 212 {
 213     return (long)(rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary));
 214 }
 215
 /* True when none of the NOFREE, SHARED_ROOT and SHARED flags is set on str;
  * rb_str_make_embedded() asserts this before converting a string. */
 216 bool
 217 rb_str_reembeddable_p(VALUE str)
 218 {
 219     return FL_TEST(str, STR_SHARED | STR_SHARED_ROOT | STR_NOFREE) == 0;
 220 }
 221
 /* Object size in bytes needed to embed capa bytes: capa added to the offset
  * of the embedded array.  Callers include the terminator in capa or add it
  * to the result themselves. */
 222 static inline size_t
 223 rb_str_embed_size(long capa)
 224 {
 225     return capa + offsetof(struct RString, as.embed.ary);
 226 }
 227
 /* Size in bytes that str occupies, or would occupy, as an embedded string.
  * Strings that cannot be re-embedded report sizeof(struct RString). */
 228 size_t
 229 rb_str_size_as_embedded(VALUE str)
 230 {
 231     if (STR_EMBED_P(str)) {
 232         /* already embedded: sized by the current length plus terminator */
 233         return rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
 234     }
 235     if (rb_str_reembeddable_p(str)) {
 236         /* heap string that could be embedded: sized by its capacity */
 237         return rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
 238     }
 239     /* stays on the heap: only the bare object is needed */
 240     return sizeof(struct RString);
 241 }
 245
 /* Whether a string of len bytes plus a termlen-byte terminator fits in an
  * object size the GC is able to allocate. */
 246 static inline bool
 247 STR_EMBEDDABLE_P(long len, long termlen)
 248 {
 249     return rb_gc_size_allocatable_p(rb_str_embed_size(termlen + len));
 250 }
 251
 /* Forward declarations of file-local helpers that are referenced before
  * their definitions. */
 252 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
 253 static VALUE str_new_frozen(VALUE klass, VALUE orig);
 254 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
 255 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
 256 static VALUE str_new(VALUE klass, const char *ptr, long len);
 257 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
 258 static inline void str_modifiable(VALUE str);
 259 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
 260
 /* Call str_make_independent_expand() with str's current length, no extra
  * room, and the terminator width of its encoding. */
 261 static inline void
 262 str_make_independent(VALUE str)
 263 {
 264     const int termlen = TERM_LEN(str);
 265     const long len = RSTRING_LEN(str);
 266     str_make_independent_expand(str, len, 0L, termlen);
 267 }
 268
 269 static inline int str_dependent_p(VALUE str);
 270
 /* Public wrapper: make str independent, but only when str_dependent_p()
  * reports that it currently is dependent. */
 271 void
 272 rb_str_make_independent(VALUE str)
 273 {
 274     if (!str_dependent_p(str)) return;
 275
 276     str_make_independent(str);
 277 }
 278
 /*
  * Turn the heap-allocated string str into an embedded one: its bytes are
  * copied into the object itself, the terminator is rewritten, and the old
  * heap buffer is released.
  *
  * Preconditions (asserted): str is re-embeddable and is not embedded yet.
  * The caller is responsible for the object being large enough to hold the
  * contents.
  *
  * The buffer is released for zero-length strings too.  Previously it was
  * freed only when len > 0, so an empty string that still owned a heap
  * buffer lost it here.  NOTE(review): relies on ruby_xfree() accepting NULL
  * like free() -- confirm.
  */
 279 void
 280 rb_str_make_embedded(VALUE str)
 281 {
 282     RUBY_ASSERT(rb_str_reembeddable_p(str));
 283     RUBY_ASSERT(!STR_EMBED_P(str));
 284
 285     char *buf = RSTRING(str)->as.heap.ptr;
 286     long len = RSTRING(str)->len;
 287
 288     STR_SET_EMBED(str);
 289     STR_SET_LEN(str, len);
 290
 291     if (len > 0) {
 292         memcpy(RSTRING_PTR(str), buf, len);
 293     }
 294     /* free unconditionally: an empty string may still own a buffer */
 295     ruby_xfree(buf);
 296     TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
 297 }
 298
 /* Print a diagnostic to stderr naming func, the accessor that is about to
  * return a NULL string pointer.  Nothing else is done. */
 299 void
 300 rb_debug_rstring_null_ptr(const char *func)
 301 {
 302     fprintf(stderr,
 303             "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
 304             "If you could reproduce, attach your debugger here, and look at the passed string.\n",
 305             func);
 306 }
 308
 309 /* symbols for [up|down|swap]case/capitalize options */
 /* (file-scope; where they are assigned is not visible in this part of the file) */
 310 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
 311
 /* Look up the encoding object for the encoding index stored on str. */
 312 static rb_encoding *
 313 get_encoding(VALUE str)
 314 {
 315     const int encidx = ENCODING_GET(str);
 316     return rb_enc_from_index(encidx);
 317 }
 317
 /* Raise ArgumentError naming str's encoding when is_broken_string() reports
  * str as broken; otherwise return normally. */
 318 static void
 319 mustnot_broken(VALUE str)
 320 {
 321     if (!is_broken_string(str)) return;
 322
 323     rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
 324 }
 325
 /* Raise ArgumentError when the minimum character length of str's encoding
  * exceeds one byte; otherwise return normally. */
 326 static void
 327 mustnot_wchar(VALUE str)
 328 {
 329     rb_encoding *const enc = STR_ENC_GET(str);
 330
 331     if (rb_enc_mbminlen(enc) <= 1) return;
 332     rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
 333 }
 334
 335 static int fstring_cmp(VALUE a, VALUE b);
 336
 337 static VALUE register_fstring(VALUE str, bool copy);
 338
 /* st hash type that pairs fstring_cmp() (key comparison) with rb_str_hash()
  * (hashing).  NOTE(review): presumably the type of the table returned by
  * rb_vm_fstring_table() -- confirm where the table is created. */
 339 const struct st_hash_type rb_fstring_hash_type = {
 340     fstring_cmp,
 341     rb_str_hash,
 342 };
 343
 /* True when str has no FL_EXIVAR flag and its class is exactly rb_cString. */
 344 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
 345
 /* Argument block passed from register_fstring() to fstr_update_callback(). */
 346 struct fstr_update_arg {
 347     VALUE fstr; /* result slot: the table's string, or Qundef to request a retry */
 348     bool copy;  /* for fake strings: copy the bytes instead of referencing them */
 349 };
 350
 /*
  * st_update() callback used by register_fstring() to look up or insert a
  * string in the fstring table.
  *
  * key, value - the table slot; *key is read as the candidate string and,
  *              on insertion, both are overwritten with the stored string
  * data       - a struct fstr_update_arg *; its fstr member receives the result
  * existing   - non-zero when an equal key is already in the table
  *
  * Returns ST_STOP with arg->fstr set to the stored string when a live entry
  * exists; ST_DELETE with arg->fstr set to Qundef when the existing entry is
  * already garbage (register_fstring() then retries); ST_CONTINUE after
  * storing a new entry flagged RSTRING_FSTR.
  */
 351 static int
 352 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
 353 {
 354
 355     struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
 356     VALUE str = (VALUE)*key;
 357
 358     if (existing) {
 359         /* because of lazy sweep, str may be unmarked already and swept
 360          * at next time */
 361
 362         if (rb_objspace_garbage_object_p(str)) {
 363             arg->fstr = Qundef;
 364             return ST_DELETE;
 365         }
 366
 367         arg->fstr = str;
 368         return ST_STOP;
 369     }
 370     else {
 371         if (FL_TEST_RAW(str, STR_FAKESTR)) {
                 /* A fake string is only a header: build a real String from its
                  * bytes, either by copying them or by pointing at them. */
 372             if (arg->copy) {
 373                 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
 374                 rb_enc_copy(new_str, str);
 375                 str = new_str;
 376             }
 377             else {
 378                 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
 379                                      RSTRING(str)->len,
 380                                      ENCODING_GET(str));
 381             }
 382             OBJ_FREEZE(str);
 383         }
 384         else {
                 /* Real string: replace it with a frozen String when it is not
                  * frozen or is only chilled. */
 385             if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
 386                 str = str_new_frozen(rb_cString, str);
 387             }
 388             if (STR_SHARED_P(str)) { /* str should not be shared */
 389                 /* shared substring  */
 390                 str_make_independent(str);
 391                 RUBY_ASSERT(OBJ_FROZEN(str));
 392             }
                 /* Entries must be plain Strings without FL_EXIVAR. */
 393             if (!BARE_STRING_P(str)) {
 394                 str = str_new_frozen(rb_cString, str);
 395             }
 396         }
 397         RBASIC(str)->flags |= RSTRING_FSTR;
 398
             /* The stored object is the key, the value and the reported result. */
 399         *key = *value = arg->fstr = str;
 400         return ST_CONTINUE;
 401     }
 402 }
 403
 /*
  * Return the fstring for str, registering str's contents in the fstring
  * table when necessary.
  *
  * str must be a T_STRING (Check_Type raises otherwise).  A string already
  * flagged RSTRING_FSTR is returned unchanged.  For a bare String the table's
  * object is returned, which may be a different object than str.  For a
  * non-bare string, str itself is returned frozen: directly when it is
  * embedded or an unshared shared root, otherwise after str has been
  * re-pointed at the table's string.
  */
 404 VALUE
 405 rb_fstring(VALUE str)
 406 {
 407     VALUE fstr;
 408     int bare;
 409
 410     Check_Type(str, T_STRING);
 411
 412     if (FL_TEST(str, RSTRING_FSTR))
 413         return str;
 414
 415     bare = BARE_STRING_P(str);
 416     if (!bare) {
             /* An embedded non-bare string is only frozen, never registered. */
 417         if (STR_EMBED_P(str)) {
 418             OBJ_FREEZE(str);
 419             return str;
 420         }
 421
             /* SHARED_ROOT set and SHARED clear: returned as it is. */
 422         if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
 423             RUBY_ASSERT(OBJ_FROZEN(str));
 424             return str;
 425         }
 426     }
 427
         /* NOTE(review): resizing to the current length presumably trims unused
          * capacity before registration -- confirm against rb_str_resize. */
 428     if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
 429         rb_str_resize(str, RSTRING_LEN(str));
 430
 431     fstr = register_fstring(str, FALSE);
 432
 433     if (!bare) {
 434         str_replace_shared_without_enc(str, fstr);
 435         OBJ_FREEZE(str);
 436         return str;
 437     }
 438     return fstr;
 439 }
 440
 /*
  * Look str up in the VM's fstring table, inserting it when absent, and return
  * the table's string.  `copy` is forwarded to fstr_update_callback().  The
  * whole lookup runs between RB_VM_LOCK_ENTER() and RB_VM_LOCK_LEAVE().
  */
 441 static VALUE
 442 register_fstring(VALUE str, bool copy)
 443 {
 444     struct fstr_update_arg args;
 445     args.copy = copy;
 446     RB_VM_LOCK_ENTER();
 447     {
 448         st_table *const table = rb_vm_fstring_table();
 449         for (;;) {
 450             args.fstr = str;
 451             st_update(table, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
 452             /* Qundef: the callback deleted a dead entry, so look up again */
 453             if (!UNDEF_P(args.fstr)) break;
 454         }
 455     }
 456     RB_VM_LOCK_LEAVE();
 457
 458     RUBY_ASSERT(OBJ_FROZEN(args.fstr));
 459     RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
 460     RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
 461     RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
 462     return args.fstr;
 463 }
 464
 /*
  * Fill in the caller-provided header fake_str so that it describes the len
  * bytes at name in encoding index encidx, and return it as a VALUE.  The
  * header is flagged NOEMBED, NOFREE and FAKESTR.  A NULL name is allowed only
  * together with len == 0 and is replaced by "".
  */
 465 static VALUE
 466 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
 467 {
 468     const VALUE self = (VALUE)fake_str;
 469     /* never stored as-is: fstr_update_callback() builds the real string */
 470     fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
 471
 472     if (name == NULL) {
 473         RUBY_ASSERT_ALWAYS(len == 0);
 474         name = "";
 475     }
 476
 477     ENCODING_SET_INLINED(self, encidx);
 478     RBASIC_SET_CLASS_RAW(self, rb_cString);
 479     fake_str->as.heap.ptr = (char *)name;
 480     fake_str->as.heap.aux.capa = len;
 481     fake_str->len = len;
 482     return self;
 483 }
 484
 485 /*
 486  * Set up a fake string that refers to a static string literal.
 487  */
 488 VALUE
 489 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
 490 {
 491     const int encidx = rb_enc_to_index(enc);
 492     return setup_fake_str(fake_str, name, len, encidx);
 493 }
 493
 494 /*
 495  * The rb_fstring_new and rb_fstring_cstr family creates or looks up a frozen
 496  * shared string that refers to a static string literal.  `ptr` must
 497  * point to a constant string.  This variant tags the string as US-ASCII.
 498  */
 499 VALUE
 500 rb_fstring_new(const char *ptr, long len)
 501 {
 502     struct RString fake_str;
 503     const VALUE fake = setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII);
 504     return register_fstring(fake, FALSE);
 505 }
 505
 /* Like rb_fstring_new(), but the string is tagged with the encoding enc. */
 506 VALUE
 507 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
 508 {
 509     struct RString fake_str;
 510     const VALUE fake = rb_setup_fake_str(&fake_str, ptr, len, enc);
 511     return register_fstring(fake, FALSE);
 512 }
 512
 /* rb_fstring_new() for a NUL-terminated C string; the length comes from
  * strlen(), so ptr must not be NULL. */
 513 VALUE
 514 rb_fstring_cstr(const char *ptr)
 515 {
 516     const size_t len = strlen(ptr);
 517     return rb_fstring_new(ptr, len);
 518 }
 518
 /* st iteration callback: set the class of the key string to the VALUE passed
  * in arg and continue iterating.  val is not used. */
 519 static int
 520 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
 521 {
 522     const VALUE klass = (VALUE)arg;
 523     RBASIC_SET_CLASS((VALUE)key, klass);
 524     return ST_CONTINUE;
 525 }
 525
 /* Key comparison for the fstring table: returns 0 when a and b have the same
  * length, the same encoding index and identical bytes, and 1 otherwise. */
 526 static int
 527 fstring_cmp(VALUE a, VALUE b)
 528 {
 529     const char *aptr, *bptr;
 530     long alen, blen;
 531     RSTRING_GETMEM(a, aptr, alen);
 532     RSTRING_GETMEM(b, bptr, blen);
 533     if (alen != blen) return 1;
 534     if (ENCODING_GET(a) != ENCODING_GET(b)) return 1;
 535     return memcmp(aptr, bptr, alen) != 0;
 536 }
 537
 /* Return 1 when every character of str is known to occupy one byte, so that
  * byte offsets and character offsets coincide; return 0 when that cannot be
  * guaranteed. */
 538 static inline int
 539 single_byte_optimizable(VALUE str)
 540 {
 541     /* Conservative: the coderange may still be ENC_CODERANGE_UNKNOWN. */
 542     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) return 1;
 543
 544     /* Otherwise only encodings whose characters never exceed one byte
 545      * qualify.  Conservative again: the string may still consist of single
 546      * bytes only, e.g. "\xa1" in Shift_JIS. */
 547     return rb_enc_mbmaxlen(STR_ENC_GET(str)) == 1;
 548 }
 555
 /* NOTE(review): presumably the default field separator used when splitting
  * strings; nothing in this part of the file reads or assigns it -- confirm. */
 556 VALUE rb_fs;
 557
 /*
  * Return a pointer to the first byte in [p, e) whose high bit (0x80) is set,
  * or NULL when there is none.  Whole machine words are tested against
  * NONASCII_MASK where possible; the bytes before the first aligned word and
  * after the last full word are tested one by one.
  */
 558 static inline const char *
 559 search_nonascii(const char *p, const char *e)
 560 {
 561     const uintptr_t *s, *t;
 562
     /* NONASCII_MASK: a uintptr_t-sized constant with 0x80 in every byte. */
 563 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
 564 # if SIZEOF_UINTPTR_T == 8
 565 #  define NONASCII_MASK UINT64_C(0x8080808080808080)
 566 # elif SIZEOF_UINTPTR_T == 4
 567 #  define NONASCII_MASK UINT32_C(0x80808080)
 568 # else
 569 #  error "don't know what to do."
 570 # endif
 571 #else
 572 # if SIZEOF_UINTPTR_T == 8
 573 #  define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
 574 # elif SIZEOF_UINTPTR_T == 4
 575 #  define NONASCII_MASK 0x80808080UL /* or...? */
 576 # else
 577 #  error "don't know what to do."
 578 # endif
 579 #endif
 580
     /* Word scan: always when unaligned access is allowed, otherwise only when
      * at least one pointer-sized chunk is available. */
 581     if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
 582 #if !UNALIGNED_WORD_ACCESS
         /* Advance p to the next word boundary and test the l skipped bytes
          * individually; the cases fall through on purpose. */
 583         if ((uintptr_t)p % SIZEOF_VOIDP) {
 584             int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
 585             p += l;
 586             switch (l) {
 587               default: UNREACHABLE;
 588 #if SIZEOF_VOIDP > 4
 589               case 7: if (p[-7]&0x80) return p-7;
 590               case 6: if (p[-6]&0x80) return p-6;
 591               case 5: if (p[-5]&0x80) return p-5;
 592               case 4: if (p[-4]&0x80) return p-4;
 593 #endif
 594               case 3: if (p[-3]&0x80) return p-3;
 595               case 2: if (p[-2]&0x80) return p-2;
 596               case 1: if (p[-1]&0x80) return p-1;
 597               case 0: break;
 598             }
 599         }
 600 #endif
 601 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
 602 #define aligned_ptr(value) \
 603         __builtin_assume_aligned((value), sizeof(uintptr_t))
 604 #else
 605 #define aligned_ptr(value) (uintptr_t *)(value)
 606 #endif
 607         s = aligned_ptr(p);
         /* t bounds the loop so that every word read at s ends at or before e. */
 608         t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
 609 #undef aligned_ptr
 610         for (;s < t; s++) {
 611             if (*s & NONASCII_MASK) {
                 /* Bit index of the first flagged byte (leading zeros on
                  * big-endian, trailing zeros otherwise); >>3 turns it into a
                  * byte offset within the word. */
 612 #ifdef WORDS_BIGENDIAN
 613                 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
 614 #else
 615                 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
 616 #endif
 617             }
 618         }
 619         p = (const char *)s;
 620     }
 621
     /* Fewer bytes than a word remain: test them individually, indexing back
      * from e; the cases fall through on purpose. */
 622     switch (e - p) {
 623       default: UNREACHABLE;
 624 #if SIZEOF_VOIDP > 4
 625       case 7: if (e[-7]&0x80) return e-7;
 626       case 6: if (e[-6]&0x80) return e-6;
 627       case 5: if (e[-5]&0x80) return e-5;
 628       case 4: if (e[-4]&0x80) return e-4;
 629 #endif
 630       case 3: if (e[-3]&0x80) return e-3;
 631       case 2: if (e[-2]&0x80) return e-2;
 632       case 1: if (e[-1]&0x80) return e-1;
 633       case 0: return NULL;
 634     }
 635 }
 636
 /*
  * Classify the len bytes at p under encoding enc.  Returns
  * ENC_CODERANGE_7BIT when no byte has its high bit set (binary and
  * ASCII-compatible encodings only), ENC_CODERANGE_VALID when every character
  * is well formed, and ENC_CODERANGE_BROKEN at the first malformed character.
  */
 637 static int
 638 coderange_scan(const char *p, long len, rb_encoding *enc)
 639 {
 640     const char *const e = p + len;
 641
 642     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
 643         /* An ASCII-8BIT string is never broken: a high byte only makes it VALID. */
 644         return search_nonascii(p, e) ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 645     }
 646
 647     if (!rb_enc_asciicompat(enc)) {
 648         /* No ASCII shortcut available: check one character after another. */
 649         while (p < e) {
 650             const int n = rb_enc_precise_mbclen(p, e, enc);
 651             if (!MBCLEN_CHARFOUND_P(n)) return ENC_CODERANGE_BROKEN;
 652             p += MBCLEN_CHARFOUND_LEN(n);
 653         }
 654         return ENC_CODERANGE_VALID;
 655     }
 656
 657     /* ASCII-compatible: skip ASCII runs and check only the characters that
 658      * start at a non-ASCII byte. */
 659     p = search_nonascii(p, e);
 660     if (!p) return ENC_CODERANGE_7BIT;
 661     do {
 662         const int n = rb_enc_precise_mbclen(p, e, enc);
 663         if (!MBCLEN_CHARFOUND_P(n)) return ENC_CODERANGE_BROKEN;
 664         p += MBCLEN_CHARFOUND_LEN(n);
 665     } while (p != e && (p = search_nonascii(p, e)) != NULL);
 666     return ENC_CODERANGE_VALID;
 667 }
 669
 /*
  * Scan [s, e) under encoding enc and update *cr, which carries the coderange
  * found so far.  Returns the number of bytes consumed: e - s when the whole
  * range was handled, or the offset of the first character that could not be
  * accepted.  In that case *cr becomes ENC_CODERANGE_BROKEN when
  * MBCLEN_INVALID_P() holds for it and ENC_CODERANGE_UNKNOWN otherwise.  A
  * *cr that is already BROKEN is left alone.
  */
 670 long
 671 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
 672 {
 673     const long total = e - s;
 674     const char *p = s;
 675
 676     if (*cr == ENC_CODERANGE_BROKEN) return total;
 677
 678     if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
 679         /* ASCII-8BIT is never broken, and VALID cannot go back to 7BIT. */
 680         if (*cr != ENC_CODERANGE_VALID) {
 681             *cr = search_nonascii(p, e) ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
 682         }
 683         return total;
 684     }
 685
 686     if (rb_enc_asciicompat(enc)) {
 687         p = search_nonascii(p, e);
 688         if (!p) {
 689             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
 690             return total;
 691         }
 692         do {
 693             const int n = rb_enc_precise_mbclen(p, e, enc);
 694             if (!MBCLEN_CHARFOUND_P(n)) {
 695                 *cr = MBCLEN_INVALID_P(n) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 696                 return p - s;
 697             }
 698             p += MBCLEN_CHARFOUND_LEN(n);
 699         } while (p != e && (p = search_nonascii(p, e)) != NULL);
 700     }
 701     else {
 702         while (p < e) {
 703             const int n = rb_enc_precise_mbclen(p, e, enc);
 704             if (!MBCLEN_CHARFOUND_P(n)) {
 705                 *cr = MBCLEN_INVALID_P(n) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
 706                 return p - s;
 707             }
 708             p += MBCLEN_CHARFOUND_LEN(n);
 709         }
 710     }
 711     *cr = ENC_CODERANGE_VALID;
 712     return total;
 713 }
 716
 /* Give str1 the encoding index of str2, going through rb_enc_set_index(). */
 717 static inline void
 718 str_enc_copy(VALUE str1, VALUE str2)
 719 {
 720     const int encidx = ENCODING_GET(str2);
 721     rb_enc_set_index(str1, encidx);
 722 }
 722
 723 /* Variant of str_enc_copy that does not check whether str1 is frozen.
 724  * Use it only when str1 is known not to be frozen. */
 725 static inline void
 726 str_enc_copy_direct(VALUE str1, VALUE str2)
 727 {
 728     const int inlined = RB_ENCODING_GET_INLINED(str2);
 729
 730     if (inlined != ENCODING_INLINE_MAX) {
 731         ENCODING_SET_INLINED(str1, inlined);
 732         return;
 733     }
 734     /* ENCODING_INLINE_MAX: go through the index-based getter and setter */
 735     rb_enc_set_index(str1, rb_enc_get_index(str2));
 736 }
 736
 /*
  * Copy the encoding of src to dest and derive dest's coderange, for a dest
  * that was newly built from a part of src.
  */
 737 static void
 738 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 739 {
 740     str_enc_copy(dest, src);
 741
 742     if (RSTRING_LEN(dest) == 0) {
 743         /* empty result: 7BIT for ASCII-compatible encodings, VALID otherwise */
 744         const int empty_cr = rb_enc_asciicompat(STR_ENC_GET(src)) ?
 745             ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
 746         ENC_CODERANGE_SET(dest, empty_cr);
 747         return;
 748     }
 749
 750     const int src_cr = ENC_CODERANGE(src);
 751     if (src_cr == ENC_CODERANGE_7BIT) {
 752         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 753     }
 754     else if (src_cr == ENC_CODERANGE_VALID) {
 755         /* part of a valid string is valid, but it may be pure ASCII */
 756         const bool stays_valid = !rb_enc_asciicompat(STR_ENC_GET(src)) ||
 757             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)) != NULL;
 758         ENC_CODERANGE_SET(dest, stays_valid ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT);
 759     }
 760     /* any other coderange of src: dest's coderange is left as it is */
 761 }
 766
 /* Copy both the encoding and the coderange of src to dest unchanged. */
 767 static void
 768 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
 769 {
 770     str_enc_copy(dest, src);
 771     const int cr = ENC_CODERANGE(src);
 772     ENC_CODERANGE_SET(dest, cr);
 773 }
 773
 /* Run coderange_scan() over the bytes of str, interpreting them as enc. */
 774 static int
 775 enc_coderange_scan(VALUE str, rb_encoding *enc)
 776 {
 777     const char *const ptr = RSTRING_PTR(str);
 778     return coderange_scan(ptr, RSTRING_LEN(str), enc);
 779 }
 779
 /* Exported entry point for enc_coderange_scan(); the result is returned to
  * the caller and not stored on str. */
 780 int
 781 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
 782 {
 783     const int cr = enc_coderange_scan(str, enc);
 784     return cr;
 785 }
 785
 /* Return str's coderange.  When it is still ENC_CODERANGE_UNKNOWN the string
  * is scanned under its own encoding and the result is stored on str. */
 786 int
 787 rb_enc_str_coderange(VALUE str)
 788 {
 789     const int cached = ENC_CODERANGE(str);
 790     if (cached != ENC_CODERANGE_UNKNOWN) return cached;
 791
 792     const int scanned = enc_coderange_scan(str, get_encoding(str));
 793     ENC_CODERANGE_SET(str, scanned);
 794     return scanned;
 795 }
 797
 /* Return TRUE when str's encoding is ASCII-compatible and is_ascii_string()
  * holds for str; return FALSE otherwise. */
 798 int
 799 rb_enc_str_asciionly_p(VALUE str)
 800 {
 801     if (!rb_enc_asciicompat(STR_ENC_GET(str))) return FALSE;
 802
 803     return is_ascii_string(str) ? TRUE : FALSE;
 804 }
 809
 /* Raise RuntimeError unless s still has the buffer pointer p and the length
  * len that the caller observed earlier. */
 810 static inline void
 811 str_mod_check(VALUE s, const char *p, long len)
 812 {
 813     if (RSTRING_PTR(s) == p && RSTRING_LEN(s) == len) return;
 814
 815     rb_raise(rb_eRuntimeError, "string modified");
 816 }
 817
 /* Capacity of str in bytes, not counting a termlen-byte terminator. */
 818 static size_t
 819 str_capacity(VALUE str, const int termlen)
 820 {
 821     if (STR_EMBED_P(str)) return str_embed_capa(str) - termlen;
 822
 823     /* for shared or NOFREE strings the current length is reported */
 824     if (FL_TEST(str, STR_SHARED|STR_NOFREE)) return RSTRING(str)->len;
 825
 826     return RSTRING(str)->as.heap.aux.capa;
 827 }
 831
 /* str_capacity() using the terminator width of str's encoding. */
 832 size_t
 833 rb_str_capacity(VALUE str)
 834 {
 835     const int termlen = TERM_LEN(str);
 836     return str_capacity(str, termlen);
 837 }
 837
 /* Raise ArgumentError when ptr is NULL; otherwise return normally. */
 838 static inline void
 839 must_not_null(const char *ptr)
 840 {
 841     if (ptr) return;
 842
 843     rb_raise(rb_eArgError, "NULL pointer given");
 844 }
 845
 /* Allocate an embedded string object of class klass with room for capa
  * bytes.  Length, contents and terminator are left for the caller to set.
  * The resulting object size must be one the GC can allocate (asserted). */
 846 static inline VALUE
 847 str_alloc_embed(VALUE klass, size_t capa)
 848 {
 849     const size_t size = rb_str_embed_size(capa);
 850     RUBY_ASSERT(size > 0);
 851     RUBY_ASSERT(rb_gc_size_allocatable_p(size));
 852
 853     NEWOBJ_OF(obj, struct RString, klass,
 854             T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size, 0);
 855
 856     return (VALUE)obj;
 857 }
 858
 /* Allocate a non-embedded (STR_NOEMBED) string object of class klass.  The
  * heap pointer, capacity and length are left for the caller to set. */
 859 static inline VALUE
 860 str_alloc_heap(VALUE klass)
 861 {
 862     const VALUE flags = T_STRING | STR_NOEMBED |
 863         (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0);
 864     NEWOBJ_OF(obj, struct RString, klass, flags, sizeof(struct RString), 0);
 865
 866     return (VALUE)obj;
 867 }
 867
 /* Allocate an embedded string of class klass and zero-fill its whole
  * embedded area.  The DTrace creation hook fires with length 0. */
 868 static inline VALUE
 869 empty_str_alloc(VALUE klass)
 870 {
 871     RUBY_DTRACE_CREATE_HOOK(STRING, 0);
 872     const VALUE empty = str_alloc_embed(klass, 0);
 873     char *const bytes = RSTRING(empty)->as.embed.ary;
 874     memset(bytes, 0, str_embed_capa(empty));
 875     return empty;
 876 }
 876
 /*
  * Create a string of class klass that holds len bytes followed by a
  * termlen-byte zero terminator.  The bytes are copied from ptr; with a NULL
  * ptr the contents are left uninitialized.  The string is embedded when
  * len + termlen fits into an allocatable object and heap-allocated
  * otherwise.  Raises ArgumentError for a negative len.
  */
 877 static VALUE
 878 str_new0(VALUE klass, const char *ptr, long len, int termlen)
 879 {
 880     VALUE str;
 881     char *dst;
 882
 883     if (len < 0) {
 884         rb_raise(rb_eArgError, "negative string size (or size too big)");
 885     }
 886     RUBY_DTRACE_CREATE_HOOK(STRING, len);
 887
 888     if (!STR_EMBEDDABLE_P(len, termlen)) {
 889         str = str_alloc_heap(klass);
 890         RSTRING(str)->as.heap.aux.capa = len;
 891         /* :FIXME: @shyouhei guesses `len + termlen` can never overflow; if that
 892          * can be STATIC_ASSERTed, revert this mul_add_mul to a simple ALLOC_N. */
 893         RSTRING(str)->as.heap.ptr =
 894             rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
 895     }
 896     else {
 897         str = str_alloc_embed(klass, len + termlen);
 898         if (len == 0) {
 899             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
 900         }
 901     }
 902
 903     dst = RSTRING_PTR(str);
 904     if (ptr) memcpy(dst, ptr, len);
 905     STR_SET_LEN(str, len);
 906     TERM_FILL(dst + len, termlen);
 907     return str;
 908 }
 910
 /* str_new0() with a one-byte terminator. */
 911 static VALUE
 912 str_new(VALUE klass, const char *ptr, long len)
 913 {
 914     const int termlen = 1;
 915     return str_new0(klass, ptr, len, termlen);
 916 }
 916
 /* Create a String (class rb_cString) from the len bytes at ptr; no encoding
  * is set explicitly here. */
 917 VALUE
 918 rb_str_new(const char *ptr, long len)
 919 {
 920     const VALUE str = str_new(rb_cString, ptr, len);
 921     return str;
 922 }
 922
 /* Like rb_str_new(), then mark the result as US-ASCII with coderange 7BIT.
  * The bytes are not checked against that claim. */
 923 VALUE
 924 rb_usascii_str_new(const char *ptr, long len)
 925 {
 926     VALUE ascii = rb_str_new(ptr, len);
 927     ENCODING_CODERANGE_SET(ascii, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 928     return ascii;
 929 }
 930
 /* Create a String from the len bytes at ptr and associate it with the UTF-8
  * encoding index. */
 931 VALUE
 932 rb_utf8_str_new(const char *ptr, long len)
 933 {
 934     VALUE utf8 = str_new(rb_cString, ptr, len);
 935     rb_enc_associate_index(utf8, rb_utf8_encindex());
 936     return utf8;
 937 }
 938
 /* Create a String from the len bytes at ptr in encoding enc, terminated with
  * as many zero bytes as enc's minimum character length.  A NULL enc falls
  * back to rb_str_new(). */
 939 VALUE
 940 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 941 {
 942     if (enc == NULL) return rb_str_new(ptr, len);
 943
 944     const int termlen = rb_enc_mbminlen(enc);
 945     VALUE str = str_new0(rb_cString, ptr, len, termlen);
 946     rb_enc_associate(str, enc);
 947     return str;
 948 }
 950
 /* Create a String from the NUL-terminated C string ptr.  Raises
  * ArgumentError when ptr is NULL. */
 951 VALUE
 952 rb_str_new_cstr(const char *ptr)
 953 {
 954     must_not_null(ptr);
 955     /* ptr may point into memory that did not come from malloc, which MSAN
 956      * cannot detect.  Mark it as initialized and trust the caller to pass
 957      * a sane C string. */
 958     __msan_unpoison_string(ptr);
 959     const size_t len = strlen(ptr);
 960     return rb_str_new(ptr, len);
 961 }
 962
 /* rb_str_new_cstr(), then mark the result as US-ASCII with coderange 7BIT. */
 963 VALUE
 964 rb_usascii_str_new_cstr(const char *ptr)
 965 {
 966     VALUE ascii = rb_str_new_cstr(ptr);
 967     ENCODING_CODERANGE_SET(ascii, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
 968     return ascii;
 969 }
 970
 /* rb_str_new_cstr(), then associate the result with the UTF-8 encoding index. */
 971 VALUE
 972 rb_utf8_str_new_cstr(const char *ptr)
 973 {
 974     VALUE utf8 = rb_str_new_cstr(ptr);
 975     rb_enc_associate_index(utf8, rb_utf8_encindex());
 976     return utf8;
 977 }
 978
 /* Create a String in encoding enc from the NUL-terminated C string ptr.
  * Raises ArgumentError when ptr is NULL or when enc's minimum character
  * length is not 1, since strlen() cannot measure such strings. */
 979 VALUE
 980 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
 981 {
 982     must_not_null(ptr);
 983
 984     const bool wide = rb_enc_mbminlen(enc) != 1;
 985     if (wide) rb_raise(rb_eArgError, "wchar encoding given");
 986     return rb_enc_str_new(ptr, strlen(ptr), enc);
 987 }
 988
 /*
  * Create a string of class klass and encoding index encindex that refers to
  * the caller's len bytes at ptr without copying them; the string is flagged
  * STR_NOFREE.  With a NULL ptr an ordinary string with its own len-byte
  * buffer is created through str_new0() instead.  Raises ArgumentError for a
  * negative len.
  */
 989 static VALUE
 990 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
 991 {
 992     VALUE str;
 993     if (len < 0) {
 994         rb_raise(rb_eArgError, "negative string size (or size too big)");
 995     }
 996
 997     if (ptr) {
 998         /* reference the caller's buffer directly */
 999         RUBY_DTRACE_CREATE_HOOK(STRING, len);
1000         str = str_alloc_heap(klass);
1001         RSTRING(str)->as.heap.ptr = (char *)ptr;
1002         RSTRING(str)->as.heap.aux.capa = len;
1003         RSTRING(str)->len = len;
1004         RBASIC(str)->flags |= STR_NOFREE;
1005     }
1006     else {
1007         rb_encoding *enc = rb_enc_get_from_index(encindex);
1008         str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1009     }
1010     rb_enc_associate_index(str, encindex);
1011     return str;
1012 }
1013
1014 VALUE
1015 rb_str_new_static(const char *ptr, long len)
1016 {
1017     return str_new_static(rb_cString, ptr, len, 0);
1018 }
1019
1020 VALUE
1021 rb_usascii_str_new_static(const char *ptr, long len)
1022 {
1023     return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1024 }
1025
1026 VALUE
1027 rb_utf8_str_new_static(const char *ptr, long len)
1028 {
1029     return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1030 }
1031
1032 VALUE
1033 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1034 {
1035     return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1036 }
1037
1038 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1039                                    rb_encoding *from, rb_encoding *to,
1040                                    int ecflags, VALUE ecopts);
1041
1042 static inline bool
1043 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1044 {
1045     int encidx = rb_enc_to_index(enc);
1046     if (rb_enc_get_index(str) == encidx)
1047         return is_ascii_string(str);
1048     return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1049 }
1050
1051 VALUE
1052 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1053 {
1054     long len;
1055     const char *ptr;
1056     VALUE newstr;
1057
1058     if (!to) return str;
1059     if (!from) from = rb_enc_get(str);
1060     if (from == to) return str;
1061     if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1062         rb_is_ascii8bit_enc(to)) {
1063         if (STR_ENC_GET(str) != to) {
1064             str = rb_str_dup(str);
1065             rb_enc_associate(str, to);
1066         }
1067         return str;
1068     }
1069
1070     RSTRING_GETMEM(str, ptr, len);
1071     newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1072                                    from, to, ecflags, ecopts);
1073     if (NIL_P(newstr)) {
1074         /* some error, return original */
1075         return str;
1076     }
1077     return newstr;
1078 }
1079
1080 VALUE
1081 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1082                          rb_encoding *from, int ecflags, VALUE ecopts)
1083 {
1084     long olen;
1085
1086     olen = RSTRING_LEN(newstr);
1087     if (ofs < -olen || olen < ofs)
1088         rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1089     if (ofs < 0) ofs += olen;
1090     if (!from) {
1091         STR_SET_LEN(newstr, ofs);
1092         return rb_str_cat(newstr, ptr, len);
1093     }
1094
1095     rb_str_modify(newstr);
1096     return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1097                                  rb_enc_get(newstr),
1098                                  ecflags, ecopts);
1099 }
1100
1101 VALUE
1102 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1103 {
1104     STR_SET_LEN(str, 0);
1105     rb_enc_associate(str, enc);
1106     rb_str_cat(str, ptr, len);
1107     return str;
1108 }
1109
/*
 * Converts len bytes at ptr from encoding `from` to encoding `to` and
 * stores the output in newstr starting at byte offset ofs.
 *
 * Returns newstr (length set to the end of the converted output and
 * tagged with `to`) when the converter reports econv_finished.  Returns
 * Qnil when the converter cannot be opened or ends in any other state.
 */
static VALUE
str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
                      rb_encoding *from, rb_encoding *to,
                      int ecflags, VALUE ecopts)
{
    rb_econv_t *ec;
    rb_econv_result_t ret;
    long olen;
    VALUE econv_wrapper;
    const unsigned char *start, *sp;
    unsigned char *dest, *dp;
    size_t converted_output = (size_t)ofs;

    /* olen tracks the number of bytes available in the destination */
    olen = rb_str_capacity(newstr);

    /* The converter is attached to a class-less wrapper object while it is
     * in use -- presumably so it gets released if an exception unwinds
     * through this function (TODO confirm). */
    econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
    RBASIC_CLEAR_CLASS(econv_wrapper);
    ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
    if (!ec) return Qnil;
    DATA_PTR(econv_wrapper) = ec;

    sp = (unsigned char*)ptr;
    start = sp;
    /* Each pass re-reads the destination pointer from newstr (it is
     * resized at the bottom of the loop), resumes writing right after the
     * output produced so far, and repeats while the converter reports
     * that the destination is full. */
    while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
           (dp = dest + converted_output),
           (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
           ret == econv_destination_buffer_full) {
        /* destination buffer short */
        size_t converted_input = sp - start;
        size_t rest = len - converted_input;
        converted_output = dp - dest;
        rb_str_set_len(newstr, converted_output);
        /* Estimate the space still needed by scaling the unread input with
         * the output/input ratio seen so far, as long as the product
         * cannot overflow; otherwise grow by the current capacity. */
        if (converted_input && converted_output &&
            rest < (LONG_MAX / converted_output)) {
            rest = (rest * converted_output) / converted_input;
        }
        else {
            rest = olen;
        }
        /* always grow by at least 2 bytes */
        olen += rest < 2 ? 2 : rest;
        rb_str_resize(newstr, olen);
    }
    /* Detach the converter from the wrapper before closing it here. */
    DATA_PTR(econv_wrapper) = 0;
    RB_GC_GUARD(econv_wrapper);
    rb_econv_close(ec);
    switch (ret) {
      case econv_finished:
        len = dp - (unsigned char*)RSTRING_PTR(newstr);
        rb_str_set_len(newstr, len);
        rb_enc_associate(newstr, to);
        return newstr;

      default:
        return Qnil;
    }
}
1166
1167 VALUE
1168 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
1169 {
1170     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1171 }
1172
1173 VALUE
1174 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
1175 {
1176     rb_encoding *ienc;
1177     VALUE str;
1178     const int eidx = rb_enc_to_index(eenc);
1179
1180     if (!ptr) {
1181         return rb_enc_str_new(ptr, len, eenc);
1182     }
1183
1184     /* ASCII-8BIT case, no conversion */
1185     if ((eidx == rb_ascii8bit_encindex()) ||
1186         (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1187         return rb_str_new(ptr, len);
1188     }
1189     /* no default_internal or same encoding, no conversion */
1190     ienc = rb_default_internal_encoding();
1191     if (!ienc || eenc == ienc) {
1192         return rb_enc_str_new(ptr, len, eenc);
1193     }
1194     /* ASCII compatible, and ASCII only string, no conversion in
1195      * default_internal */
1196     if ((eidx == rb_ascii8bit_encindex()) ||
1197         (eidx == rb_usascii_encindex()) ||
1198         (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1199         return rb_enc_str_new(ptr, len, ienc);
1200     }
1201     /* convert from the given encoding to default_internal */
1202     str = rb_enc_str_new(NULL, 0, ienc);
1203     /* when the conversion failed for some reason, just ignore the
1204      * default_internal and result in the given encoding as-is. */
1205     if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1206         rb_str_initialize(str, ptr, len, eenc);
1207     }
1208     return str;
1209 }
1210
1211 VALUE
1212 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1213 {
1214     int eidx = rb_enc_to_index(eenc);
1215     if (eidx == rb_usascii_encindex() &&
1216         !is_ascii_string(str)) {
1217         rb_enc_associate_index(str, rb_ascii8bit_encindex());
1218         return str;
1219     }
1220     rb_enc_associate_index(str, eidx);
1221     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1222 }
1223
1224 VALUE
1225 rb_external_str_new(const char *ptr, long len)
1226 {
1227     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1228 }
1229
1230 VALUE
1231 rb_external_str_new_cstr(const char *ptr)
1232 {
1233     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1234 }
1235
1236 VALUE
1237 rb_locale_str_new(const char *ptr, long len)
1238 {
1239     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1240 }
1241
1242 VALUE
1243 rb_locale_str_new_cstr(const char *ptr)
1244 {
1245     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1246 }
1247
1248 VALUE
1249 rb_filesystem_str_new(const char *ptr, long len)
1250 {
1251     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1252 }
1253
1254 VALUE
1255 rb_filesystem_str_new_cstr(const char *ptr)
1256 {
1257     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1258 }
1259
1260 VALUE
1261 rb_str_export(VALUE str)
1262 {
1263     return rb_str_export_to_enc(str, rb_default_external_encoding());
1264 }
1265
1266 VALUE
1267 rb_str_export_locale(VALUE str)
1268 {
1269     return rb_str_export_to_enc(str, rb_locale_encoding());
1270 }
1271
1272 VALUE
1273 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
1274 {
1275     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1276 }
1277
/*
 * Makes str2 hold the content of str without touching str2's encoding or
 * code range (str_replace_shared() adds that step).
 *
 * If the content plus terminator fits into str2's embedded storage the
 * bytes are copied there.  Otherwise str2 is pointed at the buffer of a
 * frozen root string and marked as sharing it.  Returns str2.
 */
static VALUE
str_replace_shared_without_enc(VALUE str2, VALUE str)
{
    const int termlen = TERM_LEN(str);
    char *ptr;
    long len;

    RSTRING_GETMEM(str, ptr, len);
    if (str_embed_capa(str2) >= len + termlen) {
        /* small enough: copy into str2's embedded array and terminate */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str), len);
        TERM_FILL(ptr2+len, termlen);
    }
    else {
        VALUE root;
        if (STR_SHARED_P(str)) {
            /* str already shares a buffer: reuse its root */
            root = RSTRING(str)->as.heap.aux.shared;
            RSTRING_GETMEM(str, ptr, len);
        }
        else {
            /* otherwise obtain a frozen string to act as the root */
            root = rb_str_new_frozen(str);
            RSTRING_GETMEM(root, ptr, len);
        }
        RUBY_ASSERT(OBJ_FROZEN(root));

        /* str2 has a private heap buffer: release it before repointing,
         * unless it happens to be the very buffer about to be shared */
        if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
            if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
                rb_fatal("about to free a possible shared root");
            }
            char *ptr2 = STR_HEAP_PTR(str2);
            if (ptr2 != ptr) {
                ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
            }
        }
        FL_SET(str2, STR_NOEMBED);
        RSTRING(str2)->as.heap.ptr = ptr;
        STR_SET_SHARED(str2, root);
    }

    STR_SET_LEN(str2, len);

    return str2;
}
1322
1323 static VALUE
1324 str_replace_shared(VALUE str2, VALUE str)
1325 {
1326     str_replace_shared_without_enc(str2, str);
1327     rb_enc_cr_str_exact_copy(str2, str);
1328     return str2;
1329 }
1330
1331 static VALUE
1332 str_new_shared(VALUE klass, VALUE str)
1333 {
1334     return str_replace_shared(str_alloc_heap(klass), str);
1335 }
1336
1337 VALUE
1338 rb_str_new_shared(VALUE str)
1339 {
1340     return str_new_shared(rb_obj_class(str), str);
1341 }
1342
1343 VALUE
1344 rb_str_new_frozen(VALUE orig)
1345 {
1346     if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1347     return str_new_frozen(rb_obj_class(orig), orig);
1348 }
1349
1350 static VALUE
1351 rb_str_new_frozen_String(VALUE orig)
1352 {
1353     if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1354     return str_new_frozen(rb_cString, orig);
1355 }
1356
1357 VALUE
1358 rb_str_tmp_frozen_acquire(VALUE orig)
1359 {
1360     if (OBJ_FROZEN_RAW(orig)) return orig;
1361     return str_new_frozen_buffer(0, orig, FALSE);
1362 }
1363
/*
 * Like rb_str_tmp_frozen_acquire(), but the returned string is never
 * embedded: its bytes always live in a heap buffer.
 *
 * orig is returned as is when it is frozen, not embedded and not
 * re-embeddable.  A shared orig whose root is not embedded is delegated
 * to rb_str_tmp_frozen_acquire().  Otherwise a class-less frozen string is
 * created that either gets a private copy of the bytes or takes over
 * orig's heap buffer, with orig becoming a sharer of it.
 */
VALUE
rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
{
    if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
    if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);

    VALUE str = str_alloc_heap(0);
    OBJ_FREEZE(str);
    /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
    FL_SET(str, STR_SHARED_ROOT);

    size_t capa = str_capacity(orig, TERM_LEN(orig));

    /* If the string is embedded then we want to create a copy that is heap
     * allocated. If the string is shared then the shared root must be
     * embedded, so we want to create a copy. If the string is a shared root
     * then it must be embedded, so we want to create a copy. */
    if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
        /* the allocation is sized capa + TERM_LEN(orig) bytes; capa bytes are copied */
        RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
        memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
    }
    else {
        /* orig must be heap allocated and not shared, so we can safely transfer
         * the pointer to str. */
        RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
        /* STR_NOFREE moves along with the buffer from orig to str */
        RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
        RBASIC(orig)->flags &= ~STR_NOFREE;
        STR_SET_SHARED(orig, str);
    }

    RSTRING(str)->len = RSTRING(orig)->len;
    RSTRING(str)->as.heap.aux.capa = capa;

    return str;
}
1399
/*
 * Releases tmp, a string previously obtained for orig from one of the
 * rb_str_tmp_frozen_*acquire() functions.
 *
 * Nothing happens unless tmp is class-less (class 0).  If orig is still
 * an unfrozen, unlocked sharer of tmp and tmp has not been marked
 * STR_BORROWED, the buffer is handed back to orig and tmp is turned into
 * an empty embedded string.
 */
void
rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
{
    /* tmp has a real class, so it is not one of the class-less temporaries */
    if (RBASIC_CLASS(tmp) != 0)
        return;

    if (STR_EMBED_P(tmp)) {
        /* embedded temporary: it owns its bytes, nothing to hand back */
        RUBY_ASSERT(OBJ_FROZEN_RAW(tmp));
    }
    else if (FL_TEST_RAW(orig, STR_SHARED) &&
            !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
        VALUE shared = RSTRING(orig)->as.heap.aux.shared;

        if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
            RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
            RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));

            /* Unshare orig since the root (tmp) only has this one child. */
            FL_UNSET_RAW(orig, STR_SHARED);
            RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
            /* give STR_NOFREE (if set on tmp) back to orig together with the buffer */
            RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
            RUBY_ASSERT(OBJ_FROZEN_RAW(tmp));

            /* Make tmp embedded and empty so it is safe for sweeping. */
            STR_SET_EMBED(tmp);
            STR_SET_LEN(tmp, 0);
        }
    }
}
1429
1430 static VALUE
1431 str_new_frozen(VALUE klass, VALUE orig)
1432 {
1433     return str_new_frozen_buffer(klass, orig, TRUE);
1434 }
1435
/*
 * Creates a new heap string of class klass that takes over orig's buffer
 * and turns orig into a sharer of it.
 *
 * orig must be neither embedded nor already shared (asserted).  The new
 * string receives orig's length, pointer, capacity and STR_NOFREE flag;
 * the caller is responsible for freezing it.
 */
static VALUE
heap_str_make_shared(VALUE klass, VALUE orig)
{
    RUBY_ASSERT(!STR_EMBED_P(orig));
    RUBY_ASSERT(!STR_SHARED_P(orig));

    VALUE str = str_alloc_heap(klass);
    STR_SET_LEN(str, RSTRING_LEN(orig));
    RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
    RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
    /* STR_NOFREE moves along with the buffer from orig to str */
    RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
    RBASIC(orig)->flags &= ~STR_NOFREE;
    STR_SET_SHARED(orig, str);
    /* class-less roots start out not borrowed; see rb_str_tmp_frozen_release() */
    if (klass == 0)
        FL_UNSET_RAW(str, STR_BORROWED);
    return str;
}
1453
/*
 * Produces a frozen string of class klass with the content of orig.
 *
 * copy_encoding selects whether orig's terminator length, encoding and
 * code range are carried over; when it is false a terminator length of 1
 * is used and the encoding is left alone.
 *
 * Small or embedded content is copied.  A shared orig either gets a new
 * sharer of the same root or, when the root already matches exactly, the
 * root itself is returned.  Any other heap string hands its buffer to a
 * new root through heap_str_make_shared().
 */
static VALUE
str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
{
    VALUE str;

    long len = RSTRING_LEN(orig);
    int termlen = copy_encoding ? TERM_LEN(orig) : 1;

    if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
        str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
        RUBY_ASSERT(STR_EMBED_P(str));
    }
    else {
        if (FL_TEST_RAW(orig, STR_SHARED)) {
            VALUE shared = RSTRING(orig)->as.heap.aux.shared;
            /* ofs/rest: bytes of the root before and after orig's window */
            long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
            long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
            RUBY_ASSERT(ofs >= 0);
            RUBY_ASSERT(rest >= 0);
            RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
            RUBY_ASSERT(OBJ_FROZEN(shared));

            /* the root cannot stand in for orig if it covers more bytes,
             * has a different class, or a different encoding */
            if ((ofs > 0) || (rest > 0) ||
                (klass != RBASIC(shared)->klass) ||
                ENCODING_GET(shared) != ENCODING_GET(orig)) {
                str = str_new_shared(klass, shared);
                RUBY_ASSERT(!STR_EMBED_P(str));
                /* narrow the new sharer to orig's window within the root */
                RSTRING(str)->as.heap.ptr += ofs;
                STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
            }
            else {
                /* mark a class-less root as handed out before returning it */
                if (RBASIC_CLASS(shared) == 0)
                    FL_SET_RAW(shared, STR_BORROWED);
                return shared;
            }
        }
        else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
            str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
            STR_SET_EMBED(str);
            memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
            STR_SET_LEN(str, RSTRING_LEN(orig));
            TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
        }
        else {
            str = heap_str_make_shared(klass, orig);
        }
    }

    if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
    OBJ_FREEZE(str);
    return str;
}
1506
1507 VALUE
1508 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1509 {
1510     return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1511 }
1512
1513 static VALUE
1514 str_new_empty_String(VALUE str)
1515 {
1516     VALUE v = rb_str_new(0, 0);
1517     rb_enc_copy(v, str);
1518     return v;
1519 }
1520
1521 #define STR_BUF_MIN_SIZE 63
1522
1523 VALUE
1524 rb_str_buf_new(long capa)
1525 {
1526     if (STR_EMBEDDABLE_P(capa, 1)) {
1527         return str_alloc_embed(rb_cString, capa + 1);
1528     }
1529
1530     VALUE str = str_alloc_heap(rb_cString);
1531
1532     RSTRING(str)->as.heap.aux.capa = capa;
1533     RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1534     RSTRING(str)->as.heap.ptr[0] = '\0';
1535
1536     return str;
1537 }
1538
1539 VALUE
1540 rb_str_buf_new_cstr(const char *ptr)
1541 {
1542     VALUE str;
1543     long len = strlen(ptr);
1544
1545     str = rb_str_buf_new(len);
1546     rb_str_buf_cat(str, ptr, len);
1547
1548     return str;
1549 }
1550
1551 VALUE
1552 rb_str_tmp_new(long len)
1553 {
1554     return str_new(0, 0, len);
1555 }
1556
1557 void
1558 rb_str_free(VALUE str)
1559 {
1560     if (FL_TEST(str, RSTRING_FSTR)) {
1561         st_data_t fstr = (st_data_t)str;
1562
1563         RB_VM_LOCK_ENTER();
1564         {
1565             st_delete(rb_vm_fstring_table(), &fstr, NULL);
1566             RB_DEBUG_COUNTER_INC(obj_str_fstr);
1567         }
1568         RB_VM_LOCK_LEAVE();
1569     }
1570
1571     if (STR_EMBED_P(str)) {
1572         RB_DEBUG_COUNTER_INC(obj_str_embed);
1573     }
1574     else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1575         (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1576         (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1577     }
1578     else {
1579         RB_DEBUG_COUNTER_INC(obj_str_ptr);
1580         ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1581     }
1582 }
1583
1584 size_t
1585 rb_str_memsize(VALUE str)
1586 {
1587     if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1588         return STR_HEAP_SIZE(str);
1589     }
1590     else {
1591         return 0;
1592     }
1593 }
1594
1595 VALUE
1596 rb_str_to_str(VALUE str)
1597 {
1598     return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1599 }
1600
1601 static inline void str_discard(VALUE str);
1602 static void str_shared_replace(VALUE str, VALUE str2);
1603
1604 void
1605 rb_str_shared_replace(VALUE str, VALUE str2)
1606 {
1607     if (str != str2) str_shared_replace(str, str2);
1608 }
1609
/*
 * Moves the content, encoding and code range of str2 into str.
 *
 * str's previous content is discarded first.  If the bytes fit into
 * str's embedded storage they are copied and str2 is left untouched.
 * Otherwise str takes over str2's heap buffer (or becomes a sharer of
 * str2's root) and str2 is reset to an empty embedded string.
 * str and str2 must be different objects (asserted).
 */
static void
str_shared_replace(VALUE str, VALUE str2)
{
    rb_encoding *enc;
    int cr;
    int termlen;

    RUBY_ASSERT(str2 != str);
    /* read str2's encoding and code range up front; they are applied to str at the end */
    enc = STR_ENC_GET(str2);
    cr = ENC_CODERANGE(str2);
    str_discard(str);
    termlen = rb_enc_mbminlen(enc);

    STR_SET_LEN(str, RSTRING_LEN(str2));

    if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
        /* fits into str's embedded storage: copy bytes and terminator */
        STR_SET_EMBED(str);
        memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
    }
    else {
        if (STR_EMBED_P(str2)) {
            /* str2 is embedded: move its bytes to a heap buffer first so
             * that the buffer can be handed to str below */
            RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
            long len = RSTRING_LEN(str2);
            RUBY_ASSERT(len + termlen <= str_embed_capa(str2));

            char *new_ptr = ALLOC_N(char, len + termlen);
            memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
            RSTRING(str2)->as.heap.ptr = new_ptr;
            STR_SET_LEN(str2, len);
            RSTRING(str2)->as.heap.aux.capa = len;
            STR_SET_NOEMBED(str2);
        }

        STR_SET_NOEMBED(str);
        FL_UNSET(str, STR_SHARED);
        RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);

        if (FL_TEST(str2, STR_SHARED)) {
            /* str2 was a sharer: str becomes a sharer of the same root */
            VALUE shared = RSTRING(str2)->as.heap.aux.shared;
            STR_SET_SHARED(str, shared);
        }
        else {
            RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
        }

        /* abandon str2 */
        STR_SET_EMBED(str2);
        RSTRING_PTR(str2)[0] = 0;
        STR_SET_LEN(str2, 0);
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
    }
}
1665
1666 VALUE
1667 rb_obj_as_string(VALUE obj)
1668 {
1669     VALUE str;
1670
1671     if (RB_TYPE_P(obj, T_STRING)) {
1672         return obj;
1673     }
1674     str = rb_funcall(obj, idTo_s, 0);
1675     return rb_obj_as_string_result(str, obj);
1676 }
1677
1678 VALUE
1679 rb_obj_as_string_result(VALUE str, VALUE obj)
1680 {
1681     if (!RB_TYPE_P(str, T_STRING))
1682         return rb_any_to_s(obj);
1683     return str;
1684 }
1685
1686 static VALUE
1687 str_replace(VALUE str, VALUE str2)
1688 {
1689     long len;
1690
1691     len = RSTRING_LEN(str2);
1692     if (STR_SHARED_P(str2)) {
1693         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1694         RUBY_ASSERT(OBJ_FROZEN(shared));
1695         STR_SET_NOEMBED(str);
1696         STR_SET_LEN(str, len);
1697         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1698         STR_SET_SHARED(str, shared);
1699         rb_enc_cr_str_exact_copy(str, str2);
1700     }
1701     else {
1702         str_replace_shared(str, str2);
1703     }
1704
1705     return str;
1706 }
1707
1708 static inline VALUE
1709 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1710 {
1711     size_t size = rb_str_embed_size(capa);
1712     RUBY_ASSERT(size > 0);
1713     RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1714
1715     NEWOBJ_OF(str, struct RString, klass,
1716             T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size, ec);
1717
1718     return (VALUE)str;
1719 }
1720
1721 static inline VALUE
1722 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1723 {
1724     NEWOBJ_OF(str, struct RString, klass,
1725             T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1726
1727     return (VALUE)str;
1728 }
1729
/*
 * Fills the freshly allocated string dup so that it duplicates str.
 *
 * An embedded str has its bytes copied into dup (which must be embedded
 * too).  A heap str is not copied: dup becomes a sharer of str's root,
 * and if str was neither shared nor frozen it is first given a frozen
 * root via str_new_frozen().  The encoding and code range flags are
 * carried over; the frozen flag is not.  Returns dup.
 */
static inline VALUE
str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
{
    const VALUE flag_mask =
        ENC_CODERANGE_MASK | ENCODING_MASK |
        FL_FREEZE
        ;
    VALUE flags = FL_TEST_RAW(str, flag_mask);
    int encidx = 0;
    if (STR_EMBED_P(str)) {
        long len = RSTRING_LEN(str);

        RUBY_ASSERT(STR_EMBED_P(dup));
        RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
        MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
    }
    else {
        VALUE root = str;
        if (FL_TEST_RAW(str, STR_SHARED)) {
            root = RSTRING(str)->as.heap.aux.shared;
        }
        else if (UNLIKELY(!(flags & FL_FREEZE))) {
            /* unfrozen, unshared heap string: get a frozen root and
             * duplicate from that instead, re-reading the flags from it */
            root = str = str_new_frozen(klass, str);
            flags = FL_TEST_RAW(str, flag_mask);
        }
        RUBY_ASSERT(!STR_SHARED_P(root));
        RUBY_ASSERT(RB_OBJ_FROZEN_RAW(root));

        RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
        FL_SET(root, STR_SHARED_ROOT);
        RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
        flags |= RSTRING_NOEMBED | STR_SHARED;
    }

    STR_SET_LEN(dup, RSTRING_LEN(str));

    /* The inline encoding field holds its maximum value: look up the real
     * encoding index and associate it separately instead of copying the
     * flag bits. */
    if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
        encidx = rb_enc_get_index(str);
        flags &= ~ENCODING_MASK;
    }
    /* the duplicate is never frozen, whatever str was */
    FL_SET_RAW(dup, flags & ~FL_FREEZE);
    if (encidx) rb_enc_associate_index(dup, encidx);
    return dup;
}
1774
1775 static inline VALUE
1776 ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1777 {
1778     VALUE dup;
1779     if (STR_EMBED_P(str)) {
1780         dup = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1781     }
1782     else {
1783         dup = ec_str_alloc_heap(ec, klass);
1784     }
1785
1786     return str_duplicate_setup(klass, str, dup);
1787 }
1788
1789 static inline VALUE
1790 str_duplicate(VALUE klass, VALUE str)
1791 {
1792     VALUE dup;
1793     if (STR_EMBED_P(str)) {
1794         dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1795     }
1796     else {
1797         dup = str_alloc_heap(klass);
1798     }
1799
1800     return str_duplicate_setup(klass, str, dup);
1801 }
1802
1803 VALUE
1804 rb_str_dup(VALUE str)
1805 {
1806     return str_duplicate(rb_obj_class(str), str);
1807 }
1808
1809 /* :nodoc: */
1810 VALUE
1811 rb_str_dup_m(VALUE str)
1812 {
1813     if (LIKELY(BARE_STRING_P(str))) {
1814         return str_duplicate(rb_obj_class(str), str);
1815     }
1816     else {
1817         return rb_obj_dup(str);
1818     }
1819 }
1820
1821 VALUE
1822 rb_str_resurrect(VALUE str)
1823 {
1824     RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1825     return str_duplicate(rb_cString, str);
1826 }
1827
1828 VALUE
1829 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1830 {
1831     RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1832     VALUE new_str = ec_str_duplicate(ec, rb_cString, str);
1833     if (chilled) {
1834         STR_CHILL_RAW(new_str);
1835     }
1836     return new_str;
1837 }
1838
1839 bool
1840 rb_str_chilled_p(VALUE str)
1841 {
1842     return CHILLED_STRING_P(str);
1843 }
1844
1845 /*
1846  *
1847  *  call-seq:
1848  *    String.new(string = '', **opts) -> new_string
1849  *
1850  *  :include: doc/string/new.rdoc
1851  *
1852  */
1853
static VALUE
rb_str_init(int argc, VALUE *argv, VALUE str)
{
    /*
     * Accepts an optional source string plus the keywords encoding: and
     * capacity:.  Without keywords str simply takes over the content of
     * the source string.  With capacity: str is forced onto a heap buffer
     * of at least that many bytes.  With encoding: str is retagged and its
     * code range cleared.  Returns str.
     */
    static ID keyword_ids[2];
    VALUE orig, opt, venc, vcapa;
    VALUE kwargs[2];
    rb_encoding *enc = 0;
    int n;

    /* keyword IDs are looked up once and cached in the static array */
    if (!keyword_ids[0]) {
        keyword_ids[0] = rb_id_encoding();
        CONST_ID(keyword_ids[1], "capacity");
    }

    /* n is the number of positional arguments given (0 or 1) */
    n = rb_scan_args(argc, argv, "01:", &orig, &opt);
    if (!NIL_P(opt)) {
        rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
        venc = kwargs[0];
        vcapa = kwargs[1];
        if (!UNDEF_P(venc) && !NIL_P(venc)) {
            enc = rb_to_encoding(venc);
        }
        if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
            long capa = NUM2LONG(vcapa);
            long len = 0;
            int termlen = enc ? rb_enc_mbminlen(enc) : 1;

            /* never go below the minimum buffer size */
            if (capa < STR_BUF_MIN_SIZE) {
                capa = STR_BUF_MIN_SIZE;
            }
            if (n == 1) {
                StringValue(orig);
                len = RSTRING_LEN(orig);
                /* the buffer must hold at least the source content */
                if (capa < len) {
                    capa = len;
                }
                /* initializing from itself: content is already in place,
                 * so skip the copy further down */
                if (orig == str) n = 0;
            }
            str_modifiable(str);
            if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
                /* make noembed always */
                const size_t size = (size_t)capa + termlen;
                const char *const old_ptr = RSTRING_PTR(str);
                const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
                char *new_ptr = ALLOC_N(char, size);
                if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
                /* carry over the old bytes, limited to what fits */
                memcpy(new_ptr, old_ptr, osize < size ? osize : size);
                FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
                RSTRING(str)->as.heap.ptr = new_ptr;
            }
            else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
                /* str already owns a heap buffer: resize it in place */
                SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
                        (size_t)capa + termlen, STR_HEAP_SIZE(str));
            }
            STR_SET_LEN(str, len);
            TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
            if (n == 1) {
                memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
                rb_enc_cr_str_exact_copy(str, orig);
            }
            FL_SET(str, STR_NOEMBED);
            RSTRING(str)->as.heap.aux.capa = capa;
        }
        else if (n == 1) {
            rb_str_replace(str, orig);
        }
        /* an explicit encoding overrides whatever was copied from orig */
        if (enc) {
            rb_enc_associate(str, enc);
            ENC_CODERANGE_CLEAR(str);
        }
    }
    else if (n == 1) {
        rb_str_replace(str, orig);
    }
    return str;
}
1930
1931 /* :nodoc: */
static VALUE
rb_str_s_new(int argc, VALUE *argv, VALUE klass)
{
    /*
     * Direct construction path for rb_cString when keyword arguments are
     * given.  Any other class, and calls without keywords, go through
     * rb_class_new_instance_pass_kw().
     */
    if (klass != rb_cString) {
        return rb_class_new_instance_pass_kw(argc, argv, klass);
    }

    static ID keyword_ids[2];
    VALUE orig, opt, encoding = Qnil, capacity = Qnil;
    VALUE kwargs[2];
    rb_encoding *enc = NULL;

    /* n is the number of positional arguments given (0 or 1) */
    int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
    if (NIL_P(opt)) {
        return rb_class_new_instance_pass_kw(argc, argv, klass);
    }

    keyword_ids[0] = rb_id_encoding();
    CONST_ID(keyword_ids[1], "capacity");
    rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
    encoding = kwargs[0];
    capacity = kwargs[1];

    int termlen = 1;

    /* normalize orig: a String when a source was given, nil otherwise */
    if (n == 1) {
        orig = StringValue(orig);
    }
    else {
        orig = Qnil;
    }

    /* without encoding:, fall back to the encoding of the source string */
    if (UNDEF_P(encoding)) {
        if (!NIL_P(orig)) {
            encoding = rb_obj_encoding(orig);
        }
    }

    if (!UNDEF_P(encoding)) {
        enc = rb_to_encoding(encoding);
        termlen = rb_enc_mbminlen(enc);
    }

    // If capacity is nil, we're basically just duping `orig`.
    if (UNDEF_P(capacity)) {
        if (NIL_P(orig)) {
            VALUE empty_str = str_new(klass, "", 0);
            if (enc) {
                rb_enc_associate(empty_str, enc);
            }
            return empty_str;
        }
        VALUE copy = str_duplicate(klass, orig);
        rb_enc_associate(copy, enc);
        ENC_CODERANGE_CLEAR(copy);
        return copy;
    }

    /* negative capacities are treated as 0 */
    long capa = 0;
    capa = NUM2LONG(capacity);
    if (capa < 0) {
        capa = 0;
    }

    /* never allocate less than the source string's own capacity */
    if (!NIL_P(orig)) {
        long orig_capa = rb_str_capacity(orig);
        if (orig_capa > capa) {
            capa = orig_capa;
        }
    }

    /* Create the string with a placeholder length of capa - termlen
     * (clamped at 0) and then reset the length to 0; presumably this
     * reserves the buffer up front -- confirm against str_new0(). */
    long fake_len = capa - termlen;
    if (fake_len < 0) {
        fake_len = 0;
    }

    VALUE str = str_new0(klass, NULL, fake_len, termlen);
    STR_SET_LEN(str, 0);
    TERM_FILL(RSTRING_PTR(str), termlen);

    if (enc) {
        rb_enc_associate(str, enc);
    }

    if (!NIL_P(orig)) {
        rb_str_buf_append(str, orig);
    }

    return str;
}
2022
2023 #ifdef NONASCII_MASK
2024 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2025
2026 /*
2027  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2028  * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2029  * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2030  *
2031  * if (!(byte & 0x80))
2032  *   byte |= 0x40;          // turn on bit6
2033  * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
2034  *
2035  * This function calculates whether a byte is leading or not for all bytes
2036  * in the argument word by concurrently using the above logic, and then
2037  * adds up the number of leading bytes in the word.
2038  */
2039 static inline uintptr_t
2040 count_utf8_lead_bytes_with_word(const uintptr_t *s)
2041 {
2042     uintptr_t d = *s;
2043
2044     /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2045     d = (d>>6) | (~d>>7);
2046     d &= NONASCII_MASK >> 7;
2047
2048     /* Gather all bytes. */
2049 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2050     /* use only if it can use POPCNT */
2051     return rb_popcount_intptr(d);
2052 #else
2053     d += (d>>8);
2054     d += (d>>16);
2055 # if SIZEOF_VOIDP == 8
2056     d += (d>>32);
2057 # endif
2058     return (d&0xF);
2059 #endif
2060 }
2061 #endif
2062
/*
 * Counts the characters in the byte range [p, e) for encoding `enc`.
 * `cr` is the caller's coderange for that range; it selects which of the
 * counting strategies below may be used.
 */
static inline long
enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
{
    long c;
    const char *q;

    /* Fixed-width encoding: byte length divided by the character width,
     * rounded up so a trailing partial character still counts as one. */
    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        long diff = (long)(e - p);
        return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
    }
#ifdef NONASCII_MASK
    /* Valid UTF-8: the character count equals the number of lead bytes. */
    else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
        uintptr_t len = 0;
        if ((int)sizeof(uintptr_t) * 2 < e - p) {
            const uintptr_t *s, *t;
            const uintptr_t lowbits = sizeof(uintptr_t) - 1;
            /* s: p rounded up to a word boundary; t: e rounded down to one. */
            s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
            t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
            /* Unaligned head, byte by byte. */
            while (p < (const char *)s) {
                if (is_utf8_lead_byte(*p)) len++;
                p++;
            }
            /* Aligned middle, one machine word at a time. */
            while (s < t) {
                len += count_utf8_lead_bytes_with_word(s);
                s++;
            }
            p = (const char *)s;
        }
        /* Remaining tail (or the whole range when it was too short above). */
        while (p < e) {
            if (is_utf8_lead_byte(*p)) len++;
            p++;
        }
        return (long)len;
    }
#endif
    else if (rb_enc_asciicompat(enc)) {
        c = 0;
        /* The two loops differ only in the step function: rb_enc_fast_mbclen
         * when the coderange is clean, rb_enc_mbclen otherwise. */
        if (ENC_CODERANGE_CLEAN_P(cr)) {
            while (p < e) {
                if (ISASCII(*p)) {
                    /* Skip the whole ASCII run; each of its bytes is one char. */
                    q = search_nonascii(p, e);
                    if (!q)
                        return c + (e - p);
                    c += q - p;
                    p = q;
                }
                p += rb_enc_fast_mbclen(p, e, enc);
                c++;
            }
        }
        else {
            while (p < e) {
                if (ISASCII(*p)) {
                    q = search_nonascii(p, e);
                    if (!q)
                        return c + (e - p);
                    c += q - p;
                    p = q;
                }
                p += rb_enc_mbclen(p, e, enc);
                c++;
            }
        }
        return c;
    }

    /* Variable-width, non ASCII-compatible encoding: step char by char. */
    for (c=0; p<e; c++) {
        p += rb_enc_mbclen(p, e, enc);
    }
    return c;
}
2134
2135 long
2136 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2137 {
2138     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2139 }
2140
/*
 * Counts the characters in the byte range [p, e) under `enc` and reports the
 * coderange found while scanning through *cr.
 *
 * *cr is an output only: it is reset to 0 on entry, so the value passed in
 * is not used.  For fixed-width encodings the function returns early and
 * leaves *cr at 0.
 */
long
rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
{
    long c;
    const char *q;
    int ret;

    *cr = 0;
    /* Fixed-width encoding: byte length / character width, rounded up. */
    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        long diff = (long)(e - p);
        return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
    }
    else if (rb_enc_asciicompat(enc)) {
        c = 0;
        while (p < e) {
            if (ISASCII(*p)) {
                /* Skip the ASCII run; each of its bytes counts as one char. */
                q = search_nonascii(p, e);
                if (!q) {
                    /* Only ASCII remained: 7BIT unless something else was
                     * already recorded. */
                    if (!*cr) *cr = ENC_CODERANGE_7BIT;
                    return c + (e - p);
                }
                c += q - p;
                p = q;
            }
            ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_CHARFOUND_P(ret)) {
                *cr |= ENC_CODERANGE_VALID;
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else {
                /* Invalid sequence: mark broken and count one byte as a char. */
                *cr = ENC_CODERANGE_BROKEN;
                p++;
            }
            c++;
        }
        if (!*cr) *cr = ENC_CODERANGE_7BIT;
        return c;
    }

    /* Variable-width, non ASCII-compatible encoding. */
    for (c=0; p<e; c++) {
        ret = rb_enc_precise_mbclen(p, e, enc);
        if (MBCLEN_CHARFOUND_P(ret)) {
            *cr |= ENC_CODERANGE_VALID;
            p += MBCLEN_CHARFOUND_LEN(ret);
        }
        else {
            *cr = ENC_CODERANGE_BROKEN;
            /* Skip one minimum-width unit, but never past the end. */
            if (p + rb_enc_mbminlen(enc) <= e)
                p += rb_enc_mbminlen(enc);
            else
                p = e;
        }
    }
    if (!*cr) *cr = ENC_CODERANGE_7BIT;
    return c;
}
2200
2201 /* enc must be str's enc or rb_enc_check(str, str2) */
2202 static long
2203 str_strlen(VALUE str, rb_encoding *enc)
2204 {
2205     const char *p, *e;
2206     int cr;
2207
2208     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2209     if (!enc) enc = STR_ENC_GET(str);
2210     p = RSTRING_PTR(str);
2211     e = RSTRING_END(str);
2212     cr = ENC_CODERANGE(str);
2213
2214     if (cr == ENC_CODERANGE_UNKNOWN) {
2215         long n = rb_enc_strlen_cr(p, e, enc, &cr);
2216         if (cr) ENC_CODERANGE_SET(str, cr);
2217         return n;
2218     }
2219     else {
2220         return enc_strlen(p, e, enc, cr);
2221     }
2222 }
2223
2224 long
2225 rb_str_strlen(VALUE str)
2226 {
2227     return str_strlen(str, NULL);
2228 }
2229
2230 /*
2231  *  call-seq:
2232  *    length -> integer
2233  *
2234  *  :include: doc/string/length.rdoc
2235  *
2236  */
2237
2238 VALUE
2239 rb_str_length(VALUE str)
2240 {
2241     return LONG2NUM(str_strlen(str, NULL));
2242 }
2243
2244 /*
2245  *  call-seq:
2246  *    bytesize -> integer
2247  *
2248  *  :include: doc/string/bytesize.rdoc
2249  *
2250  */
2251
2252 VALUE
2253 rb_str_bytesize(VALUE str)
2254 {
2255     return LONG2NUM(RSTRING_LEN(str));
2256 }
2257
2258 /*
2259  *  call-seq:
2260  *    empty? -> true or false
2261  *
2262  *  Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2263  *
2264  *    "hello".empty? # => false
2265  *    " ".empty? # => false
2266  *    "".empty? # => true
2267  *
2268  */
2269
2270 static VALUE
2271 rb_str_empty(VALUE str)
2272 {
2273     return RBOOL(RSTRING_LEN(str) == 0);
2274 }
2275
2276 /*
2277  *  call-seq:
2278  *    string + other_string -> new_string
2279  *
2280  *  Returns a new +String+ containing +other_string+ concatenated to +self+:
2281  *
2282  *    "Hello from " + self.to_s # => "Hello from main"
2283  *
2284  */
2285
2286 VALUE
2287 rb_str_plus(VALUE str1, VALUE str2)
2288 {
2289     VALUE str3;
2290     rb_encoding *enc;
2291     char *ptr1, *ptr2, *ptr3;
2292     long len1, len2;
2293     int termlen;
2294
2295     StringValue(str2);
2296     enc = rb_enc_check_str(str1, str2);
2297     RSTRING_GETMEM(str1, ptr1, len1);
2298     RSTRING_GETMEM(str2, ptr2, len2);
2299     termlen = rb_enc_mbminlen(enc);
2300     if (len1 > LONG_MAX - len2) {
2301         rb_raise(rb_eArgError, "string size too big");
2302     }
2303     str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2304     ptr3 = RSTRING_PTR(str3);
2305     memcpy(ptr3, ptr1, len1);
2306     memcpy(ptr3+len1, ptr2, len2);
2307     TERM_FILL(&ptr3[len1+len2], termlen);
2308
2309     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2310                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
2311     RB_GC_GUARD(str1);
2312     RB_GC_GUARD(str2);
2313     return str3;
2314 }
2315
2316 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
2317 VALUE
2318 rb_str_opt_plus(VALUE str1, VALUE str2)
2319 {
2320     RUBY_ASSERT(RBASIC_CLASS(str1) == rb_cString);
2321     RUBY_ASSERT(RBASIC_CLASS(str2) == rb_cString);
2322     long len1, len2;
2323     MAYBE_UNUSED(char) *ptr1, *ptr2;
2324     RSTRING_GETMEM(str1, ptr1, len1);
2325     RSTRING_GETMEM(str2, ptr2, len2);
2326     int enc1 = rb_enc_get_index(str1);
2327     int enc2 = rb_enc_get_index(str2);
2328
2329     if (enc1 < 0) {
2330         return Qundef;
2331     }
2332     else if (enc2 < 0) {
2333         return Qundef;
2334     }
2335     else if (enc1 != enc2) {
2336         return Qundef;
2337     }
2338     else if (len1 > LONG_MAX - len2) {
2339         return Qundef;
2340     }
2341     else {
2342         return rb_str_plus(str1, str2);
2343     }
2344
2345 }
2346
2347 /*
2348  *  call-seq:
2349  *    string * integer -> new_string
2350  *
2351  *  Returns a new +String+ containing +integer+ copies of +self+:
2352  *
2353  *    "Ho! " * 3 # => "Ho! Ho! Ho! "
2354  *    "Ho! " * 0 # => ""
2355  *
2356  */
2357
2358 VALUE
2359 rb_str_times(VALUE str, VALUE times)
2360 {
2361     VALUE str2;
2362     long n, len;
2363     char *ptr2;
2364     int termlen;
2365
2366     if (times == INT2FIX(1)) {
2367         return str_duplicate(rb_cString, str);
2368     }
2369     if (times == INT2FIX(0)) {
2370         str2 = str_alloc_embed(rb_cString, 0);
2371         rb_enc_copy(str2, str);
2372         return str2;
2373     }
2374     len = NUM2LONG(times);
2375     if (len < 0) {
2376         rb_raise(rb_eArgError, "negative argument");
2377     }
2378     if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2379         if (STR_EMBEDDABLE_P(len, 1)) {
2380             str2 = str_alloc_embed(rb_cString, len + 1);
2381             memset(RSTRING_PTR(str2), 0, len + 1);
2382         }
2383         else {
2384             str2 = str_alloc_heap(rb_cString);
2385             RSTRING(str2)->as.heap.aux.capa = len;
2386             RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2387         }
2388         STR_SET_LEN(str2, len);
2389         rb_enc_copy(str2, str);
2390         return str2;
2391     }
2392     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
2393         rb_raise(rb_eArgError, "argument too big");
2394     }
2395
2396     len *= RSTRING_LEN(str);
2397     termlen = TERM_LEN(str);
2398     str2 = str_new0(rb_cString, 0, len, termlen);
2399     ptr2 = RSTRING_PTR(str2);
2400     if (len) {
2401         n = RSTRING_LEN(str);
2402         memcpy(ptr2, RSTRING_PTR(str), n);
2403         while (n <= len/2) {
2404             memcpy(ptr2 + n, ptr2, n);
2405             n *= 2;
2406         }
2407         memcpy(ptr2 + n, ptr2, len-n);
2408     }
2409     STR_SET_LEN(str2, len);
2410     TERM_FILL(&ptr2[len], termlen);
2411     rb_enc_cr_str_copy_for_substr(str2, str);
2412
2413     return str2;
2414 }
2415
2416 /*
2417  *  call-seq:
2418  *    string % object -> new_string
2419  *
2420  *  Returns the result of formatting +object+ into the format specification +self+
2421  *  (see Kernel#sprintf for formatting details):
2422  *
2423  *    "%05d" % 123 # => "00123"
2424  *
2425  *  If +self+ contains multiple substitutions, +object+ must be
2426  *  an Array or Hash containing the values to be substituted:
2427  *
2428  *    "%-5s: %016x" % [ "ID", self.object_id ] # => "ID   : 00002b054ec93168"
2429  *    "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2430  *    "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2431  *
2432  */
2433
2434 static VALUE
2435 rb_str_format_m(VALUE str, VALUE arg)
2436 {
2437     VALUE tmp = rb_check_array_type(arg);
2438
2439     if (!NIL_P(tmp)) {
2440         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2441     }
2442     return rb_str_format(1, &arg, str);
2443 }
2444
2445 static inline void
2446 rb_check_lockedtmp(VALUE str)
2447 {
2448     if (FL_TEST(str, STR_TMPLOCK)) {
2449         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2450     }
2451 }
2452
2453 static inline void
2454 str_modifiable(VALUE str)
2455 {
2456     rb_check_lockedtmp(str);
2457     rb_check_frozen(str);
2458 }
2459
2460 static inline int
2461 str_dependent_p(VALUE str)
2462 {
2463     if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2464         return 0;
2465     }
2466     else {
2467         return 1;
2468     }
2469 }
2470
2471 static inline int
2472 str_independent(VALUE str)
2473 {
2474     str_modifiable(str);
2475     return !str_dependent_p(str);
2476 }
2477
/*
 * Gives str its own copy of its first `len` bytes, with room for
 * `len + expand` bytes plus a `termlen`-byte terminator.
 *
 * The copy ends up embedded when str is currently not embedded and the
 * embed capacity is large enough; otherwise a fresh heap buffer is
 * allocated.  On the heap path STR_SHARED and STR_NOFREE are cleared.
 */
static void
str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
{
    char *ptr;
    char *oldptr;
    long capa = len + expand;

    /* Only reachable with a negative expand: truncate to the new capacity. */
    if (len > capa) len = capa;

    if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
        /* Read the heap pointer before STR_SET_EMBED, then copy into the
         * embedded array.  The old buffer is not freed on this path. */
        ptr = RSTRING(str)->as.heap.ptr;
        STR_SET_EMBED(str);
        memcpy(RSTRING(str)->as.embed.ary, ptr, len);
        TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
        STR_SET_LEN(str, len);
        return;
    }

    ptr = ALLOC_N(char, (size_t)capa + termlen);
    oldptr = RSTRING_PTR(str);
    if (oldptr) {
        memcpy(ptr, oldptr, len);
    }
    /* Free the old buffer only when the flags are exactly STR_NOEMBED,
     * i.e. neither STR_SHARED nor STR_NOFREE is set. */
    if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
        xfree(oldptr);
    }
    STR_SET_NOEMBED(str);
    FL_UNSET(str, STR_SHARED|STR_NOFREE);
    TERM_FILL(ptr + len, termlen);
    RSTRING(str)->as.heap.ptr = ptr;
    STR_SET_LEN(str, len);
    RSTRING(str)->as.heap.aux.capa = capa;
}
2511
2512 void
2513 rb_str_modify(VALUE str)
2514 {
2515     if (!str_independent(str))
2516         str_make_independent(str);
2517     ENC_CODERANGE_CLEAR(str);
2518 }
2519
2520 void
2521 rb_str_modify_expand(VALUE str, long expand)
2522 {
2523     int termlen = TERM_LEN(str);
2524     long len = RSTRING_LEN(str);
2525
2526     if (expand < 0) {
2527         rb_raise(rb_eArgError, "negative expanding string size");
2528     }
2529     if (expand >= LONG_MAX - len) {
2530         rb_raise(rb_eArgError, "string size too big");
2531     }
2532
2533     if (!str_independent(str)) {
2534         str_make_independent_expand(str, len, expand, termlen);
2535     }
2536     else if (expand > 0) {
2537         RESIZE_CAPA_TERM(str, len + expand, termlen);
2538     }
2539     ENC_CODERANGE_CLEAR(str);
2540 }
2541
2542 /* As rb_str_modify(), but don't clear coderange */
2543 static void
2544 str_modify_keep_cr(VALUE str)
2545 {
2546     if (!str_independent(str))
2547         str_make_independent(str);
2548     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2549         /* Force re-scan later */
2550         ENC_CODERANGE_CLEAR(str);
2551 }
2552
2553 static inline void
2554 str_discard(VALUE str)
2555 {
2556     str_modifiable(str);
2557     if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2558         ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2559         RSTRING(str)->as.heap.ptr = 0;
2560         STR_SET_LEN(str, 0);
2561     }
2562 }
2563
2564 void
2565 rb_must_asciicompat(VALUE str)
2566 {
2567     rb_encoding *enc = rb_enc_get(str);
2568     if (!enc) {
2569         rb_raise(rb_eTypeError, "not encoding capable object");
2570     }
2571     if (!rb_enc_asciicompat(enc)) {
2572         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2573     }
2574 }
2575
2576 VALUE
2577 rb_string_value(volatile VALUE *ptr)
2578 {
2579     VALUE s = *ptr;
2580     if (!RB_TYPE_P(s, T_STRING)) {
2581         s = rb_str_to_str(s);
2582         *ptr = s;
2583     }
2584     return s;
2585 }
2586
2587 char *
2588 rb_string_value_ptr(volatile VALUE *ptr)
2589 {
2590     VALUE str = rb_string_value(ptr);
2591     return RSTRING_PTR(str);
2592 }
2593
/* Returns 1 when the first n bytes at s are all zero (trivially so for
 * n <= 0, in which case s is not read), 0 otherwise. */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2602
2603 static const char *
2604 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2605 {
2606     const char *e = s + len;
2607
2608     for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2609         if (zero_filled(s, minlen)) return s;
2610     }
2611     return 0;
2612 }
2613
2614 static char *
2615 str_fill_term(VALUE str, char *s, long len, int termlen)
2616 {
2617     /* This function assumes that (capa + termlen) bytes of memory
2618      * is allocated, like many other functions in this file.
2619      */
2620     if (str_dependent_p(str)) {
2621         if (!zero_filled(s + len, termlen))
2622             str_make_independent_expand(str, len, 0L, termlen);
2623     }
2624     else {
2625         TERM_FILL(s + len, termlen);
2626         return s;
2627     }
2628     return RSTRING_PTR(str);
2629 }
2630
/*
 * Adjusts str after its terminator width changes from `oldtermlen` to
 * `termlen` bytes, keeping its length unchanged.
 */
void
rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
{
    /* Capacity plus the old terminator width. */
    long capa = str_capacity(str, oldtermlen) + oldtermlen;
    long len = RSTRING_LEN(str);

    RUBY_ASSERT(capa >= len);
    if (capa - len < termlen) {
        /* Not enough room for the new terminator: replace the buffer
         * (refused with an error when str is temporarily locked). */
        rb_check_lockedtmp(str);
        str_make_independent_expand(str, len, 0L, termlen);
    }
    else if (str_dependent_p(str)) {
        /* Dependent string: take a private copy only if the terminator
         * grows. */
        if (termlen > oldtermlen)
            str_make_independent_expand(str, len, 0L, termlen);
    }
    else {
        if (!STR_EMBED_P(str)) {
            /* modify capa instead of realloc */
            RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
            RSTRING(str)->as.heap.aux.capa = capa - termlen;
        }
        if (termlen > oldtermlen) {
            /* Zero the wider terminator in place. */
            TERM_FILL(RSTRING_PTR(str) + len, termlen);
        }
    }

    return;
}
2659
2660 static char *
2661 str_null_check(VALUE str, int *w)
2662 {
2663     char *s = RSTRING_PTR(str);
2664     long len = RSTRING_LEN(str);
2665     rb_encoding *enc = rb_enc_get(str);
2666     const int minlen = rb_enc_mbminlen(enc);
2667
2668     if (minlen > 1) {
2669         *w = 1;
2670         if (str_null_char(s, len, minlen, enc)) {
2671             return NULL;
2672         }
2673         return str_fill_term(str, s, len, minlen);
2674     }
2675     *w = 0;
2676     if (!s || memchr(s, 0, len)) {
2677         return NULL;
2678     }
2679     if (s[len]) {
2680         s = str_fill_term(str, s, len, minlen);
2681     }
2682     return s;
2683 }
2684
2685 char *
2686 rb_str_to_cstr(VALUE str)
2687 {
2688     int w;
2689     return str_null_check(str, &w);
2690 }
2691
2692 char *
2693 rb_string_value_cstr(volatile VALUE *ptr)
2694 {
2695     VALUE str = rb_string_value(ptr);
2696     int w;
2697     char *s = str_null_check(str, &w);
2698     if (!s) {
2699         if (w) {
2700             rb_raise(rb_eArgError, "string contains null char");
2701         }
2702         rb_raise(rb_eArgError, "string contains null byte");
2703     }
2704     return s;
2705 }
2706
2707 char *
2708 rb_str_fill_terminator(VALUE str, const int newminlen)
2709 {
2710     char *s = RSTRING_PTR(str);
2711     long len = RSTRING_LEN(str);
2712     return str_fill_term(str, s, len, newminlen);
2713 }
2714
2715 VALUE
2716 rb_check_string_type(VALUE str)
2717 {
2718     str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2719     return str;
2720 }
2721
2722 /*
2723  *  call-seq:
2724  *    String.try_convert(object) -> object, new_string, or nil
2725  *
2726  *  If +object+ is a +String+ object, returns +object+.
2727  *
2728  *  Otherwise if +object+ responds to <tt>:to_str</tt>,
2729  *  calls <tt>object.to_str</tt> and returns the result.
2730  *
2731  *  Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2732  *
2733  *  Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2734  */
2735 static VALUE
2736 rb_str_s_try_convert(VALUE dummy, VALUE str)
2737 {
2738     return rb_check_string_type(str);
2739 }
2740
/*
 * Advances from p by *nthp characters within [p, e) under `enc` and returns
 * the resulting position, never beyond e.  On return *nthp holds the value
 * of the local counter at exit; callers in this file treat a positive value
 * as "ran out of characters".
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        /* One byte per character. */
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width characters. */
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* e2: where p would land if every remaining char were one byte. */
            e2 = p + nth;
            if (e < e2) {
                /* More characters requested than bytes remain. */
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                /* Skip an ASCII run in one step. */
                p2 = search_nonascii(p, e2);
                if (!p2) {
                    nth -= e2 - p;
                    *nthp = nth;
                    return (char *)e2;
                }
                nth -= p2 - p;
                p = p2;
            }
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        /* Variable-width, non ASCII-compatible: step char by char. */
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    /* Shared exit for the non ASCII-compatible branches: clamp to e. */
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
2790
2791 char*
2792 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2793 {
2794     return str_nth_len(p, e, &nth, enc);
2795 }
2796
2797 static char*
2798 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2799 {
2800     if (singlebyte)
2801         p += nth;
2802     else {
2803         p = str_nth_len(p, e, &nth, enc);
2804     }
2805     if (!p) return 0;
2806     if (p > e) p = e;
2807     return (char *)p;
2808 }
2809
2810 /* char offset to byte offset */
2811 static long
2812 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2813 {
2814     const char *pp = str_nth(p, e, nth, enc, singlebyte);
2815     if (!pp) return e - p;
2816     return pp - p;
2817 }
2818
2819 long
2820 rb_str_offset(VALUE str, long pos)
2821 {
2822     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2823                       STR_ENC_GET(str), single_byte_optimizable(str));
2824 }
2825
2826 #ifdef NONASCII_MASK
/*
 * Advances from p by *nthp UTF-8 characters within [p, e) by counting lead
 * bytes.  Returns the position reached and stores the number of characters
 * still outstanding in *nthp.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    /* Word-at-a-time path, taken only when both the range and the count are
     * more than two words' worth. */
    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
        const uintptr_t *s, *t;
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* s: p rounded up to a word boundary; t: e rounded down to one. */
        s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
        /* Unaligned head, byte by byte. */
        while (p < (const char *)s) {
            if (is_utf8_lead_byte(*p)) nth--;
            p++;
        }
        /* Whole words, for as long as at least a word's worth of characters
         * is still outstanding, so one word cannot overshoot the target. */
        do {
            nth -= count_utf8_lead_bytes_with_word(s);
            s++;
        } while (s < t && (int)SIZEOF_VOIDP <= nth);
        p = (char *)s;
    }
    /* Finish byte by byte, stopping on the lead byte of the target char. */
    while (p < e) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
        p++;
    }
    *nthp = nth;
    return (char *)p;
}
2856
/* Byte distance from p to the position nth UTF-8 characters later,
 * as located by str_utf8_nth within [p, e). */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
2863 #endif
2864
2865 /* byte offset to char offset */
2866 long
2867 rb_str_sublen(VALUE str, long pos)
2868 {
2869     if (single_byte_optimizable(str) || pos < 0)
2870         return pos;
2871     else {
2872         char *p = RSTRING_PTR(str);
2873         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2874     }
2875 }
2876
/*
 * Returns a new String holding the `len` bytes of str starting at byte
 * offset `beg`.  The caller must pass a range inside str (asserted).
 * Encoding and coderange are not set here; rb_str_subseq and str_substr
 * call rb_enc_cr_str_copy_for_substr on the result afterwards.
 */
static VALUE
str_subseq(VALUE str, long beg, long len)
{
    VALUE str2;

    RUBY_ASSERT(beg >= 0);
    RUBY_ASSERT(len >= 0);
    RUBY_ASSERT(beg+len <= RSTRING_LEN(str));

    const int termlen = TERM_LEN(str);
    /* Not eligible for sharing: make a plain copy of the bytes. */
    if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
        str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
        RB_GC_GUARD(str);
        return str2;
    }

    str2 = str_alloc_heap(rb_cString);
    if (str_embed_capa(str2) >= len + termlen) {
        /* Fits in the embedded area: copy there instead of sharing. */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str) + beg, len);
        TERM_FILL(ptr2+len, termlen);

        STR_SET_LEN(str2, len);
        RB_GC_GUARD(str);
    }
    else {
        /* Share str's buffer, then advance the pointer by beg and shorten
         * the length to len. */
        str_replace_shared(str2, str);
        RUBY_ASSERT(!STR_EMBED_P(str2));
        ENC_CODERANGE_CLEAR(str2);
        RSTRING(str2)->as.heap.ptr += beg;
        if (RSTRING_LEN(str2) > len) {
            STR_SET_LEN(str2, len);
        }
    }

    return str2;
}
2915
2916 VALUE
2917 rb_str_subseq(VALUE str, long beg, long len)
2918 {
2919     VALUE str2 = str_subseq(str, beg, len);
2920     rb_enc_cr_str_copy_for_substr(str2, str);
2921     return str2;
2922 }
2923
2924 char *
2925 rb_str_subpos(VALUE str, long beg, long *lenp)
2926 {
2927     long len = *lenp;
2928     long slen = -1L;
2929     long blen = RSTRING_LEN(str);
2930     rb_encoding *enc = STR_ENC_GET(str);
2931     char *p, *s = RSTRING_PTR(str), *e = s + blen;
2932
2933     if (len < 0) return 0;
2934     if (!blen) {
2935         len = 0;
2936     }
2937     if (single_byte_optimizable(str)) {
2938         if (beg > blen) return 0;
2939         if (beg < 0) {
2940             beg += blen;
2941             if (beg < 0) return 0;
2942         }
2943         if (len > blen - beg)
2944             len = blen - beg;
2945         if (len < 0) return 0;
2946         p = s + beg;
2947         goto end;
2948     }
2949     if (beg < 0) {
2950         if (len > -beg) len = -beg;
2951         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2952             beg = -beg;
2953             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2954             p = e;
2955             if (!p) return 0;
2956             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2957             if (!p) return 0;
2958             len = e - p;
2959             goto end;
2960         }
2961         else {
2962             slen = str_strlen(str, enc);
2963             beg += slen;
2964             if (beg < 0) return 0;
2965             p = s + beg;
2966             if (len == 0) goto end;
2967         }
2968     }
2969     else if (beg > 0 && beg > RSTRING_LEN(str)) {
2970         return 0;
2971     }
2972     if (len == 0) {
2973         if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2974         p = s + beg;
2975     }
2976 #ifdef NONASCII_MASK
2977     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2978         enc == rb_utf8_encoding()) {
2979         p = str_utf8_nth(s, e, &beg);
2980         if (beg > 0) return 0;
2981         len = str_utf8_offset(p, e, len);
2982     }
2983 #endif
2984     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2985         int char_sz = rb_enc_mbmaxlen(enc);
2986
2987         p = s + beg * char_sz;
2988         if (p > e) {
2989             return 0;
2990         }
2991         else if (len * char_sz > e - p)
2992             len = e - p;
2993         else
2994             len *= char_sz;
2995     }
2996     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2997         if (beg > 0) return 0;
2998         len = 0;
2999     }
3000     else {
3001         len = str_offset(p, e, len, enc, 0);
3002     }
3003   end:
3004     *lenp = len;
3005     RB_GC_GUARD(str);
3006     return p;
3007 }
3008
3009 static VALUE str_substr(VALUE str, long beg, long len, int empty);
3010
3011 VALUE
3012 rb_str_substr(VALUE str, long beg, long len)
3013 {
3014     return str_substr(str, beg, len, TRUE);
3015 }
3016
3017 static VALUE
3018 str_substr(VALUE str, long beg, long len, int empty)
3019 {
3020     char *p = rb_str_subpos(str, beg, &len);
3021
3022     if (!p) return Qnil;
3023     if (!len && !empty) return Qnil;
3024
3025     beg = p - RSTRING_PTR(str);
3026
3027     VALUE str2 = str_subseq(str, beg, len);
3028     rb_enc_cr_str_copy_for_substr(str2, str);
3029     return str2;
3030 }
3031
3032 /* :nodoc: */
3033 VALUE
3034 rb_str_freeze(VALUE str)
3035 {
3036     if (CHILLED_STRING_P(str)) {
3037         FL_UNSET_RAW(str, STR_CHILLED);
3038     }
3039
3040     if (OBJ_FROZEN(str)) return str;
3041     rb_str_resize(str, RSTRING_LEN(str));
3042     return rb_obj_freeze(str);
3043 }
3044
3045 /*
3046  * call-seq:
3047  *   +string -> new_string or self
3048  *
3049  * Returns +self+ if +self+ is not frozen.
3050  *
3051  * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3052  */
3053 static VALUE
3054 str_uplus(VALUE str)
3055 {
3056     if (OBJ_FROZEN(str)) {
3057         return rb_str_dup(str);
3058     }
3059     else {
3060         return str;
3061     }
3062 }
3063
3064 /*
3065  * call-seq:
3066  *   -string -> frozen_string
3067  *   dedup -> frozen_string
3068  *
3069  * Returns a frozen, possibly pre-existing copy of the string.
3070  *
3071  * The returned +String+ will be deduplicated as long as it does not have
3072  * any instance variables set on it and is not a String subclass.
3073  *
3074  * Note that <tt>-string</tt> variant is more convenient for defining
3075  * constants:
3076  *
3077  *    FILENAME = -'config/database.yml'
3078  *
 * while +dedup+ is better suited to use in chains
 * of method calls:
3081  *
3082  *    @url_list.concat(urls.map(&:dedup))
3083  *
3084  */
3085 static VALUE
3086 str_uminus(VALUE str)
3087 {
3088     if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3089         str = rb_str_dup(str);
3090     }
3091     return rb_fstring(str);
3092 }
3093
/* NOTE(review): RUBY_ALIAS_FUNCTION presumably emits rb_str_dup_frozen as a
 * real function forwarding to rb_str_new_frozen -- confirm against the macro
 * definition.  The #define below then makes every later use of
 * rb_str_dup_frozen in this file expand to rb_str_new_frozen. */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
3096
3097 VALUE
3098 rb_str_locktmp(VALUE str)
3099 {
3100     if (FL_TEST(str, STR_TMPLOCK)) {
3101         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3102     }
3103     FL_SET(str, STR_TMPLOCK);
3104     return str;
3105 }
3106
3107 VALUE
3108 rb_str_unlocktmp(VALUE str)
3109 {
3110     if (!FL_TEST(str, STR_TMPLOCK)) {
3111         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3112     }
3113     FL_UNSET(str, STR_TMPLOCK);
3114     return str;
3115 }
3116
3117 VALUE
3118 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3119 {
3120     rb_str_locktmp(str);
3121     return rb_ensure(func, arg, rb_str_unlocktmp, str);
3122 }
3123
3124 void
3125 rb_str_set_len(VALUE str, long len)
3126 {
3127     long capa;
3128     const int termlen = TERM_LEN(str);
3129
3130     str_modifiable(str);
3131     if (STR_SHARED_P(str)) {
3132         rb_raise(rb_eRuntimeError, "can't set length of shared string");
3133     }
3134     if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3135         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3136     }
3137
3138     int cr = ENC_CODERANGE(str);
3139     if (cr == ENC_CODERANGE_UNKNOWN) {
3140         /* Leave unknown. */
3141     }
3142     else if (len > RSTRING_LEN(str)) {
3143         if (ENC_CODERANGE_CLEAN_P(cr)) {
3144             /* Update the coderange regarding the extended part. */
3145             const char *const prev_end = RSTRING_END(str);
3146             const char *const new_end = RSTRING_PTR(str) + len;
3147             rb_encoding *enc = rb_enc_get(str);
3148             rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3149             ENC_CODERANGE_SET(str, cr);
3150         }
3151         else if (cr == ENC_CODERANGE_BROKEN) {
3152             /* May be valid now, by appended part. */
3153             ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
3154         }
3155     }
3156     else if (len < RSTRING_LEN(str)) {
3157         if (cr != ENC_CODERANGE_7BIT) {
3158             /* ASCII-only string is keeping after truncated.  Valid
3159              * and broken may be invalid or valid, leave unknown. */
3160             ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
3161         }
3162     }
3163
3164     STR_SET_LEN(str, len);
3165     TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3166 }
3167
/*
 * Changes the byte length of +str+ to +len+ and returns +str+.
 *
 * Raises ArgumentError when +len+ is negative.  Depending on how the
 * string is currently stored it is updated in place, switched from the
 * heap to the embedded representation, made independent via
 * str_make_independent_expand(), or has its heap buffer reallocated.
 */
VALUE
rb_str_resize(VALUE str, long len)
{
    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }

    /* NOTE(review): str_independent() is defined elsewhere; presumably it
     * also rejects unmodifiable strings -- confirm. */
    int independent = str_independent(str);
    long slen = RSTRING_LEN(str);

    /* Shrinking anything but a 7bit string drops the cached coderange. */
    if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
        ENC_CODERANGE_CLEAR(str);
    }

    {
        long capa;
        const int termlen = TERM_LEN(str);
        if (STR_EMBED_P(str)) {
            /* Embedded string: unchanged length needs no work. */
            if (len == slen) return str;
            /* New length plus terminator still fits in the embedded area. */
            if (str_embed_capa(str) >= len + termlen) {
                STR_SET_LEN(str, len);
                TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
                return str;
            }
            /* Too big for the embedded area; falls through to the common
             * tail below, which sets the length and terminator. */
            str_make_independent_expand(str, slen, len - slen, termlen);
        }
        else if (str_embed_capa(str) >= len + termlen) {
            /* Heap string that now fits in the embedded area: remember the
             * heap pointer before switching representation, copy at most
             * min(slen, len) bytes across, then terminate. */
            char *ptr = STR_HEAP_PTR(str);
            STR_SET_EMBED(str);
            if (slen > len) slen = len;
            if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
            TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
            STR_SET_LEN(str, len);
            /* The old buffer is freed only when str_independent() said so. */
            if (independent) ruby_xfree(ptr);
            return str;
        }
        else if (!independent) {
            if (len == slen) return str;
            str_make_independent_expand(str, slen, len - slen, termlen);
        }
        /* Reallocate when the buffer is too small, or when the unused
         * space (capa - len) exceeds min(len, 1024). */
        else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
                 (capa - len) > (len < 1024 ? len : 1024)) {
            SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
                            (size_t)len + termlen, STR_HEAP_SIZE(str));
            RSTRING(str)->as.heap.aux.capa = len;
        }
        else if (len == slen) return str;
        /* Common tail for every path that did not return above. */
        STR_SET_LEN(str, len);
        TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
    }
    return str;
}
3220
3221 static VALUE
3222 str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3223 {
3224     if (keep_cr) {
3225         str_modify_keep_cr(str);
3226     }
3227     else {
3228         rb_str_modify(str);
3229     }
3230     if (len == 0) return 0;
3231
3232     long total, olen, off = -1;
3233     char *sptr;
3234     const int termlen = TERM_LEN(str);
3235
3236     RSTRING_GETMEM(str, sptr, olen);
3237     if (ptr >= sptr && ptr <= sptr + olen) {
3238         off = ptr - sptr;
3239     }
3240
3241     long capa = str_capacity(str, termlen);
3242
3243     if (olen > LONG_MAX - len) {
3244         rb_raise(rb_eArgError, "string sizes too big");
3245     }
3246     total = olen + len;
3247     if (capa < total) {
3248         if (total >= LONG_MAX / 2) {
3249             capa = total;
3250         }
3251         while (total > capa) {
3252             capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3253         }
3254         RESIZE_CAPA_TERM(str, capa, termlen);
3255         sptr = RSTRING_PTR(str);
3256     }
3257     if (off != -1) {
3258         ptr = sptr + off;
3259     }
3260     memcpy(sptr + olen, ptr, len);
3261     STR_SET_LEN(str, total);
3262     TERM_FILL(sptr + total, termlen); /* sentinel */
3263
3264     return str;
3265 }
3266
/* Shorthands for str_buf_cat4() that always pass keep_cr = false.
 * str_buf_cat2 derives the length from +ptr+ with rb_strlen_lit(), so it is
 * presumably meant for string literals only.
 * Every macro argument is parenthesized in the expansion so that an
 * expression passed as +len+ cannot be split by the surrounding commas. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3269
3270 VALUE
3271 rb_str_cat(VALUE str, const char *ptr, long len)
3272 {
3273     if (len == 0) return str;
3274     if (len < 0) {
3275         rb_raise(rb_eArgError, "negative string size (or size too big)");
3276     }
3277     return str_buf_cat(str, ptr, len);
3278 }
3279
3280 VALUE
3281 rb_str_cat_cstr(VALUE str, const char *ptr)
3282 {
3283     must_not_null(ptr);
3284     return rb_str_buf_cat(str, ptr, strlen(ptr));
3285 }
3286
/* Additional names for the append functions, each declared through
 * RUBY_ALIAS_FUNCTION with the same argument list as its target:
 *   rb_str_buf_cat                -> rb_str_cat
 *   rb_str_buf_cat2, rb_str_cat2  -> rb_str_cat_cstr */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3290
/*
 * Appends +len+ bytes at +ptr+ to +str+, where the bytes are described by
 * the encoding index +ptr_encindex+ and the coderange +ptr_cr+, then sets
 * the encoding index and coderange of +str+ to the values chosen below.
 *
 * When +ptr_cr_ret+ is non-NULL the coderange determined for the appended
 * bytes is stored through it.  It is not written on the two early returns
 * taken when one of the encodings is not ASCII-compatible.
 *
 * Raises rb_eEncCompatError when the encodings cannot be combined and
 * ArgumentError when +len+ is negative.  Returns +str+.
 */
static VALUE
rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
    int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
{
    int str_encindex = ENCODING_GET(str);
    int res_encindex;
    int str_cr, res_cr;
    rb_encoding *str_enc, *ptr_enc;

    /* An empty receiver is treated as 7bit whatever its cached coderange. */
    str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;

    if (str_encindex == ptr_encindex) {
        /* Same encoding: scan the new bytes only when the receiver's
         * coderange is known and theirs is not. */
        if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
        }
    }
    else {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
            /* At least one side is not ASCII-compatible: appending nothing
             * is a no-op, an empty receiver simply adopts the encoding and
             * coderange of the new bytes, anything else is an error. */
            if (len == 0)
                return str;
            if (RSTRING_LEN(str) == 0) {
                rb_str_buf_cat(str, ptr, len);
                ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
                rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
                return str;
            }
            goto incompatible;
        }
        if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
            ptr_cr = coderange_scan(ptr, len, ptr_enc);
        }
        if (str_cr == ENC_CODERANGE_UNKNOWN) {
            /* The receiver's coderange is resolved only when the receiver
             * is ASCII-8BIT or the new bytes are not 7bit. */
            if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
                str_cr = rb_enc_str_coderange(str);
            }
        }
    }
    if (ptr_cr_ret)
        *ptr_cr_ret = ptr_cr;

    /* Different encodings can only be combined when at least one side is
     * 7bit.  str_enc/ptr_enc are assigned here because the label below
     * reads them. */
    if (str_encindex != ptr_encindex &&
        str_cr != ENC_CODERANGE_7BIT &&
        ptr_cr != ENC_CODERANGE_7BIT) {
        str_enc = rb_enc_from_index(str_encindex);
        ptr_enc = rb_enc_from_index(ptr_encindex);
        goto incompatible;
    }

    /* Choose the encoding index and coderange of the result. */
    if (str_cr == ENC_CODERANGE_UNKNOWN) {
        res_encindex = str_encindex;
        res_cr = ENC_CODERANGE_UNKNOWN;
    }
    else if (str_cr == ENC_CODERANGE_7BIT) {
        if (ptr_cr == ENC_CODERANGE_7BIT) {
            res_encindex = str_encindex;
            res_cr = ENC_CODERANGE_7BIT;
        }
        else {
            /* A 7bit receiver takes over the encoding of the new bytes. */
            res_encindex = ptr_encindex;
            res_cr = ptr_cr;
        }
    }
    else if (str_cr == ENC_CODERANGE_VALID) {
        res_encindex = str_encindex;
        if (ENC_CODERANGE_CLEAN_P(ptr_cr))
            res_cr = str_cr;
        else
            res_cr = ptr_cr;
    }
    else { /* str_cr == ENC_CODERANGE_BROKEN */
        res_encindex = str_encindex;
        res_cr = str_cr;
        /* Appending bytes may repair the string, so forget the verdict. */
        if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
    }

    /* NOTE(review): this check runs after +len+ may already have been
     * passed to coderange_scan() above -- confirm that is intended. */
    if (len < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    str_buf_cat(str, ptr, len);
    ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
    return str;

  incompatible:
    rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
             rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
    UNREACHABLE_RETURN(Qundef);
}
3380
3381 VALUE
3382 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3383 {
3384     return rb_enc_cr_str_buf_cat(str, ptr, len,
3385         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3386 }
3387
3388 VALUE
3389 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3390 {
3391     /* ptr must reference NUL terminated ASCII string. */
3392     int encindex = ENCODING_GET(str);
3393     rb_encoding *enc = rb_enc_from_index(encindex);
3394     if (rb_enc_asciicompat(enc)) {
3395         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3396             encindex, ENC_CODERANGE_7BIT, 0);
3397     }
3398     else {
3399         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3400         while (*ptr) {
3401             unsigned int c = (unsigned char)*ptr;
3402             int len = rb_enc_codelen(c, enc);
3403             rb_enc_mbcput(c, buf, enc);
3404             rb_enc_cr_str_buf_cat(str, buf, len,
3405                 encindex, ENC_CODERANGE_VALID, 0);
3406             ptr++;
3407         }
3408         return str;
3409     }
3410 }
3411
3412 VALUE
3413 rb_str_buf_append(VALUE str, VALUE str2)
3414 {
3415     int str2_cr = rb_enc_str_coderange(str2);
3416
3417     if (str_enc_fastpath(str)) {
3418         switch (str2_cr) {
3419           case ENC_CODERANGE_7BIT:
3420             // If RHS is 7bit we can do simple concatenation
3421             str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3422             RB_GC_GUARD(str2);
3423             return str;
3424           case ENC_CODERANGE_VALID:
3425             // If RHS is valid, we can do simple concatenation if encodings are the same
3426             if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3427                 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3428                 int str_cr = ENC_CODERANGE(str);
3429                 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3430                     ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3431                 }
3432                 RB_GC_GUARD(str2);
3433                 return str;
3434             }
3435         }
3436     }
3437
3438     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3439         ENCODING_GET(str2), str2_cr, &str2_cr);
3440
3441     ENC_CODERANGE_SET(str2, str2_cr);
3442
3443     return str;
3444 }
3445
3446 VALUE
3447 rb_str_append(VALUE str, VALUE str2)
3448 {
3449     StringValue(str2);
3450     return rb_str_buf_append(str, str2);
3451 }
3452
3453 VALUE
3454 rb_str_concat_literals(size_t num, const VALUE *strary)
3455 {
3456     VALUE str;
3457     size_t i, s = 0;
3458     unsigned long len = 1;
3459
3460     if (UNLIKELY(!num)) return rb_str_new(0, 0);
3461     if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3462
3463     for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3464     str = rb_str_buf_new(len);
3465     str_enc_copy_direct(str, strary[0]);
3466
3467     for (i = s; i < num; ++i) {
3468         const VALUE v = strary[i];
3469         int encidx = ENCODING_GET(v);
3470
3471         rb_str_buf_append(str, v);
3472         if (encidx != ENCINDEX_US_ASCII) {
3473             if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3474                 rb_enc_set_index(str, encidx);
3475         }
3476     }
3477     return str;
3478 }
3479
3480 /*
3481  *  call-seq:
3482  *     concat(*objects) -> string
3483  *
3484  *  Concatenates each object in +objects+ to +self+ and returns +self+:
3485  *
3486  *    s = 'foo'
3487  *    s.concat('bar', 'baz') # => "foobarbaz"
3488  *    s                      # => "foobarbaz"
3489  *
3490  *  For each given object +object+ that is an Integer,
3491  *  the value is considered a codepoint and converted to a character before concatenation:
3492  *
3493  *    s = 'foo'
3494  *    s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3495  *
3496  *  Related: String#<<, which takes a single argument.
3497  */
3498 static VALUE
3499 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3500 {
3501     str_modifiable(str);
3502
3503     if (argc == 1) {
3504         return rb_str_concat(str, argv[0]);
3505     }
3506     else if (argc > 1) {
3507         int i;
3508         VALUE arg_str = rb_str_tmp_new(0);
3509         rb_enc_copy(arg_str, str);
3510         for (i = 0; i < argc; i++) {
3511             rb_str_concat(arg_str, argv[i]);
3512         }
3513         rb_str_buf_append(str, arg_str);
3514     }
3515
3516     return str;
3517 }
3518
3519 /*
3520  *  call-seq:
3521  *    string << object -> string
3522  *
3523  *  Concatenates +object+ to +self+ and returns +self+:
3524  *
3525  *    s = 'foo'
3526  *    s << 'bar' # => "foobar"
3527  *    s          # => "foobar"
3528  *
3529  *  If +object+ is an Integer,
3530  *  the value is considered a codepoint and converted to a character before concatenation:
3531  *
3532  *    s = 'foo'
3533  *    s << 33 # => "foo!"
3534  *
3535  *  Related: String#concat, which takes multiple arguments.
3536  */
3537 VALUE
3538 rb_str_concat(VALUE str1, VALUE str2)
3539 {
3540     unsigned int code;
3541     rb_encoding *enc = STR_ENC_GET(str1);
3542     int encidx;
3543
3544     if (RB_INTEGER_TYPE_P(str2)) {
3545         if (rb_num_to_uint(str2, &code) == 0) {
3546         }
3547         else if (FIXNUM_P(str2)) {
3548             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3549         }
3550         else {
3551             rb_raise(rb_eRangeError, "bignum out of char range");
3552         }
3553     }
3554     else {
3555         return rb_str_append(str1, str2);
3556     }
3557
3558     encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3559     if (encidx >= 0) {
3560         char buf[1];
3561         buf[0] = (char)code;
3562         rb_str_cat(str1, buf, 1);
3563         if (encidx != rb_enc_to_index(enc)) {
3564             rb_enc_associate_index(str1, encidx);
3565             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
3566         }
3567     }
3568     else {
3569         long pos = RSTRING_LEN(str1);
3570         int cr = ENC_CODERANGE(str1);
3571         int len;
3572         char *buf;
3573
3574         switch (len = rb_enc_codelen(code, enc)) {
3575           case ONIGERR_INVALID_CODE_POINT_VALUE:
3576             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3577             break;
3578           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3579           case 0:
3580             rb_raise(rb_eRangeError, "%u out of char range", code);
3581             break;
3582         }
3583         buf = ALLOCA_N(char, len + 1);
3584         rb_enc_mbcput(code, buf, enc);
3585         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3586             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3587         }
3588         rb_str_resize(str1, pos+len);
3589         memcpy(RSTRING_PTR(str1) + pos, buf, len);
3590         if (cr == ENC_CODERANGE_7BIT && code > 127) {
3591             cr = ENC_CODERANGE_VALID;
3592         }
3593         else if (cr == ENC_CODERANGE_BROKEN) {
3594             cr = ENC_CODERANGE_UNKNOWN;
3595         }
3596         ENC_CODERANGE_SET(str1, cr);
3597     }
3598     return str1;
3599 }
3600
3601 int
3602 rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3603 {
3604     int encidx = rb_enc_to_index(enc);
3605
3606     if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3607         /* US-ASCII automatically extended to ASCII-8BIT */
3608         if (code > 0xFF) {
3609             rb_raise(rb_eRangeError, "%u out of char range", code);
3610         }
3611         if (encidx == ENCINDEX_US_ASCII && code > 127) {
3612             return ENCINDEX_ASCII_8BIT;
3613         }
3614         return encidx;
3615     }
3616     else {
3617         return -1;
3618     }
3619 }
3620
3621 /*
3622  *  call-seq:
3623  *    prepend(*other_strings)  -> string
3624  *
3625  *  Prepends each string in +other_strings+ to +self+ and returns +self+:
3626  *
3627  *    s = 'foo'
3628  *    s.prepend('bar', 'baz') # => "barbazfoo"
3629  *    s                       # => "barbazfoo"
3630  *
3631  *  Related: String#concat.
3632  */
3633
3634 static VALUE
3635 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3636 {
3637     str_modifiable(str);
3638
3639     if (argc == 1) {
3640         rb_str_update(str, 0L, 0L, argv[0]);
3641     }
3642     else if (argc > 1) {
3643         int i;
3644         VALUE arg_str = rb_str_tmp_new(0);
3645         rb_enc_copy(arg_str, str);
3646         for (i = 0; i < argc; i++) {
3647             rb_str_append(arg_str, argv[i]);
3648         }
3649         rb_str_update(str, 0L, 0L, arg_str);
3650     }
3651
3652     return str;
3653 }
3654
3655 st_index_t
3656 rb_str_hash(VALUE str)
3657 {
3658     st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
3659     int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
3660     if (e && !is_ascii_string(str)) {
3661         h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
3662     }
3663     return h;
3664 }
3665
3666 int
3667 rb_str_hash_cmp(VALUE str1, VALUE str2)
3668 {
3669     long len1, len2;
3670     const char *ptr1, *ptr2;
3671     RSTRING_GETMEM(str1, ptr1, len1);
3672     RSTRING_GETMEM(str2, ptr2, len2);
3673     return (len1 != len2 ||
3674             !rb_str_comparable(str1, str2) ||
3675             memcmp(ptr1, ptr2, len1) != 0);
3676 }
3677
3678 /*
3679  * call-seq:
3680  *   hash -> integer
3681  *
3682  * Returns the integer hash value for +self+.
3683  * The value is based on the length, content and encoding of +self+.
3684  *
3685  * Related: Object#hash.
3686  */
3687
3688 static VALUE
3689 rb_str_hash_m(VALUE str)
3690 {
3691     st_index_t hval = rb_str_hash(str);
3692     return ST2FIX(hval);
3693 }
3694
/* Yields the smaller of the two arguments (the first one on a tie).
 * Each argument may be evaluated twice. */
#define lesser(a, b) (((b) < (a)) ? (b) : (a))
3696
3697 int
3698 rb_str_comparable(VALUE str1, VALUE str2)
3699 {
3700     int idx1, idx2;
3701     int rc1, rc2;
3702
3703     if (RSTRING_LEN(str1) == 0) return TRUE;
3704     if (RSTRING_LEN(str2) == 0) return TRUE;
3705     idx1 = ENCODING_GET(str1);
3706     idx2 = ENCODING_GET(str2);
3707     if (idx1 == idx2) return TRUE;
3708     rc1 = rb_enc_str_coderange(str1);
3709     rc2 = rb_enc_str_coderange(str2);
3710     if (rc1 == ENC_CODERANGE_7BIT) {
3711         if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3712         if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3713             return TRUE;
3714     }
3715     if (rc2 == ENC_CODERANGE_7BIT) {
3716         if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3717             return TRUE;
3718     }
3719     return FALSE;
3720 }
3721
3722 int
3723 rb_str_cmp(VALUE str1, VALUE str2)
3724 {
3725     long len1, len2;
3726     const char *ptr1, *ptr2;
3727     int retval;
3728
3729     if (str1 == str2) return 0;
3730     RSTRING_GETMEM(str1, ptr1, len1);
3731     RSTRING_GETMEM(str2, ptr2, len2);
3732     if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3733         if (len1 == len2) {
3734             if (!rb_str_comparable(str1, str2)) {
3735                 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3736                     return 1;
3737                 return -1;
3738             }
3739             return 0;
3740         }
3741         if (len1 > len2) return 1;
3742         return -1;
3743     }
3744     if (retval > 0) return 1;
3745     return -1;
3746 }
3747
3748 /*
3749  *  call-seq:
3750  *    string == object -> true or false
3751  *    string === object -> true or false
3752  *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3755  *
3756  *    s = 'foo'
3757  *    s == 'foo' # => true
3758  *    s == 'food' # => false
3759  *    s == 'FOO' # => false
3760  *
 *  Returns +false+ if the two strings' encodings are not compatible:
 *
 *    "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3763  *
3764  *  If +object+ is not an instance of +String+ but responds to +to_str+, then the
3765  *  two strings are compared using <code>object.==</code>.
3766  */
3767
3768 VALUE
3769 rb_str_equal(VALUE str1, VALUE str2)
3770 {
3771     if (str1 == str2) return Qtrue;
3772     if (!RB_TYPE_P(str2, T_STRING)) {
3773         if (!rb_respond_to(str2, idTo_str)) {
3774             return Qfalse;
3775         }
3776         return rb_equal(str2, str1);
3777     }
3778     return rb_str_eql_internal(str1, str2);
3779 }
3780
3781 /*
3782  * call-seq:
3783  *   eql?(object) -> true or false
3784  *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3787  *
3788  *    s = 'foo'
3789  *    s.eql?('foo') # => true
3790  *    s.eql?('food') # => false
3791  *    s.eql?('FOO') # => false
3792  *
3793  *  Returns +false+ if the two strings' encodings are not compatible:
3794  *
3795  *    "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3796  *
3797  */
3798
3799 VALUE
3800 rb_str_eql(VALUE str1, VALUE str2)
3801 {
3802     if (str1 == str2) return Qtrue;
3803     if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3804     return rb_str_eql_internal(str1, str2);
3805 }
3806
3807 /*
3808  *  call-seq:
3809  *    string <=> other_string -> -1, 0, 1, or nil
3810  *
3811  *  Compares +self+ and +other_string+, returning:
3812  *
3813  *  - -1 if +other_string+ is larger.
3814  *  - 0 if the two are equal.
3815  *  - 1 if +other_string+ is smaller.
3816  *  - +nil+ if the two are incomparable.
3817  *
3818  *  Examples:
3819  *
3820  *    'foo' <=> 'foo' # => 0
3821  *    'foo' <=> 'food' # => -1
3822  *    'food' <=> 'foo' # => 1
3823  *    'FOO' <=> 'foo' # => -1
3824  *    'foo' <=> 'FOO' # => 1
3825  *    'foo' <=> 1 # => nil
3826  *
3827  */
3828
3829 static VALUE
3830 rb_str_cmp_m(VALUE str1, VALUE str2)
3831 {
3832     int result;
3833     VALUE s = rb_check_string_type(str2);
3834     if (NIL_P(s)) {
3835         return rb_invcmp(str1, str2);
3836     }
3837     result = rb_str_cmp(str1, s);
3838     return INT2FIX(result);
3839 }
3840
/* Forward declarations: both helpers are defined further down in this
 * file, after the method entry points that call them. */
static VALUE str_casecmp(VALUE str1, VALUE str2);
static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3843
3844 /*
3845  *  call-seq:
3846  *    casecmp(other_string) -> -1, 0, 1, or nil
3847  *
3848  *  Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3849  *
3850  *  - -1 if <tt>other_string.downcase</tt> is larger.
3851  *  - 0 if the two are equal.
3852  *  - 1 if <tt>other_string.downcase</tt> is smaller.
3853  *  - +nil+ if the two are incomparable.
3854  *
3855  *  Examples:
3856  *
3857  *    'foo'.casecmp('foo') # => 0
3858  *    'foo'.casecmp('food') # => -1
3859  *    'food'.casecmp('foo') # => 1
3860  *    'FOO'.casecmp('foo') # => 0
3861  *    'foo'.casecmp('FOO') # => 0
3862  *    'foo'.casecmp(1) # => nil
3863  *
3864  *  See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3865  *
3866  *  Related: String#casecmp?.
3867  *
3868  */
3869
3870 static VALUE
3871 rb_str_casecmp(VALUE str1, VALUE str2)
3872 {
3873     VALUE s = rb_check_string_type(str2);
3874     if (NIL_P(s)) {
3875         return Qnil;
3876     }
3877     return str_casecmp(str1, s);
3878 }
3879
3880 static VALUE
3881 str_casecmp(VALUE str1, VALUE str2)
3882 {
3883     long len;
3884     rb_encoding *enc;
3885     const char *p1, *p1end, *p2, *p2end;
3886
3887     enc = rb_enc_compatible(str1, str2);
3888     if (!enc) {
3889         return Qnil;
3890     }
3891
3892     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3893     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3894     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3895         while (p1 < p1end && p2 < p2end) {
3896             if (*p1 != *p2) {
3897                 unsigned int c1 = TOLOWER(*p1 & 0xff);
3898                 unsigned int c2 = TOLOWER(*p2 & 0xff);
3899                 if (c1 != c2)
3900                     return INT2FIX(c1 < c2 ? -1 : 1);
3901             }
3902             p1++;
3903             p2++;
3904         }
3905     }
3906     else {
3907         while (p1 < p1end && p2 < p2end) {
3908             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3909             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3910
3911             if (0 <= c1 && 0 <= c2) {
3912                 c1 = TOLOWER(c1);
3913                 c2 = TOLOWER(c2);
3914                 if (c1 != c2)
3915                     return INT2FIX(c1 < c2 ? -1 : 1);
3916             }
3917             else {
3918                 int r;
3919                 l1 = rb_enc_mbclen(p1, p1end, enc);
3920                 l2 = rb_enc_mbclen(p2, p2end, enc);
3921                 len = l1 < l2 ? l1 : l2;
3922                 r = memcmp(p1, p2, len);
3923                 if (r != 0)
3924                     return INT2FIX(r < 0 ? -1 : 1);
3925                 if (l1 != l2)
3926                     return INT2FIX(l1 < l2 ? -1 : 1);
3927             }
3928             p1 += l1;
3929             p2 += l2;
3930         }
3931     }
3932     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3933     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3934     return INT2FIX(-1);
3935 }
3936
3937 /*
3938  *  call-seq:
3939  *    casecmp?(other_string) -> true, false, or nil
3940  *
3941  *  Returns +true+ if +self+ and +other_string+ are equal after
3942  *  Unicode case folding, otherwise +false+:
3943  *
3944  *    'foo'.casecmp?('foo') # => true
3945  *    'foo'.casecmp?('food') # => false
3946  *    'food'.casecmp?('foo') # => false
3947  *    'FOO'.casecmp?('foo') # => true
3948  *    'foo'.casecmp?('FOO') # => true
3949  *
3950  *  Returns +nil+ if the two values are incomparable:
3951  *
3952  *    'foo'.casecmp?(1) # => nil
3953  *
3954  *  See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3955  *
3956  *  Related: String#casecmp.
3957  *
3958  */
3959
3960 static VALUE
3961 rb_str_casecmp_p(VALUE str1, VALUE str2)
3962 {
3963     VALUE s = rb_check_string_type(str2);
3964     if (NIL_P(s)) {
3965         return Qnil;
3966     }
3967     return str_casecmp_p(str1, s);
3968 }
3969
3970 static VALUE
3971 str_casecmp_p(VALUE str1, VALUE str2)
3972 {
3973     rb_encoding *enc;
3974     VALUE folded_str1, folded_str2;
3975     VALUE fold_opt = sym_fold;
3976
3977     enc = rb_enc_compatible(str1, str2);
3978     if (!enc) {
3979         return Qnil;
3980     }
3981
3982     folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3983     folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3984
3985     return rb_str_eql(folded_str1, folded_str2);
3986 }
3987
3988 static long
3989 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3990             const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3991 {
3992     const char *search_start = str_ptr;
3993     long pos, search_len = str_len - offset;
3994
3995     for (;;) {
3996         const char *t;
3997         pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3998         if (pos < 0) return pos;
3999         t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4000         if (t == search_start + pos) break;
4001         search_len -= t - search_start;
4002         if (search_len <= 0) return -1;
4003         offset += t - search_start;
4004         search_start = t;
4005     }
4006     return pos + offset;
4007 }
4008
/* Both macros call rb_strseq_index() and differ only in its last
 * argument (in_byte): 0 for rb_str_index, 1 for rb_str_byteindex. */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4012
4013 static long
4014 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4015 {
4016     const char *str_ptr, *str_ptr_end, *sub_ptr;
4017     long str_len, sub_len;
4018     rb_encoding *enc;
4019
4020     enc = rb_enc_check(str, sub);
4021     if (is_broken_string(sub)) return -1;
4022
4023     str_ptr = RSTRING_PTR(str);
4024     str_ptr_end = RSTRING_END(str);
4025     str_len = RSTRING_LEN(str);
4026     sub_ptr = RSTRING_PTR(sub);
4027     sub_len = RSTRING_LEN(sub);
4028
4029     if (str_len < sub_len) return -1;
4030
4031     if (offset != 0) {
4032         long str_len_char, sub_len_char;
4033         int single_byte = single_byte_optimizable(str);
4034         str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4035         sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4036         if (offset < 0) {
4037             offset += str_len_char;
4038             if (offset < 0) return -1;
4039         }
4040         if (str_len_char - offset < sub_len_char) return -1;
4041         if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4042         str_ptr += offset;
4043     }
4044     if (sub_len == 0) return offset;
4045
4046     /* need proceed one character at a time */
4047     return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4048 }
4049
4050
4051 /*
4052  *  call-seq:
4053  *    index(substring, offset = 0) -> integer or nil
4054  *    index(regexp, offset = 0) -> integer or nil
4055  *
4056  *  :include: doc/string/index.rdoc
4057  *
4058  */
4059
4060 static VALUE
4061 rb_str_index_m(int argc, VALUE *argv, VALUE str)
4062 {
4063     VALUE sub;
4064     VALUE initpos;
4065     rb_encoding *enc = STR_ENC_GET(str);
4066     long pos;
4067
4068     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4069         long slen = str_strlen(str, enc); /* str's enc */
4070         pos = NUM2LONG(initpos);
4071         if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4072             if (RB_TYPE_P(sub, T_REGEXP)) {
4073                 rb_backref_set(Qnil);
4074             }
4075             return Qnil;
4076         }
4077     }
4078     else {
4079         pos = 0;
4080     }
4081
4082     if (RB_TYPE_P(sub, T_REGEXP)) {
4083         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4084                          enc, single_byte_optimizable(str));
4085
4086         if (rb_reg_search(sub, str, pos, 0) >= 0) {
4087             VALUE match = rb_backref_get();
4088             struct re_registers *regs = RMATCH_REGS(match);
4089             pos = rb_str_sublen(str, BEG(0));
4090             return LONG2NUM(pos);
4091         }
4092     }
4093     else {
4094         StringValue(sub);
4095         pos = rb_str_index(str, sub, pos);
4096         if (pos >= 0) {
4097             pos = rb_str_sublen(str, pos);
4098             return LONG2NUM(pos);
4099         }
4100     }
4101     return Qnil;
4102 }
4103
4104 /* Ensure that the given pos is a valid character boundary.
4105  * Note that in this function, "character" means a code point
4106  * (Unicode scalar value), not a grapheme cluster.
4107  */
4108 static void
4109 str_ensure_byte_pos(VALUE str, long pos)
4110 {
4111     const char *s = RSTRING_PTR(str);
4112     const char *e = RSTRING_END(str);
4113     const char *p = s + pos;
4114     if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4115         rb_raise(rb_eIndexError,
4116                  "offset %ld does not land on character boundary", pos);
4117     }
4118 }
4119
4120 /*
4121  *  call-seq:
4122  *    byteindex(substring, offset = 0) -> integer or nil
4123  *    byteindex(regexp, offset = 0) -> integer or nil
4124  *
4125  *  Returns the Integer byte-based index of the first occurrence of the given +substring+,
4126  *  or +nil+ if none found:
4127  *
4128  *    'foo'.byteindex('f') # => 0
4129  *    'foo'.byteindex('o') # => 1
4130  *    'foo'.byteindex('oo') # => 1
4131  *    'foo'.byteindex('ooo') # => nil
4132  *
4133  *  Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4134  *  or +nil+ if none found:
4135  *
4136  *    'foo'.byteindex(/f/) # => 0
4137  *    'foo'.byteindex(/o/) # => 1
4138  *    'foo'.byteindex(/oo/) # => 1
4139  *    'foo'.byteindex(/ooo/) # => nil
4140  *
4141  *  Integer argument +offset+, if given, specifies the byte-based position in the
4142  *  string to begin the search:
4143  *
4144  *    'foo'.byteindex('o', 1) # => 1
4145  *    'foo'.byteindex('o', 2) # => 2
4146  *    'foo'.byteindex('o', 3) # => nil
4147  *
4148  *  If +offset+ is negative, counts backward from the end of +self+:
4149  *
4150  *    'foo'.byteindex('o', -1) # => 2
4151  *    'foo'.byteindex('o', -2) # => 1
4152  *    'foo'.byteindex('o', -3) # => 1
4153  *    'foo'.byteindex('o', -4) # => nil
4154  *
4155  *  If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4156  *  raised.
4157  *
4158  *  Related: String#index, String#byterindex.
4159  */
4160
4161 static VALUE
4162 rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4163 {
4164     VALUE sub;
4165     VALUE initpos;
4166     long pos;
4167
4168     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4169         long slen = RSTRING_LEN(str);
4170         pos = NUM2LONG(initpos);
4171         if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4172             if (RB_TYPE_P(sub, T_REGEXP)) {
4173                 rb_backref_set(Qnil);
4174             }
4175             return Qnil;
4176         }
4177     }
4178     else {
4179         pos = 0;
4180     }
4181
4182     str_ensure_byte_pos(str, pos);
4183
4184     if (RB_TYPE_P(sub, T_REGEXP)) {
4185         if (rb_reg_search(sub, str, pos, 0) >= 0) {
4186             VALUE match = rb_backref_get();
4187             struct re_registers *regs = RMATCH_REGS(match);
4188             pos = BEG(0);
4189             return LONG2NUM(pos);
4190         }
4191     }
4192     else {
4193         StringValue(sub);
4194         pos = rb_str_byteindex(str, sub, pos);
4195         if (pos >= 0) return LONG2NUM(pos);
4196     }
4197     return Qnil;
4198 }
4199
4200 #ifdef HAVE_MEMRCHR
4201 static long
4202 str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4203 {
4204     char *hit, *adjusted;
4205     int c;
4206     long slen, searchlen;
4207     char *sbeg, *e, *t;
4208
4209     sbeg = RSTRING_PTR(str);
4210     slen = RSTRING_LEN(sub);
4211     if (slen == 0) return s - sbeg;
4212     e = RSTRING_END(str);
4213     t = RSTRING_PTR(sub);
4214     c = *t & 0xff;
4215     searchlen = s - sbeg + 1;
4216
4217     do {
4218         hit = memrchr(sbeg, c, searchlen);
4219         if (!hit) break;
4220         adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4221         if (hit != adjusted) {
4222             searchlen = adjusted - sbeg;
4223             continue;
4224         }
4225         if (memcmp(hit, t, slen) == 0)
4226             return hit - sbeg;
4227         searchlen = adjusted - sbeg;
4228     } while (searchlen > 0);
4229
4230     return -1;
4231 }
4232 #else
4233 static long
4234 str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4235 {
4236     long slen;
4237     char *sbeg, *e, *t;
4238
4239     sbeg = RSTRING_PTR(str);
4240     e = RSTRING_END(str);
4241     t = RSTRING_PTR(sub);
4242     slen = RSTRING_LEN(sub);
4243
4244     while (s) {
4245         if (memcmp(s, t, slen) == 0) {
4246             return s - sbeg;
4247         }
4248         if (s <= sbeg) break;
4249         s = rb_enc_prev_char(sbeg, s, e, enc);
4250     }
4251
4252     return -1;
4253 }
4254 #endif
4255
4256 /* found index in byte */
4257 static long
4258 rb_str_rindex(VALUE str, VALUE sub, long pos)
4259 {
4260     long len, slen;
4261     char *sbeg, *s;
4262     rb_encoding *enc;
4263     int singlebyte;
4264
4265     enc = rb_enc_check(str, sub);
4266     if (is_broken_string(sub)) return -1;
4267     singlebyte = single_byte_optimizable(str);
4268     len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4269     slen = str_strlen(sub, enc); /* rb_enc_check */
4270
4271     /* substring longer than string */
4272     if (len < slen) return -1;
4273     if (len - pos < slen) pos = len - slen;
4274     if (len == 0) return pos;
4275
4276     sbeg = RSTRING_PTR(str);
4277
4278     if (pos == 0) {
4279         if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4280             return 0;
4281         else
4282             return -1;
4283     }
4284
4285     s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4286     return str_rindex(str, sub, s, enc);
4287 }
4288
4289 /*
4290  *  call-seq:
4291  *    rindex(substring, offset = self.length) -> integer or nil
4292  *    rindex(regexp, offset = self.length) -> integer or nil
4293  *
4294  *  Returns the Integer index of the _last_ occurrence of the given +substring+,
4295  *  or +nil+ if none found:
4296  *
4297  *    'foo'.rindex('f') # => 0
4298  *    'foo'.rindex('o') # => 2
4299  *    'foo'.rindex('oo') # => 1
4300  *    'foo'.rindex('ooo') # => nil
4301  *
4302  *  Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4303  *  or +nil+ if none found:
4304  *
4305  *    'foo'.rindex(/f/) # => 0
4306  *    'foo'.rindex(/o/) # => 2
4307  *    'foo'.rindex(/oo/) # => 1
4308  *    'foo'.rindex(/ooo/) # => nil
4309  *
4310  *  The _last_ match means starting at the possible last position, not
4311  *  the last of longest matches.
4312  *
4313  *    'foo'.rindex(/o+/) # => 2
4314  *    $~ #=> #<MatchData "o">
4315  *
4316  *  To get the last longest match, needs to combine with negative
4317  *  lookbehind.
4318  *
4319  *    'foo'.rindex(/(?<!o)o+/) # => 1
4320  *    $~ #=> #<MatchData "oo">
4321  *
 *  Or String#index with negative lookahead.
4323  *
4324  *    'foo'.index(/o+(?!.*o)/) # => 1
4325  *    $~ #=> #<MatchData "oo">
4326  *
 *  Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
 *  string to _end_ the search:
4329  *
4330  *    'foo'.rindex('o', 0) # => nil
4331  *    'foo'.rindex('o', 1) # => 1
4332  *    'foo'.rindex('o', 2) # => 2
4333  *    'foo'.rindex('o', 3) # => 2
4334  *
4335  *  If +offset+ is a negative Integer, the maximum starting position in the
4336  *  string to _end_ the search is the sum of the string's length and +offset+:
4337  *
4338  *    'foo'.rindex('o', -1) # => 2
4339  *    'foo'.rindex('o', -2) # => 1
4340  *    'foo'.rindex('o', -3) # => nil
4341  *    'foo'.rindex('o', -4) # => nil
4342  *
4343  *  Related: String#index.
4344  */
4345
4346 static VALUE
4347 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4348 {
4349     VALUE sub;
4350     VALUE initpos;
4351     rb_encoding *enc = STR_ENC_GET(str);
4352     long pos, len = str_strlen(str, enc); /* str's enc */
4353
4354     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4355         pos = NUM2LONG(initpos);
4356         if (pos < 0 && (pos += len) < 0) {
4357             if (RB_TYPE_P(sub, T_REGEXP)) {
4358                 rb_backref_set(Qnil);
4359             }
4360             return Qnil;
4361         }
4362         if (pos > len) pos = len;
4363     }
4364     else {
4365         pos = len;
4366     }
4367
4368     if (RB_TYPE_P(sub, T_REGEXP)) {
4369         /* enc = rb_enc_check(str, sub); */
4370         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4371                          enc, single_byte_optimizable(str));
4372
4373         if (rb_reg_search(sub, str, pos, 1) >= 0) {
4374             VALUE match = rb_backref_get();
4375             struct re_registers *regs = RMATCH_REGS(match);
4376             pos = rb_str_sublen(str, BEG(0));
4377             return LONG2NUM(pos);
4378         }
4379     }
4380     else {
4381         StringValue(sub);
4382         pos = rb_str_rindex(str, sub, pos);
4383         if (pos >= 0) {
4384             pos = rb_str_sublen(str, pos);
4385             return LONG2NUM(pos);
4386         }
4387     }
4388     return Qnil;
4389 }
4390
4391 static long
4392 rb_str_byterindex(VALUE str, VALUE sub, long pos)
4393 {
4394     long len, slen;
4395     char *sbeg, *s;
4396     rb_encoding *enc;
4397
4398     enc = rb_enc_check(str, sub);
4399     if (is_broken_string(sub)) return -1;
4400     len = RSTRING_LEN(str);
4401     slen = RSTRING_LEN(sub);
4402
4403     /* substring longer than string */
4404     if (len < slen) return -1;
4405     if (len - pos < slen) pos = len - slen;
4406     if (len == 0) return pos;
4407
4408     sbeg = RSTRING_PTR(str);
4409
4410     if (pos == 0) {
4411         if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4412             return 0;
4413         else
4414             return -1;
4415     }
4416
4417     s = sbeg + pos;
4418     return str_rindex(str, sub, s, enc);
4419 }
4420
4421
4422 /*
4423  *  call-seq:
4424  *    byterindex(substring, offset = self.bytesize) -> integer or nil
4425  *    byterindex(regexp, offset = self.bytesize) -> integer or nil
4426  *
4427  *  Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4428  *  or +nil+ if none found:
4429  *
4430  *    'foo'.byterindex('f') # => 0
4431  *    'foo'.byterindex('o') # => 2
4432  *    'foo'.byterindex('oo') # => 1
4433  *    'foo'.byterindex('ooo') # => nil
4434  *
4435  *  Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4436  *  or +nil+ if none found:
4437  *
4438  *    'foo'.byterindex(/f/) # => 0
4439  *    'foo'.byterindex(/o/) # => 2
4440  *    'foo'.byterindex(/oo/) # => 1
4441  *    'foo'.byterindex(/ooo/) # => nil
4442  *
4443  *  The _last_ match means starting at the possible last position, not
4444  *  the last of longest matches.
4445  *
4446  *    'foo'.byterindex(/o+/) # => 2
4447  *    $~ #=> #<MatchData "o">
4448  *
4449  *  To get the last longest match, needs to combine with negative
4450  *  lookbehind.
4451  *
4452  *    'foo'.byterindex(/(?<!o)o+/) # => 1
4453  *    $~ #=> #<MatchData "oo">
4454  *
 *  Or String#byteindex with negative lookahead.
4456  *
4457  *    'foo'.byteindex(/o+(?!.*o)/) # => 1
4458  *    $~ #=> #<MatchData "oo">
4459  *
 *  Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
 *  string to _end_ the search:
4462  *
4463  *    'foo'.byterindex('o', 0) # => nil
4464  *    'foo'.byterindex('o', 1) # => 1
4465  *    'foo'.byterindex('o', 2) # => 2
4466  *    'foo'.byterindex('o', 3) # => 2
4467  *
 *  If +offset+ is a negative Integer, the maximum starting byte-based position in the
 *  string to _end_ the search is the sum of the string's bytesize and +offset+:
4470  *
4471  *    'foo'.byterindex('o', -1) # => 2
4472  *    'foo'.byterindex('o', -2) # => 1
4473  *    'foo'.byterindex('o', -3) # => nil
4474  *    'foo'.byterindex('o', -4) # => nil
4475  *
4476  *  If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4477  *  raised.
4478  *
4479  *  Related: String#byteindex.
4480  */
4481
4482 static VALUE
4483 rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4484 {
4485     VALUE sub;
4486     VALUE initpos;
4487     long pos, len = RSTRING_LEN(str);
4488
4489     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4490         pos = NUM2LONG(initpos);
4491         if (pos < 0 && (pos += len) < 0) {
4492             if (RB_TYPE_P(sub, T_REGEXP)) {
4493                 rb_backref_set(Qnil);
4494             }
4495             return Qnil;
4496         }
4497         if (pos > len) pos = len;
4498     }
4499     else {
4500         pos = len;
4501     }
4502
4503     str_ensure_byte_pos(str, pos);
4504
4505     if (RB_TYPE_P(sub, T_REGEXP)) {
4506         if (rb_reg_search(sub, str, pos, 1) >= 0) {
4507             VALUE match = rb_backref_get();
4508             struct re_registers *regs = RMATCH_REGS(match);
4509             pos = BEG(0);
4510             return LONG2NUM(pos);
4511         }
4512     }
4513     else {
4514         StringValue(sub);
4515         pos = rb_str_byterindex(str, sub, pos);
4516         if (pos >= 0) return LONG2NUM(pos);
4517     }
4518     return Qnil;
4519 }
4520
4521 /*
4522  *  call-seq:
4523  *    string =~ regexp -> integer or nil
4524  *    string =~ object -> integer or nil
4525  *
4526  *  Returns the Integer index of the first substring that matches
4527  *  the given +regexp+, or +nil+ if no match found:
4528  *
4529  *    'foo' =~ /f/ # => 0
4530  *    'foo' =~ /o/ # => 1
4531  *    'foo' =~ /x/ # => nil
4532  *
4533  *  Note: also updates Regexp@Global+Variables.
4534  *
4535  *  If the given +object+ is not a Regexp, returns the value
4536  *  returned by <tt>object =~ self</tt>.
4537  *
4538  *  Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4539  *  (see Regexp#=~):
4540  *
 *    number = nil
4542  *    "no. 9" =~ /(?<number>\d+)/
4543  *    number # => nil (not assigned)
4544  *    /(?<number>\d+)/ =~ "no. 9"
4545  *    number #=> "9"
4546  *
4547  */
4548
4549 static VALUE
4550 rb_str_match(VALUE x, VALUE y)
4551 {
4552     switch (OBJ_BUILTIN_TYPE(y)) {
4553       case T_STRING:
4554         rb_raise(rb_eTypeError, "type mismatch: String given");
4555
4556       case T_REGEXP:
4557         return rb_reg_match(y, x);
4558
4559       default:
4560         return rb_funcall(y, idEqTilde, 1, x);
4561     }
4562 }
4563
4564
/* Forward declaration: get_pat() is used by the match methods below;
 * its definition lives elsewhere in this file. */
static VALUE get_pat(VALUE);
4566
4567
4568 /*
4569  *  call-seq:
4570  *    match(pattern, offset = 0) -> matchdata or nil
4571  *    match(pattern, offset = 0) {|matchdata| ... } -> object
4572  *
4573  *  Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4574  *
4575  *  Note: also updates Regexp@Global+Variables.
4576  *
4577  *  - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4578  *      regexp = Regexp.new(pattern)
4579  *  - Computes +matchdata+, which will be either a MatchData object or +nil+
4580  *    (see Regexp#match):
 *      matchdata = regexp.match(self)
4582  *
4583  *  With no block given, returns the computed +matchdata+:
4584  *
4585  *    'foo'.match('f') # => #<MatchData "f">
4586  *    'foo'.match('o') # => #<MatchData "o">
4587  *    'foo'.match('x') # => nil
4588  *
4589  *  If Integer argument +offset+ is given, the search begins at index +offset+:
4590  *
4591  *    'foo'.match('f', 1) # => nil
4592  *    'foo'.match('o', 1) # => #<MatchData "o">
4593  *
4594  *  With a block given, calls the block with the computed +matchdata+
4595  *  and returns the block's return value:
4596  *
4597  *    'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4598  *    'foo'.match(/x/) {|matchdata| matchdata } # => nil
4599  *    'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4600  *
4601  */
4602
4603 static VALUE
4604 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4605 {
4606     VALUE re, result;
4607     if (argc < 1)
4608         rb_check_arity(argc, 1, 2);
4609     re = argv[0];
4610     argv[0] = str;
4611     result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4612     if (!NIL_P(result) && rb_block_given_p()) {
4613         return rb_yield(result);
4614     }
4615     return result;
4616 }
4617
4618 /*
4619  *  call-seq:
4620  *    match?(pattern, offset = 0) -> true or false
4621  *
4622  *  Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4623  *
4624  *  Note: does not update Regexp@Global+Variables.
4625  *
4626  *  Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4627  *    regexp = Regexp.new(pattern)
4628  *
 *  Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
4630  *  +false+ otherwise:
4631  *
4632  *    'foo'.match?(/o/) # => true
4633  *    'foo'.match?('o') # => true
4634  *    'foo'.match?(/x/) # => false
4635  *
 *  If Integer argument +offset+ is given, the search begins at index +offset+:
 *
 *    'foo'.match?('f', 1) # => false
 *    'foo'.match?('o', 1) # => true
4639  *
4640  */
4641
4642 static VALUE
4643 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4644 {
4645     VALUE re;
4646     rb_check_arity(argc, 1, 2);
4647     re = get_pat(argv[0]);
4648     return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4649 }
4650
/* Result of moving a single character to its successor or predecessor
 * (see enc_succ_char, enc_pred_char and enc_succ_alnum_char). */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* the bytes are not a usable character, or no neighbor could be built */
    NEIGHBOR_FOUND,     /* the neighboring character was written in place */
    NEIGHBOR_WRAPPED    /* no neighbor of the same byte length; the caller has to carry */
};
4656
/*
 * Tries to overwrite the +len+ bytes at +p+ with the next character of
 * +enc+ that has the same byte length.
 *
 * Returns NEIGHBOR_FOUND when +p+ now holds a valid character of exactly
 * +len+ bytes, NEIGHBOR_WRAPPED when no such successor exists, and
 * NEIGHBOR_NOT_CHAR (wide-character path only) when the bytes cannot be
 * decoded or the incremented code point cannot be encoded.
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;

    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        /* work on the code point: decode, add one, encode again */
        c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the successor needs a different number of bytes; p is left untouched */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* make sure what was written is really a valid character */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* Otherwise treat the bytes as a big-endian counter and keep
     * incrementing it until it forms a valid character of len bytes. */
    while (1) {
        /* trailing 0xff bytes roll over to 0x00 ... */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        /* ... and if every byte was 0xff the counter overflowed */
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was found: max out the remaining
                 * bytes so that the next pass carries into that prefix */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* shrink the window until the prefix is no longer invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* max out everything after byte len2; presumably this skips
             * sequences that cannot become valid by changing those bytes */
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
4708
/*
 * Tries to overwrite the +len+ bytes at +p+ with the previous character
 * of +enc+ that has the same byte length.
 *
 * Returns NEIGHBOR_FOUND when +p+ now holds a valid character of exactly
 * +len+ bytes, NEIGHBOR_WRAPPED when no such predecessor exists, and
 * NEIGHBOR_NOT_CHAR (wide-character path only) when the bytes cannot be
 * decoded, the code point is already 0, or the decremented code point
 * cannot be encoded.
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc);
        /* code point 0 has no predecessor */
        if (!c) return NEIGHBOR_NOT_CHAR;
        --c;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the predecessor needs a different number of bytes; p is left untouched */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* make sure what was written is really a valid character */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* Otherwise treat the bytes as a big-endian counter and keep
     * decrementing it until it forms a valid character of len bytes. */
    while (1) {
        /* trailing 0x00 bytes borrow and become 0xff ... */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        /* ... and if every byte was 0x00 the counter underflowed */
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was found: zero the remaining bytes
                 * so that the next pass borrows from that prefix */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            long len2;
            int l2;
            /* shrink the window until the prefix is no longer invalid */
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            /* zero everything after byte len2; presumably this skips
             * sequences that cannot become valid by changing those bytes */
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
4761
4762 /*
4763   overwrite +p+ by succeeding letter in +enc+ and returns
4764   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4765   When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4766   assuming each ranges are successive, and mbclen
4767   never change in each ranges.
4768   NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4769   character.
4770  */
4771 static enum neighbor_char
4772 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4773 {
4774     enum neighbor_char ret;
4775     unsigned int c;
4776     int ctype;
4777     int range;
4778     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4779
4780     /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4781     int try;
4782     const int max_gaps = 1;
4783
4784     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4785     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4786         ctype = ONIGENC_CTYPE_DIGIT;
4787     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4788         ctype = ONIGENC_CTYPE_ALPHA;
4789     else
4790         return NEIGHBOR_NOT_CHAR;
4791
4792     MEMCPY(save, p, char, len);
4793     for (try = 0; try <= max_gaps; ++try) {
4794         ret = enc_succ_char(p, len, enc);
4795         if (ret == NEIGHBOR_FOUND) {
4796             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4797             if (rb_enc_isctype(c, ctype, enc))
4798                 return NEIGHBOR_FOUND;
4799         }
4800     }
4801     MEMCPY(p, save, char, len);
4802     range = 1;
4803     while (1) {
4804         MEMCPY(save, p, char, len);
4805         ret = enc_pred_char(p, len, enc);
4806         if (ret == NEIGHBOR_FOUND) {
4807             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4808             if (!rb_enc_isctype(c, ctype, enc)) {
4809                 MEMCPY(p, save, char, len);
4810                 break;
4811             }
4812         }
4813         else {
4814             MEMCPY(p, save, char, len);
4815             break;
4816         }
4817         range++;
4818     }
4819     if (range == 1) {
4820         return NEIGHBOR_NOT_CHAR;
4821     }
4822
4823     if (ctype != ONIGENC_CTYPE_DIGIT) {
4824         MEMCPY(carry, p, char, len);
4825         return NEIGHBOR_WRAPPED;
4826     }
4827
4828     MEMCPY(carry, p, char, len);
4829     enc_succ_char(carry, len, enc);
4830     return NEIGHBOR_WRAPPED;
4831 }
4832
4833
/* Forward declaration: in-place successor computation, defined below
 * rb_str_succ(). */
static VALUE str_succ(VALUE str);
4835
4836 /*
4837  *  call-seq:
4838  *    succ -> new_str
4839  *
4840  *  Returns the successor to +self+. The successor is calculated by
4841  *  incrementing characters.
4842  *
 *  The first character to be incremented is the rightmost alphanumeric,
 *  or, if there are no alphanumerics, the rightmost character:
4845  *
4846  *    'THX1138'.succ # => "THX1139"
4847  *    '<<koala>>'.succ # => "<<koalb>>"
4848  *    '***'.succ # => '**+'
4849  *
4850  *  The successor to a digit is another digit, "carrying" to the next-left
4851  *  character for a "rollover" from 9 to 0, and prepending another digit
4852  *  if necessary:
4853  *
4854  *    '00'.succ # => "01"
4855  *    '09'.succ # => "10"
4856  *    '99'.succ # => "100"
4857  *
4858  *  The successor to a letter is another letter of the same case,
4859  *  carrying to the next-left character for a rollover,
4860  *  and prepending another same-case letter if necessary:
4861  *
4862  *    'aa'.succ # => "ab"
4863  *    'az'.succ # => "ba"
4864  *    'zz'.succ # => "aaa"
4865  *    'AA'.succ # => "AB"
4866  *    'AZ'.succ # => "BA"
4867  *    'ZZ'.succ # => "AAA"
4868  *
4869  *  The successor to a non-alphanumeric character is the next character
4870  *  in the underlying character set's collating sequence,
4871  *  carrying to the next-left character for a rollover,
4872  *  and prepending another character if necessary:
4873  *
4874  *    s = 0.chr * 3
4875  *    s # => "\x00\x00\x00"
4876  *    s.succ # => "\x00\x00\x01"
4877  *    s = 255.chr * 3
4878  *    s # => "\xFF\xFF\xFF"
4879  *    s.succ # => "\x01\x00\x00\x00"
4880  *
4881  *  Carrying can occur between and among mixtures of alphanumeric characters:
4882  *
4883  *    s = 'zz99zz99'
4884  *    s.succ # => "aaa00aa00"
4885  *    s = '99zz99zz'
4886  *    s.succ # => "100aa00aa"
4887  *
4888  *  The successor to an empty +String+ is a new empty +String+:
4889  *
4890  *    ''.succ # => ""
4891  *
4892  */
4893
4894 VALUE
4895 rb_str_succ(VALUE orig)
4896 {
4897     VALUE str;
4898     str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4899     rb_enc_cr_str_copy_for_substr(str, orig);
4900     return str_succ(str);
4901 }
4902
/*
 * Turns +str+ into its successor in place and returns +str+.
 *
 * The string is scanned from its last character towards the first.
 * Alphanumeric characters are incremented with carry; if the string has
 * no alphanumeric character at all, plain characters are incremented
 * instead.  When the carry runs off the left-most incremented character,
 * the string is grown and the carry character is inserted there.
 */
static VALUE
str_succ(VALUE str)
{
    rb_encoding *enc;
    char *sbeg, *s, *e, *last_alnum = 0;
    int found_alnum = 0;
    long l, slen;
    /* default carry: a single "\1" byte inserted at the very front */
    char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
    long carry_pos = 0, carry_len = 1;
    enum neighbor_char neighbor = NEIGHBOR_FOUND;

    slen = RSTRING_LEN(str);
    /* nothing to increment in an empty string */
    if (slen == 0) return str;

    enc = STR_ENC_GET(str);
    sbeg = RSTRING_PTR(str);
    s = e = sbeg + slen;

    /* pass 1: increment alphanumerics, right to left */
    while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
        if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
            /* the previous character was not an alphanumeric: stop
             * carrying if the character reached now is a digit while the
             * last wrapped one is a letter, or the other way around */
            if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
                ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
                break;
            }
        }
        l = rb_enc_precise_mbclen(s, e, enc);
        /* skip byte sequences that are not a valid character */
        if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
        l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
        neighbor = enc_succ_alnum_char(s, l, enc, carry);
        switch (neighbor) {
          case NEIGHBOR_NOT_CHAR:
            /* not an alphanumeric: leave it alone and look further left */
            continue;
          case NEIGHBOR_FOUND:
            /* incremented without overflow: done */
            return str;
          case NEIGHBOR_WRAPPED:
            /* overflowed: keep going left to propagate the carry */
            last_alnum = s;
            break;
        }
        found_alnum = 1;
        /* remember where the carry has to go if the loop ends here */
        carry_pos = s - sbeg;
        carry_len = l;
    }
    if (!found_alnum) {         /* str contains no alnum */
        /* pass 2: increment plain characters, right to left */
        s = e;
        while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
            enum neighbor_char neighbor;
            char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
            l = rb_enc_precise_mbclen(s, e, enc);
            if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
            l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
            /* increment a scratch copy so the string changes only on success or wrap */
            MEMCPY(tmp, s, char, l);
            neighbor = enc_succ_char(tmp, l, enc);
            switch (neighbor) {
              case NEIGHBOR_FOUND:
                MEMCPY(s, tmp, char, l);
                return str;
                break;
              case NEIGHBOR_WRAPPED:
                MEMCPY(s, tmp, char, l);
                break;
              case NEIGHBOR_NOT_CHAR:
                break;
            }
            if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
                /* wrapped to \0...\0.  search next valid char. */
                enc_succ_char(s, l, enc);
            }
            if (!rb_enc_asciicompat(enc)) {
                /* "\1" is not usable as carry here; use the wrapped character */
                MEMCPY(carry, s, char, l);
                carry_len = l;
            }
            carry_pos = s - sbeg;
        }
        /* bytes were rewritten; the cached code range is recomputed below */
        ENC_CODERANGE_SET(str, ENC_CODERANGE_UNKNOWN);
    }
    /* every candidate wrapped: grow the string and insert the carry at carry_pos */
    RESIZE_CAPA(str, slen + carry_len);
    sbeg = RSTRING_PTR(str);    /* re-read the buffer pointer after RESIZE_CAPA */
    s = sbeg + carry_pos;
    memmove(s + carry_len, s, slen - carry_pos);
    memmove(s, carry, carry_len);
    slen += carry_len;
    STR_SET_LEN(str, slen);
    TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
    rb_enc_str_coderange(str);
    return str;
}
4989
4990
4991 /*
4992  *  call-seq:
4993  *    succ! -> self
4994  *
4995  *  Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4996  */
4997
4998 static VALUE
4999 rb_str_succ_bang(VALUE str)
5000 {
5001     rb_str_modify(str);
5002     str_succ(str);
5003     return str;
5004 }
5005
/* Returns 1 if each of the first +len+ bytes at +s+ is a digit (also when
 * +len+ is zero or negative), 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5015
5016 static int
5017 str_upto_i(VALUE str, VALUE arg)
5018 {
5019     rb_yield(str);
5020     return 0;
5021 }
5022
5023 /*
5024  *  call-seq:
5025  *    upto(other_string, exclusive = false) {|string| ... } -> self
5026  *    upto(other_string, exclusive = false) -> new_enumerator
5027  *
5028  *  With a block given, calls the block with each +String+ value
5029  *  returned by successive calls to String#succ;
5030  *  the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5031  *  the sequence terminates when value +other_string+ is reached;
5032  *  returns +self+:
5033  *
 *    'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
 *
 *  Output:
5036  *
5037  *    a8 a9 b0 b1 b2 b3 b4 b5 b6
5038  *
5039  *  If argument +exclusive+ is given as a truthy object, the last value is omitted:
5040  *
5041  *    'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5042  *
5043  *  Output:
5044  *
5045  *    a8 a9 b0 b1 b2 b3 b4 b5
5046  *
5047  *  If +other_string+ would not be reached, does not call the block:
5048  *
5049  *    '25'.upto('5') {|s| fail s }
5050  *    'aa'.upto('a') {|s| fail s }
5051  *
5052  *  With no block given, returns a new Enumerator:
5053  *
5054  *    'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5055  *
5056  */
5057
5058 static VALUE
5059 rb_str_upto(int argc, VALUE *argv, VALUE beg)
5060 {
5061     VALUE end, exclusive;
5062
5063     rb_scan_args(argc, argv, "11", &end, &exclusive);
5064     RETURN_ENUMERATOR(beg, argc, argv);
5065     return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5066 }
5067
/*
 * Walks the strings from beg up to end, handing each one to
 * (*each)(str, arg).  Iteration stops early as soon as the callback
 * returns non-zero.  When excl is true the value equal to end is not
 * passed to the callback.  end is converted with StringValue() and the
 * pair is checked with rb_enc_check().  Always returns beg.
 *
 * Three strategies are tried in order:
 *   1. beg and end are each one ASCII character: step the byte value;
 *   2. beg and end consist solely of ASCII digits: count numerically and
 *      format every value with at least as many digits as beg has;
 *   3. anything else: follow the chain of "succ" method calls.
 */
VALUE
rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
{
    VALUE current, after_end;
    ID succ;
    int n, ascii;
    rb_encoding *enc;

    CONST_ID(succ, "succ");
    StringValue(end);
    enc = rb_enc_check(beg, end);
    ascii = (is_ascii_string(beg) && is_ascii_string(end));
    /* single character */
    if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
        char c = RSTRING_PTR(beg)[0];
        char e = RSTRING_PTR(end)[0];

        /* empty range: beg sorts after end, or beg == end with end excluded */
        if (c > e || (excl && c == e)) return beg;
        for (;;) {
            if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
            /* inclusive: e has just been handed over, so stop before incrementing */
            if (!excl && c == e) break;
            c++;
            /* exclusive: stop before e itself would be handed over */
            if (excl && c == e) break;
        }
        return beg;
    }
    /* both edges are all digits */
    if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
        all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
        all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
        VALUE b, e;
        int width;

        /* the length of beg is the minimum number of digits of every value */
        width = RSTRING_LENINT(beg);
        b = rb_str_to_inum(beg, 10, FALSE);
        e = rb_str_to_inum(end, 10, FALSE);
        if (FIXNUM_P(b) && FIXNUM_P(e)) {
            /* both bounds are Fixnums: count with plain C longs */
            long bi = FIX2LONG(b);
            long ei = FIX2LONG(e);
            rb_encoding *usascii = rb_usascii_encoding();

            while (bi <= ei) {
                if (excl && bi == ei) break;
                if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
                bi++;
            }
        }
        else {
            /* at least one bound is not a Fixnum: compare and step via method calls */
            ID op = excl ? '<' : idLE;
            VALUE args[2], fmt = rb_fstring_lit("%.*d");

            args[0] = INT2FIX(width);
            while (rb_funcall(b, op, 1, e)) {
                args[1] = b;
                if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
                b = rb_funcallv(b, succ, 0, 0);
            }
        }
        return beg;
    }
    /* normal case */
    n = rb_str_cmp(beg, end);
    /* nothing to do when beg compares greater than end, or equal with end excluded */
    if (n > 0 || (excl && n == 0)) return beg;

    /* the loop runs until current equals the successor of end */
    after_end = rb_funcallv(end, succ, 0, 0);
    current = str_duplicate(rb_cString, beg);
    while (!rb_str_equal(current, after_end)) {
        /*
         * The successor is computed before the callback is invoked.  In the
         * inclusive case next stays nil once current equals end, which ends
         * the loop right after the callback has seen end.
         */
        VALUE next = Qnil;
        if (excl || !rb_str_equal(current, end))
            next = rb_funcallv(current, succ, 0, 0);
        if ((*each)(current, arg)) break;
        if (NIL_P(next)) break;
        current = next;
        StringValue(current);
        if (excl && rb_str_equal(current, end)) break;
        /* give up once the value is longer than end or has become empty */
        if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
            break;
    }

    return beg;
}
5149
5150 VALUE
5151 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5152 {
5153     VALUE current;
5154     ID succ;
5155
5156     CONST_ID(succ, "succ");
5157     /* both edges are all digits */
5158     if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5159         all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5160         VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5161         int width = RSTRING_LENINT(beg);
5162         b = rb_str_to_inum(beg, 10, FALSE);
5163         if (FIXNUM_P(b)) {
5164             long bi = FIX2LONG(b);
5165             rb_encoding *usascii = rb_usascii_encoding();
5166
5167             while (FIXABLE(bi)) {
5168                 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5169                 bi++;
5170             }
5171             b = LONG2NUM(bi);
5172         }
5173         args[0] = INT2FIX(width);
5174         while (1) {
5175             args[1] = b;
5176             if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5177             b = rb_funcallv(b, succ, 0, 0);
5178         }
5179     }
5180     /* normal case */
5181     current = str_duplicate(rb_cString, beg);
5182     while (1) {
5183         VALUE next = rb_funcallv(current, succ, 0, 0);
5184         if ((*each)(current, arg)) break;
5185         current = next;
5186         StringValue(current);
5187         if (RSTRING_LEN(current) == 0)
5188             break;
5189     }
5190
5191     return beg;
5192 }
5193
5194 static int
5195 include_range_i(VALUE str, VALUE arg)
5196 {
5197     VALUE *argp = (VALUE *)arg;
5198     if (!rb_equal(str, *argp)) return 0;
5199     *argp = Qnil;
5200     return 1;
5201 }
5202
5203 VALUE
5204 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5205 {
5206     beg = rb_str_new_frozen(beg);
5207     StringValue(end);
5208     end = rb_str_new_frozen(end);
5209     if (NIL_P(val)) return Qfalse;
5210     val = rb_check_string_type(val);
5211     if (NIL_P(val)) return Qfalse;
5212     if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5213         rb_enc_asciicompat(STR_ENC_GET(end)) &&
5214         rb_enc_asciicompat(STR_ENC_GET(val))) {
5215         const char *bp = RSTRING_PTR(beg);
5216         const char *ep = RSTRING_PTR(end);
5217         const char *vp = RSTRING_PTR(val);
5218         if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5219             if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5220                 return Qfalse;
5221             else {
5222                 char b = *bp;
5223                 char e = *ep;
5224                 char v = *vp;
5225
5226                 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5227                     if (b <= v && v < e) return Qtrue;
5228                     return RBOOL(!RTEST(exclusive) && v == e);
5229                 }
5230             }
5231         }
5232 #if 0
5233         /* both edges are all digits */
5234         if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5235             all_digits_p(bp, RSTRING_LEN(beg)) &&
5236             all_digits_p(ep, RSTRING_LEN(end))) {
5237             /* TODO */
5238         }
5239 #endif
5240     }
5241     rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5242
5243     return RBOOL(NIL_P(val));
5244 }
5245
5246 static VALUE
5247 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5248 {
5249     if (rb_reg_search(re, str, 0, 0) >= 0) {
5250         VALUE match = rb_backref_get();
5251         int nth = rb_reg_backref_number(match, backref);
5252         return rb_reg_nth_match(nth, match);
5253     }
5254     return Qnil;
5255 }
5256
5257 static VALUE
5258 rb_str_aref(VALUE str, VALUE indx)
5259 {
5260     long idx;
5261
5262     if (FIXNUM_P(indx)) {
5263         idx = FIX2LONG(indx);
5264     }
5265     else if (RB_TYPE_P(indx, T_REGEXP)) {
5266         return rb_str_subpat(str, indx, INT2FIX(0));
5267     }
5268     else if (RB_TYPE_P(indx, T_STRING)) {
5269         if (rb_str_index(str, indx, 0) != -1)
5270             return str_duplicate(rb_cString, indx);
5271         return Qnil;
5272     }
5273     else {
5274         /* check if indx is Range */
5275         long beg, len = str_strlen(str, NULL);
5276         switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5277           case Qfalse:
5278             break;
5279           case Qnil:
5280             return Qnil;
5281           default:
5282             return rb_str_substr(str, beg, len);
5283         }
5284         idx = NUM2LONG(indx);
5285     }
5286
5287     return str_substr(str, idx, 1, FALSE);
5288 }
5289
5290
5291 /*
5292  *  call-seq:
5293  *    string[index] -> new_string or nil
5294  *    string[start, length] -> new_string or nil
5295  *    string[range] -> new_string or nil
5296  *    string[regexp, capture = 0] -> new_string or nil
5297  *    string[substring] -> new_string or nil
5298  *
5299  *  Returns the substring of +self+ specified by the arguments.
5300  *  See examples at {String Slices}[rdoc-ref:String@String+Slices].
5301  *
5302  *
5303  */
5304
5305 static VALUE
5306 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5307 {
5308     if (argc == 2) {
5309         if (RB_TYPE_P(argv[0], T_REGEXP)) {
5310             return rb_str_subpat(str, argv[0], argv[1]);
5311         }
5312         else {
5313             long beg = NUM2LONG(argv[0]);
5314             long len = NUM2LONG(argv[1]);
5315             return rb_str_substr(str, beg, len);
5316         }
5317     }
5318     rb_check_arity(argc, 1, 2);
5319     return rb_str_aref(str, argv[0]);
5320 }
5321
/*
 * Removes the first len bytes of str in place and returns str.  A len
 * larger than the current byte length is clamped to it.  The cached code
 * range of str is cleared.
 */
VALUE
rb_str_drop_bytes(VALUE str, long len)
{
    char *ptr = RSTRING_PTR(str);
    long olen = RSTRING_LEN(str), nlen;

    /* NOTE(review): presumably raises when str must not be modified -- confirm */
    str_modifiable(str);
    if (len > olen) len = olen;
    nlen = olen - len;
    if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
        /* the remainder plus terminator fits the embedded capacity */
        char *oldptr = ptr;
        /* remember the storage flags before STR_SET_EMBED() changes them */
        int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
        STR_SET_EMBED(str);
        ptr = RSTRING(str)->as.embed.ary;
        /* memmove: source and destination may overlap */
        memmove(ptr, oldptr + len, nlen);
        /* free the old buffer only when STR_NOEMBED was the sole flag set,
         * i.e. neither STR_SHARED nor STR_NOFREE was present */
        if (fl == STR_NOEMBED) xfree(oldptr);
    }
    else {
        if (!STR_SHARED_P(str)) {
            /* NOTE(review): presumably leaves str referring to the buffer of
             * the new frozen `shared` string -- confirm in heap_str_make_shared */
            VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
            rb_enc_cr_str_exact_copy(shared, str);
            OBJ_FREEZE(shared);
        }
        /* drop the prefix by advancing the heap pointer; no bytes are copied */
        ptr = RSTRING(str)->as.heap.ptr += len;
    }
    STR_SET_LEN(str, nlen);

    /* the terminator is written only when SHARABLE_MIDDLE_SUBSTRING is false */
    if (!SHARABLE_MIDDLE_SUBSTRING) {
        TERM_FILL(ptr + nlen, TERM_LEN(str));
    }
    ENC_CODERANGE_CLEAR(str);
    return str;
}
5355
/*
 * Replaces the len bytes of str that start at byte offset beg with the
 * vlen bytes of val that start at byte offset vbeg.  All offsets and
 * lengths are byte counts; they are used directly in pointer arithmetic.
 *
 * Removing a prefix (beg == 0 and nothing to insert) is delegated to
 * rb_str_drop_bytes().  Otherwise the code range of the result is taken
 * from val when str was 7-bit, and left unknown in every other case.
 */
static void
rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
{
    char *sptr;
    long slen;
    int cr;

    if (beg == 0 && vlen == 0) {
        rb_str_drop_bytes(str, len);
        return;
    }

    str_modify_keep_cr(str);
    RSTRING_GETMEM(str, sptr, slen);
    if (len < vlen) {
        /* expand string */
        RESIZE_CAPA(str, slen + vlen - len);
        /* re-read the pointer after the resize */
        sptr = RSTRING_PTR(str);
    }

    if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
        cr = rb_enc_str_coderange(val);
    else
        cr = ENC_CODERANGE_UNKNOWN;

    if (vlen != len) {
        /* shift the tail that follows the replaced region to its new place */
        memmove(sptr + beg + vlen,
                sptr + beg + len,
                slen - (beg + len));
    }
    /* NOTE(review): only reachable with a negative len; the callers visible
     * in this part of the file pass a non-negative len -- confirm */
    if (vlen < beg && len < 0) {
        MEMZERO(sptr + slen, char, -len);
    }
    if (vlen > 0) {
        /* memmove rather than memcpy: val may be str itself */
        memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
    }
    slen += vlen - len;
    STR_SET_LEN(str, slen);
    TERM_FILL(&sptr[slen], TERM_LEN(str));
    ENC_CODERANGE_SET(str, cr);
}
5397
5398 static inline void
5399 rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5400 {
5401     rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5402 }
5403
5404 void
5405 rb_str_update(VALUE str, long beg, long len, VALUE val)
5406 {
5407     long slen;
5408     char *p, *e;
5409     rb_encoding *enc;
5410     int singlebyte = single_byte_optimizable(str);
5411     int cr;
5412
5413     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5414
5415     StringValue(val);
5416     enc = rb_enc_check(str, val);
5417     slen = str_strlen(str, enc); /* rb_enc_check */
5418
5419     if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5420         rb_raise(rb_eIndexError, "index %ld out of string", beg);
5421     }
5422     if (beg < 0) {
5423         beg += slen;
5424     }
5425     RUBY_ASSERT(beg >= 0);
5426     RUBY_ASSERT(beg <= slen);
5427
5428     if (len > slen - beg) {
5429         len = slen - beg;
5430     }
5431     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5432     if (!p) p = RSTRING_END(str);
5433     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5434     if (!e) e = RSTRING_END(str);
5435     /* error check */
5436     beg = p - RSTRING_PTR(str); /* physical position */
5437     len = e - p;                /* physical length */
5438     rb_str_update_0(str, beg, len, val);
5439     rb_enc_associate(str, enc);
5440     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
5441     if (cr != ENC_CODERANGE_BROKEN)
5442         ENC_CODERANGE_SET(str, cr);
5443 }
5444
5445 static void
5446 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5447 {
5448     int nth;
5449     VALUE match;
5450     long start, end, len;
5451     rb_encoding *enc;
5452     struct re_registers *regs;
5453
5454     if (rb_reg_search(re, str, 0, 0) < 0) {
5455         rb_raise(rb_eIndexError, "regexp not matched");
5456     }
5457     match = rb_backref_get();
5458     nth = rb_reg_backref_number(match, backref);
5459     regs = RMATCH_REGS(match);
5460     if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5461         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5462     }
5463     if (nth < 0) {
5464         nth += regs->num_regs;
5465     }
5466
5467     start = BEG(nth);
5468     if (start == -1) {
5469         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5470     }
5471     end = END(nth);
5472     len = end - start;
5473     StringValue(val);
5474     enc = rb_enc_check_str(str, val);
5475     rb_str_update_0(str, start, len, val);
5476     rb_enc_associate(str, enc);
5477 }
5478
5479 static VALUE
5480 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5481 {
5482     long idx, beg;
5483
5484     switch (TYPE(indx)) {
5485       case T_REGEXP:
5486         rb_str_subpat_set(str, indx, INT2FIX(0), val);
5487         return val;
5488
5489       case T_STRING:
5490         beg = rb_str_index(str, indx, 0);
5491         if (beg < 0) {
5492             rb_raise(rb_eIndexError, "string not matched");
5493         }
5494         beg = rb_str_sublen(str, beg);
5495         rb_str_update(str, beg, str_strlen(indx, NULL), val);
5496         return val;
5497
5498       default:
5499         /* check if indx is Range */
5500         {
5501             long beg, len;
5502             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5503                 rb_str_update(str, beg, len, val);
5504                 return val;
5505             }
5506         }
5507         /* FALLTHROUGH */
5508
5509       case T_FIXNUM:
5510         idx = NUM2LONG(indx);
5511         rb_str_update(str, idx, 1, val);
5512         return val;
5513     }
5514 }
5515
5516 /*
5517  *  call-seq:
5518  *    string[index] = new_string
5519  *    string[start, length] = new_string
5520  *    string[range] = new_string
5521  *    string[regexp, capture = 0] = new_string
5522  *    string[substring] = new_string
5523  *
5524  *  Replaces all, some, or none of the contents of +self+; returns +new_string+.
5525  *  See {String Slices}[rdoc-ref:String@String+Slices].
5526  *
5527  *  A few examples:
5528  *
5529  *    s = 'foo'
5530  *    s[2] = 'rtune'     # => "rtune"
5531  *    s                  # => "fortune"
5532  *    s[1, 5] = 'init'   # => "init"
5533  *    s                  # => "finite"
5534  *    s[3..4] = 'al'     # => "al"
5535  *    s                  # => "finale"
5536  *    s[/e$/] = 'ly'     # => "ly"
5537  *    s                  # => "finally"
5538  *    s['lly'] = 'ncial' # => "ncial"
5539  *    s                  # => "financial"
5540  *
5541  */
5542
5543 static VALUE
5544 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5545 {
5546     if (argc == 3) {
5547         if (RB_TYPE_P(argv[0], T_REGEXP)) {
5548             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5549         }
5550         else {
5551             rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5552         }
5553         return argv[2];
5554     }
5555     rb_check_arity(argc, 2, 3);
5556     return rb_str_aset(str, argv[0], argv[1]);
5557 }
5558
5559 /*
5560  *  call-seq:
5561  *    insert(index, other_string) -> self
5562  *
5563  *  Inserts the given +other_string+ into +self+; returns +self+.
5564  *
 *  If the Integer +index+ is non-negative, inserts +other_string+ at offset +index+:
5566  *
5567  *    'foo'.insert(1, 'bar') # => "fbaroo"
5568  *
5569  *  If the Integer +index+ is negative, counts backward from the end of +self+
5570  *  and inserts +other_string+ at offset <tt>index+1</tt>
5571  *  (that is, _after_ <tt>self[index]</tt>):
5572  *
5573  *    'foo'.insert(-2, 'bar') # => "fobaro"
5574  *
5575  */
5576
5577 static VALUE
5578 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5579 {
5580     long pos = NUM2LONG(idx);
5581
5582     if (pos == -1) {
5583         return rb_str_append(str, str2);
5584     }
5585     else if (pos < 0) {
5586         pos++;
5587     }
5588     rb_str_update(str, pos, 0, str2);
5589     return str;
5590 }
5591
5592
5593 /*
5594  *  call-seq:
5595  *    slice!(index)               -> new_string or nil
5596  *    slice!(start, length)       -> new_string or nil
5597  *    slice!(range)               -> new_string or nil
5598  *    slice!(regexp, capture = 0) -> new_string or nil
5599  *    slice!(substring)           -> new_string or nil
5600  *
5601  *  Removes and returns the substring of +self+ specified by the arguments.
5602  *  See {String Slices}[rdoc-ref:String@String+Slices].
5603  *
5604  *  A few examples:
5605  *
5606  *     string = "This is a string"
5607  *     string.slice!(2)        #=> "i"
5608  *     string.slice!(3..6)     #=> " is "
5609  *     string.slice!(/s.*t/)   #=> "sa st"
5610  *     string.slice!("r")      #=> "r"
5611  *     string                  #=> "Thing"
5612  *
5613  */
5614
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    /*
     * Removes the part of str selected by the arguments and returns it, or
     * returns nil when nothing is selected.  The branches below only work
     * out what to remove and then jump to one of three labels:
     *   num_index - beg/len are character based and still need converting;
     *   subseq    - beg/len are byte based, the result has to be created;
     *   squash    - the result already exists, only the removal is left.
     */
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];
    if (RB_TYPE_P(indx, T_REGEXP)) {
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        /* without a second argument the whole match (group 0) is used */
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            /* a negative group index counts back from the number of
             * registers; a result of zero or less selects nothing */
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        /* BEG()/END() are used as byte offsets here */
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* (start, length) form */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* single index: len still holds its initial value of 1 */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /* substring form: the result is a copy of the argument itself */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* not a Range: treat the argument as an integer index */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    /* from here on beg is a byte offset and len is used as a byte count */
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    if (len > 0) {
        if (beg == 0) {
            /* removing a prefix */
            rb_str_drop_bytes(str, len);
        }
        else {
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            /* close the gap by moving the tail down over the removed bytes */
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
5701
5702 static VALUE
5703 get_pat(VALUE pat)
5704 {
5705     VALUE val;
5706
5707     switch (OBJ_BUILTIN_TYPE(pat)) {
5708       case T_REGEXP:
5709         return pat;
5710
5711       case T_STRING:
5712         break;
5713
5714       default:
5715         val = rb_check_string_type(pat);
5716         if (NIL_P(val)) {
5717             Check_Type(pat, T_REGEXP);
5718         }
5719         pat = val;
5720     }
5721
5722     return rb_reg_regcomp(pat);
5723 }
5724
5725 static VALUE
5726 get_pat_quoted(VALUE pat, int check)
5727 {
5728     VALUE val;
5729
5730     switch (OBJ_BUILTIN_TYPE(pat)) {
5731       case T_REGEXP:
5732         return pat;
5733
5734       case T_STRING:
5735         break;
5736
5737       default:
5738         val = rb_check_string_type(pat);
5739         if (NIL_P(val)) {
5740             Check_Type(pat, T_REGEXP);
5741         }
5742         pat = val;
5743     }
5744     if (check && is_broken_string(pat)) {
5745         rb_exc_raise(rb_reg_check_preprocess(pat));
5746     }
5747     return pat;
5748 }
5749
5750 static long
5751 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5752 {
5753     if (BUILTIN_TYPE(pat) == T_STRING) {
5754         pos = rb_str_byteindex(str, pat, pos);
5755         if (set_backref_str) {
5756             if (pos >= 0) {
5757                 str = rb_str_new_frozen_String(str);
5758                 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5759             }
5760             else {
5761                 rb_backref_set(Qnil);
5762             }
5763         }
5764         return pos;
5765     }
5766     else {
5767         return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5768     }
5769 }
5770
5771
5772 /*
5773  *  call-seq:
5774  *    sub!(pattern, replacement)   -> self or nil
5775  *    sub!(pattern) {|match| ... } -> self or nil
5776  *
 *  Replaces the first occurrence (not all occurrences) of the given +pattern+
 *  in +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
5779  *
5780  *  See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5781  *
5782  *  Related: String#sub, String#gsub, String#gsub!.
5783  *
5784  */
5785
static VALUE
rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
{
    /*
     * Replaces the first match of the pattern in str, in place.  Returns
     * str when a replacement was made and nil when the pattern did not
     * match.  The replacement comes from the block (one argument), from a
     * Hash looked up by the matched text, or from a replacement String.
     */
    VALUE pat, repl, hash = Qnil;
    int iter = 0;
    long plen;
    /* with a block the replacement argument may be omitted */
    int min_arity = rb_block_given_p() ? 1 : 2;
    long beg;

    rb_check_arity(argc, min_arity, 2);
    if (argc == 1) {
        /* only the pattern was given: the block supplies the replacement */
        iter = 1;
    }
    else {
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
    }

    pat = get_pat_quoted(argv[0], 1);

    str_modifiable(str);
    beg = rb_pat_search(pat, str, 0, 1);
    if (beg >= 0) {
        rb_encoding *enc;
        int cr = ENC_CODERANGE(str);
        long beg0, end0;
        VALUE match, match0 = Qnil;
        struct re_registers *regs;
        char *p, *rp;
        long len, rlen;

        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        /* beg0/end0: byte range of the matched text within str */
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (iter) match0 = rb_reg_nth_match(0, match);
        }

        if (iter || !NIL_P(hash)) {
            /* remember pointer and length so that str_mod_check() can compare
             * them once the block or the hash lookup has run */
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);

            if (iter) {
                repl = rb_obj_as_string(rb_yield(match0));
            }
            else {
                repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                repl = rb_obj_as_string(repl);
            }
            /* NOTE(review): presumably raises when str was changed in the
             * meantime -- confirm in str_mod_check */
            str_mod_check(str, p, len);
            rb_check_frozen(str);
        }
        else {
            repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
        }

        enc = rb_enc_compatible(str, repl);
        if (!enc) {
            /*
             * No compatible encoding.  That is tolerated only when
             * everything before and after the matched region scans as
             * 7-bit; the result then takes the encoding of the replacement.
             */
            rb_encoding *str_enc = STR_ENC_GET(str);
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);
            if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
                coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
                rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
                         rb_enc_inspect_name(str_enc),
                         rb_enc_inspect_name(STR_ENC_GET(repl)));
            }
            enc = STR_ENC_GET(repl);
        }
        rb_str_modify(str);
        rb_enc_associate(str, enc);
        /* derive the new code range only when the old one lies strictly
         * between UNKNOWN and BROKEN */
        if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
            int cr2 = ENC_CODERANGE(repl);
            if (cr2 == ENC_CODERANGE_BROKEN ||
                (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
                cr = ENC_CODERANGE_UNKNOWN;
            else
                cr = cr2;
        }
        /* plen: bytes being replaced; rlen: bytes of the replacement */
        plen = end0 - beg0;
        rlen = RSTRING_LEN(repl);
        len = RSTRING_LEN(str);
        if (rlen > plen) {
            RESIZE_CAPA(str, len + rlen - plen);
        }
        /* read the pointer only after the possible resize */
        p = RSTRING_PTR(str);
        if (rlen != plen) {
            /* move the text after the match to where it belongs afterwards */
            memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
        }
        rp = RSTRING_PTR(repl);
        memmove(p + beg0, rp, rlen);
        len += rlen - plen;
        STR_SET_LEN(str, len);
        TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
        ENC_CODERANGE_SET(str, cr);

        /* regs points into match, so match is guarded up to this point */
        RB_GC_GUARD(match);

        return str;
    }
    return Qnil;
}
5895
5896
5897 /*
5898  *  call-seq:
5899  *    sub(pattern, replacement)   -> new_string
5900  *    sub(pattern) {|match| ... } -> new_string
5901  *
5902  *  Returns a copy of +self+ with only the first occurrence
5903  *  (not all occurrences) of the given +pattern+ replaced.
5904  *
5905  *  See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5906  *
5907  *  Related: String#sub!, String#gsub, String#gsub!.
5908  *
5909  */
5910
5911 static VALUE
5912 rb_str_sub(int argc, VALUE *argv, VALUE str)
5913 {
5914     str = str_duplicate(rb_cString, str);
5915     rb_str_sub_bang(argc, argv, str);
5916     return str;
5917 }
5918
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0): replaces every match of the pattern in str.
 *
 * The replacement comes from one of three sources, tracked in `mode`:
 *   STR  - a replacement String (second argument);
 *   ITER - the block (only the pattern was given);
 *   MAP  - a Hash (second argument), looked up by the matched text.
 *
 * The result is assembled in a fresh buffer `dest`.  Without any match,
 * gsub! returns nil and gsub returns a duplicate of str.  Otherwise gsub!
 * replaces the contents of str with dest and returns str, and gsub returns
 * dest.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    enum {STR, ITER, MAP} mode = STR;
    char *sp, *cp;
    /* tri-state: -1 = not decided yet, 0 = not needed, 1 = needed */
    int need_backref = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        else {
            mode = MAP;
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search(pat, str, 0, need_backref);
    if (beg < 0) {
        if (bang) return Qnil;  /* no match, no substitution */
        return str_duplicate(rb_cString, str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen are kept for the str_mod_check() calls inside the loop */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    /* cp: start of the text in str that has not been copied to dest yet */
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        /* beg0/end0: byte range of the current match within str */
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        /* non-zero mode means ITER or MAP */
        if (mode) {
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                val = rb_obj_as_string(val);
            }
            /* NOTE(review): presumably raises when str was changed by the
             * block or the hash lookup -- confirm in str_mod_check */
            str_mod_check(str, sp, slen);
            if (val == dest) {  /* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref) {
            val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
            if (need_backref < 0) {
                /* first match decides: when rb_reg_regsub() handed back repl
                 * itself, later matches use repl directly and the searches
                 * are told not to set the backref */
                need_backref = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset;    /* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* last: the offset from which the match just handled was searched */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_pat_search(pat, str, offset, need_backref);

        /* regs points into match, so match is guarded up to this point */
        RB_GC_GUARD(match);
    } while (beg >= 0);
    /* copy whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* repeat the search for the last handled match, this time always asking
     * for the backref to be set */
    rb_pat_search(pat, str, last, 1);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        str = dest;
    }

    return str;
}
6041
6042
6043 /*
6044  *  call-seq:
6045  *     gsub!(pattern, replacement)   -> self or nil
6046  *     gsub!(pattern) {|match| ... } -> self or nil
6047  *     gsub!(pattern)                -> an_enumerator
6048  *
6049  *  Performs the specified substring replacement(s) on +self+;
6050  *  returns +self+ if any replacement occurred, +nil+ otherwise.
6051  *
6052  *  See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6053  *
6054  *  Returns an Enumerator if no +replacement+ and no block given.
6055  *
6056  *  Related: String#sub, String#gsub, String#sub!.
6057  *
6058  */
6059
6060 static VALUE
6061 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6062 {
6063     str_modify_keep_cr(str);
6064     return str_gsub(argc, argv, str, 1);
6065 }
6066
6067
6068 /*
6069  *  call-seq:
6070  *     gsub(pattern, replacement)   -> new_string
6071  *     gsub(pattern) {|match| ... } -> new_string
6072  *     gsub(pattern)                -> enumerator
6073  *
6074  *  Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6075  *
6076  *  See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6077  *
6078  *  Returns an Enumerator if no +replacement+ and no block given.
6079  *
6080  *  Related: String#sub, String#sub!, String#gsub!.
6081  *
6082  */
6083
6084 static VALUE
6085 rb_str_gsub(int argc, VALUE *argv, VALUE str)
6086 {
6087     return str_gsub(argc, argv, str, 0);
6088 }
6089
6090
6091 /*
6092  *  call-seq:
6093  *    replace(other_string) -> self
6094  *
6095  *  Replaces the contents of +self+ with the contents of +other_string+:
6096  *
6097  *    s = 'foo'        # => "foo"
6098  *    s.replace('bar') # => "bar"
6099  *
6100  */
6101
6102 VALUE
6103 rb_str_replace(VALUE str, VALUE str2)
6104 {
6105     str_modifiable(str);
6106     if (str == str2) return str;
6107
6108     StringValue(str2);
6109     str_discard(str);
6110     return str_replace(str, str2);
6111 }
6112
6113 /*
6114  *  call-seq:
6115  *    clear -> self
6116  *
6117  *  Removes the contents of +self+:
6118  *
6119  *    s = 'foo' # => "foo"
6120  *    s.clear   # => ""
6121  *
6122  */
6123
6124 static VALUE
6125 rb_str_clear(VALUE str)
6126 {
6127     str_discard(str);
6128     STR_SET_EMBED(str);
6129     STR_SET_LEN(str, 0);
6130     RSTRING_PTR(str)[0] = 0;
6131     if (rb_enc_asciicompat(STR_ENC_GET(str)))
6132         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
6133     else
6134         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
6135     return str;
6136 }
6137
6138 /*
6139  *  call-seq:
6140  *    chr -> string
6141  *
6142  *  Returns a string containing the first character of +self+:
6143  *
6144  *    s = 'foo' # => "foo"
6145  *    s.chr     # => "f"
6146  *
6147  */
6148
6149 static VALUE
6150 rb_str_chr(VALUE str)
6151 {
6152     return rb_str_substr(str, 0, 1);
6153 }
6154
6155 /*
6156  *  call-seq:
6157  *    getbyte(index) -> integer or nil
6158  *
6159  *  Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6160  *
6161  *    s = 'abcde'   # => "abcde"
6162  *    s.getbyte(0)  # => 97
6163  *    s.getbyte(-1) # => 101
6164  *    s.getbyte(5)  # => nil
6165  *
6166  *  Related: String#setbyte.
6167  */
6168 VALUE
6169 rb_str_getbyte(VALUE str, VALUE index)
6170 {
6171     long pos = NUM2LONG(index);
6172
6173     if (pos < 0)
6174         pos += RSTRING_LEN(str);
6175     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
6176         return Qnil;
6177
6178     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6179 }
6180
6181 /*
6182  *  call-seq:
6183  *    setbyte(index, integer) -> integer
6184  *
6185  *  Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6186  *
6187  *    s = 'abcde'      # => "abcde"
6188  *    s.setbyte(0, 98) # => 98
6189  *    s                # => "bbcde"
6190  *
6191  *  Related: String#getbyte.
6192  */
VALUE
rb_str_setbyte(VALUE str, VALUE index, VALUE value)
{
    /*
     * Overwrites the byte at +index+ and keeps the cached code range of
     * +str+ in step with the new contents.
     *
     * Raises IndexError when +index+ is outside -len...len.
     * Returns +value+ exactly as passed in (not the masked byte).
     */
    long pos = NUM2LONG(index);
    long len = RSTRING_LEN(str);
    char *ptr, *head, *left = 0;
    rb_encoding *enc;
    int cr = ENC_CODERANGE_UNKNOWN, width, nlen;

    if (pos < -len || len <= pos)
        rb_raise(rb_eIndexError, "index %ld out of string", pos);
    if (pos < 0)
        pos += len;

    /* convert to Integer and keep only the low 8 bits */
    VALUE v = rb_to_int(value);
    VALUE w = rb_int_and(v, INT2FIX(0xff));
    char byte = (char)(NUM2INT(w) & 0xFF);

    /* NOTE(review): presumably gives str a private buffer before writing -- confirm against str_make_independent */
    if (!str_independent(str))
        str_make_independent(str);
    enc = STR_ENC_GET(str);
    head = RSTRING_PTR(str);
    ptr = &head[pos];
    /* Only non-embedded strings get the incremental code-range update;
     * embedded strings fall through to the plain "clear and write" path. */
    if (!STR_EMBED_P(str)) {
        cr = ENC_CODERANGE(str);
        switch (cr) {
          case ENC_CODERANGE_7BIT:
            left = ptr;
            *ptr = byte;
            /* ASCII over 7bit content: the code range is unchanged */
            if (ISASCII(byte)) goto end;
            /* otherwise re-check the character starting at the new byte */
            nlen = rb_enc_precise_mbclen(left, head+len, enc);
            if (!MBCLEN_CHARFOUND_P(nlen))
                ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
            else
                ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
            goto end;
          case ENC_CODERANGE_VALID:
            /* measure the character containing pos before and after the write */
            left = rb_enc_left_char_head(head, ptr, head+len, enc);
            width = rb_enc_precise_mbclen(left, head+len, enc);
            *ptr = byte;
            nlen = rb_enc_precise_mbclen(left, head+len, enc);
            if (!MBCLEN_CHARFOUND_P(nlen))
                ENC_CODERANGE_SET(str, ENC_CODERANGE_BROKEN);
            /* a changed character width or an ASCII byte forces a later rescan */
            else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
                ENC_CODERANGE_CLEAR(str);
            goto end;
        }
    }
    /* any other case: forget the cached code range and store the byte */
    ENC_CODERANGE_CLEAR(str);
    *ptr = byte;

  end:
    return value;
}
6247
6248 static VALUE
6249 str_byte_substr(VALUE str, long beg, long len, int empty)
6250 {
6251     long n = RSTRING_LEN(str);
6252
6253     if (beg > n || len < 0) return Qnil;
6254     if (beg < 0) {
6255         beg += n;
6256         if (beg < 0) return Qnil;
6257     }
6258     if (len > n - beg)
6259         len = n - beg;
6260     if (len <= 0) {
6261         if (!empty) return Qnil;
6262         len = 0;
6263     }
6264
6265     VALUE str2 = str_subseq(str, beg, len);
6266
6267     str_enc_copy_direct(str2, str);
6268
6269     if (RSTRING_LEN(str2) == 0) {
6270         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6271             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
6272         else
6273             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6274     }
6275     else {
6276         switch (ENC_CODERANGE(str)) {
6277           case ENC_CODERANGE_7BIT:
6278             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
6279             break;
6280           default:
6281             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
6282             break;
6283         }
6284     }
6285
6286     return str2;
6287 }
6288
6289 VALUE
6290 rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6291 {
6292     return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6293 }
6294
6295 static VALUE
6296 str_byte_aref(VALUE str, VALUE indx)
6297 {
6298     long idx;
6299     if (FIXNUM_P(indx)) {
6300         idx = FIX2LONG(indx);
6301     }
6302     else {
6303         /* check if indx is Range */
6304         long beg, len = RSTRING_LEN(str);
6305
6306         switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6307           case Qfalse:
6308             break;
6309           case Qnil:
6310             return Qnil;
6311           default:
6312             return str_byte_substr(str, beg, len, TRUE);
6313         }
6314
6315         idx = NUM2LONG(indx);
6316     }
6317     return str_byte_substr(str, idx, 1, FALSE);
6318 }
6319
6320 /*
6321  *  call-seq:
6322  *    byteslice(index, length = 1) -> string or nil
6323  *    byteslice(range)             -> string or nil
6324  *
6325  *  Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6326  *
6327  *  With integer arguments +index+ and +length+ given,
6328  *  returns the substring beginning at the given +index+
6329  *  of the given +length+ (if possible),
6330  *  or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6331  *
6332  *    s = '0123456789' # => "0123456789"
6333  *    s.byteslice(2)   # => "2"
6334  *    s.byteslice(200) # => nil
6335  *    s.byteslice(4, 3)  # => "456"
6336  *    s.byteslice(4, 30) # => "456789"
6337  *    s.byteslice(4, -1) # => nil
6338  *    s.byteslice(40, 2) # => nil
6339  *
6340  *  In either case above, counts backwards from the end of +self+
6341  *  if +index+ is negative:
6342  *
6343  *    s = '0123456789'   # => "0123456789"
6344  *    s.byteslice(-4)    # => "6"
6345  *    s.byteslice(-4, 3) # => "678"
6346  *
6347  *  With Range argument +range+ given, returns
6348  *  <tt>byteslice(range.begin, range.size)</tt>:
6349  *
6350  *    s = '0123456789'    # => "0123456789"
6351  *    s.byteslice(4..6)   # => "456"
6352  *    s.byteslice(-6..-4) # => "456"
6353  *    s.byteslice(5..2)   # => "" # range.size is zero.
6354  *    s.byteslice(40..42) # => nil
6355  *
6356  *  In all cases, a returned string has the same encoding as +self+:
6357  *
6358  *    s.encoding              # => #<Encoding:UTF-8>
6359  *    s.byteslice(4).encoding # => #<Encoding:UTF-8>
6360  *
6361  */
6362
6363 static VALUE
6364 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6365 {
6366     if (argc == 2) {
6367         long beg = NUM2LONG(argv[0]);
6368         long len = NUM2LONG(argv[1]);
6369         return str_byte_substr(str, beg, len, TRUE);
6370     }
6371     rb_check_arity(argc, 1, 2);
6372     return str_byte_aref(str, argv[0]);
6373 }
6374
6375 static void
6376 str_check_beg_len(VALUE str, long *beg, long *len)
6377 {
6378     long end, slen = RSTRING_LEN(str);
6379
6380     if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6381     if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6382         rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6383     }
6384     if (*beg < 0) {
6385         *beg += slen;
6386     }
6387     RUBY_ASSERT(*beg >= 0);
6388     RUBY_ASSERT(*beg <= slen);
6389
6390     if (*len > slen - *beg) {
6391         *len = slen - *beg;
6392     }
6393     end = *beg + *len;
6394     str_ensure_byte_pos(str, *beg);
6395     str_ensure_byte_pos(str, end);
6396 }
6397
6398 /*
6399  *  call-seq:
6400  *    bytesplice(index, length, str) -> string
6401  *    bytesplice(index, length, str, str_index, str_length) -> string
6402  *    bytesplice(range, str) -> string
6403  *    bytesplice(range, str, str_range) -> string
6404  *
6405  *  Replaces some or all of the content of +self+ with +str+, and returns +self+.
6406  *  The portion of the string affected is determined using
6407  *  the same criteria as String#byteslice, except that +length+ cannot be omitted.
6408  *  If the replacement string is not the same length as the text it is replacing,
6409  *  the string will be adjusted accordingly.
6410  *
6411  *  If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6412  *
 *  The forms that take an Integer will raise an IndexError if the value is out
6414  *  of range; the Range form will raise a RangeError.
6415  *  If the beginning or ending offset does not land on character (codepoint)
6416  *  boundary, an IndexError will be raised.
6417  */
6418
static VALUE
rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
{
    /*
     * beg/len  : byte span of +str+ that gets replaced
     * vbeg/vlen: byte span of the replacement string +val+ that is copied in
     */
    long beg, len, vbeg, vlen;
    VALUE val;
    rb_encoding *enc;
    int cr;

    /* accepted argument counts are 2, 3 and 5; 4 is rejected explicitly */
    rb_check_arity(argc, 2, 5);
    if (!(argc == 2 || argc == 3 || argc == 5)) {
        rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
    }
    /* Range forms: two arguments, or three whose first is not an Integer */
    if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
        if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
            rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
                     rb_builtin_class_name(argv[0]));
        }
        val = argv[1];
        StringValue(val);
        if (argc == 2) {
            /* bytesplice(range, str) */
            vbeg = 0;
            vlen = RSTRING_LEN(val);
        }
        else {
            /* bytesplice(range, str, str_range) */
            if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
                rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
                         rb_builtin_class_name(argv[2]));
            }
        }
    }
    /* Integer forms: (index, length, str) with optional (str_index, str_length) */
    else {
        beg = NUM2LONG(argv[0]);
        len = NUM2LONG(argv[1]);
        val = argv[2];
        StringValue(val);
        if (argc == 3) {
            /* bytesplice(index, length, str) */
            vbeg = 0;
            vlen = RSTRING_LEN(val);
        }
        else {
            /* bytesplice(index, length, str, str_index, str_length) */
            vbeg = NUM2LONG(argv[3]);
            vlen = NUM2LONG(argv[4]);
        }
    }
    /* both spans are validated and normalized before str is touched */
    str_check_beg_len(str, &beg, &len);
    str_check_beg_len(val, &vbeg, &vlen);
    enc = rb_enc_check(str, val);
    str_modify_keep_cr(str);
    rb_str_update_1(str, beg, len, val, vbeg, vlen);
    rb_enc_associate(str, enc);
    /* combine both code ranges; a BROKEN result is not written back here */
    cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
    if (cr != ENC_CODERANGE_BROKEN)
        ENC_CODERANGE_SET(str, cr);
    return str;
}
6478
6479 /*
6480  *  call-seq:
6481  *    reverse -> string
6482  *
6483  *  Returns a new string with the characters from +self+ in reverse order.
6484  *
6485  *    'stressed'.reverse # => "desserts"
6486  *
6487  */
6488
6489 static VALUE
6490 rb_str_reverse(VALUE str)
6491 {
6492     rb_encoding *enc;
6493     VALUE rev;
6494     char *s, *e, *p;
6495     int cr;
6496
6497     if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6498     enc = STR_ENC_GET(str);
6499     rev = rb_str_new(0, RSTRING_LEN(str));
6500     s = RSTRING_PTR(str); e = RSTRING_END(str);
6501     p = RSTRING_END(rev);
6502     cr = ENC_CODERANGE(str);
6503
6504     if (RSTRING_LEN(str) > 1) {
6505         if (single_byte_optimizable(str)) {
6506             while (s < e) {
6507                 *--p = *s++;
6508             }
6509         }
6510         else if (cr == ENC_CODERANGE_VALID) {
6511             while (s < e) {
6512                 int clen = rb_enc_fast_mbclen(s, e, enc);
6513
6514                 p -= clen;
6515                 memcpy(p, s, clen);
6516                 s += clen;
6517             }
6518         }
6519         else {
6520             cr = rb_enc_asciicompat(enc) ?
6521                 ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
6522             while (s < e) {
6523                 int clen = rb_enc_mbclen(s, e, enc);
6524
6525                 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6526                 p -= clen;
6527                 memcpy(p, s, clen);
6528                 s += clen;
6529             }
6530         }
6531     }
6532     STR_SET_LEN(rev, RSTRING_LEN(str));
6533     str_enc_copy_direct(rev, str);
6534     ENC_CODERANGE_SET(rev, cr);
6535
6536     return rev;
6537 }
6538
6539
6540 /*
6541  *  call-seq:
6542  *    reverse! -> self
6543  *
6544  *  Returns +self+ with its characters reversed:
6545  *
6546  *    s = 'stressed'
6547  *    s.reverse! # => "desserts"
6548  *    s          # => "desserts"
6549  *
6550  */
6551
6552 static VALUE
6553 rb_str_reverse_bang(VALUE str)
6554 {
6555     if (RSTRING_LEN(str) > 1) {
6556         if (single_byte_optimizable(str)) {
6557             char *s, *e, c;
6558
6559             str_modify_keep_cr(str);
6560             s = RSTRING_PTR(str);
6561             e = RSTRING_END(str) - 1;
6562             while (s < e) {
6563                 c = *s;
6564                 *s++ = *e;
6565                 *e-- = c;
6566             }
6567         }
6568         else {
6569             str_shared_replace(str, rb_str_reverse(str));
6570         }
6571     }
6572     else {
6573         str_modify_keep_cr(str);
6574     }
6575     return str;
6576 }
6577
6578
6579 /*
6580  *  call-seq:
6581  *    include?(other_string) -> true or false
6582  *
6583  *  Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6584  *
6585  *    s = 'foo'
6586  *    s.include?('f')    # => true
6587  *    s.include?('fo')   # => true
6588  *    s.include?('food') # => false
6589  *
6590  */
6591
6592 VALUE
6593 rb_str_include(VALUE str, VALUE arg)
6594 {
6595     long i;
6596
6597     StringValue(arg);
6598     i = rb_str_index(str, arg, 0);
6599
6600     return RBOOL(i != -1);
6601 }
6602
6603
6604 /*
6605  *  call-seq:
6606  *    to_i(base = 10) -> integer
6607  *
6608  *  Returns the result of interpreting leading characters in +self+
6609  *  as an integer in the given +base+ (which must be in (0, 2..36)):
6610  *
6611  *    '123456'.to_i     # => 123456
6612  *    '123def'.to_i(16) # => 1195503
6613  *
6614  *  With +base+ zero, string +object+ may contain leading characters
6615  *  to specify the actual base:
6616  *
6617  *    '123def'.to_i(0)   # => 123
6618  *    '0123def'.to_i(0)  # => 83
6619  *    '0b123def'.to_i(0) # => 1
6620  *    '0o123def'.to_i(0) # => 83
6621  *    '0d123def'.to_i(0) # => 123
6622  *    '0x123def'.to_i(0) # => 1195503
6623  *
6624  *  Characters past a leading valid number (in the given +base+) are ignored:
6625  *
6626  *    '12.345'.to_i   # => 12
6627  *    '12345'.to_i(2) # => 1
6628  *
6629  *  Returns zero if there is no leading valid number:
6630  *
6631  *    'abcdef'.to_i # => 0
6632  *    '2'.to_i(2)   # => 0
6633  *
6634  */
6635
6636 static VALUE
6637 rb_str_to_i(int argc, VALUE *argv, VALUE str)
6638 {
6639     int base = 10;
6640
6641     if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6642         rb_raise(rb_eArgError, "invalid radix %d", base);
6643     }
6644     return rb_str_to_inum(str, base, FALSE);
6645 }
6646
6647
6648 /*
6649  *  call-seq:
6650  *    to_f -> float
6651  *
6652  *  Returns the result of interpreting leading characters in +self+ as a Float:
6653  *
6654  *    '3.14159'.to_f  # => 3.14159
6655  *    '1.234e-2'.to_f # => 0.01234
6656  *
 *  Characters past a leading valid number are ignored:
6658  *
6659  *    '3.14 (pi to two places)'.to_f # => 3.14
6660  *
6661  *  Returns zero if there is no leading valid number:
6662  *
6663  *    'abcdef'.to_f # => 0.0
6664  *
6665  */
6666
6667 static VALUE
6668 rb_str_to_f(VALUE str)
6669 {
6670     return DBL2NUM(rb_str_to_dbl(str, FALSE));
6671 }
6672
6673
6674 /*
6675  *  call-seq:
6676  *    to_s -> self or string
6677  *
6678  *  Returns +self+ if +self+ is a +String+,
6679  *  or +self+ converted to a +String+ if +self+ is a subclass of +String+.
6680  */
6681
6682 static VALUE
6683 rb_str_to_s(VALUE str)
6684 {
6685     if (rb_obj_class(str) != rb_cString) {
6686         return str_duplicate(rb_cString, str);
6687     }
6688     return str;
6689 }
6690
#if 0
/*
 * Disabled helper (the surrounding "#if 0" keeps it out of the build).
 * Encodes code point +c+ for +enc+ into a stack buffer and appends the
 * resulting bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char s[RUBY_MAX_CHAR_LEN];
    int n = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, s, enc);
    rb_enc_str_buf_cat(str, s, n, enc);
}
#endif
6702
/*
 * Buffer size needed for the longest escape produced in this file:
 * "\x{" or "\u{" (3 bytes) + up to 8 hex digits + "}" + terminating NUL.
 */
#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6704
6705 int
6706 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6707 {
6708     char buf[CHAR_ESC_LEN + 1];
6709     int l;
6710
6711 #if SIZEOF_INT > 4
6712     c &= 0xffffffff;
6713 #endif
6714     if (unicode_p) {
6715         if (c < 0x7F && ISPRINT(c)) {
6716             snprintf(buf, CHAR_ESC_LEN, "%c", c);
6717         }
6718         else if (c < 0x10000) {
6719             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6720         }
6721         else {
6722             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6723         }
6724     }
6725     else {
6726         if (c < 0x100) {
6727             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6728         }
6729         else {
6730             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6731         }
6732     }
6733     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
6734     rb_str_buf_cat(result, buf, l);
6735     return l;
6736 }
6737
const char *
ruby_escaped_char(int c)
{
    /*
     * Maps a character to its backslash escape sequence, or returns NULL
     * when the character has no dedicated escape.
     *
     * The double quote and the backslash itself are part of the table:
     * without them an escaped string could not be told apart from its
     * source (a literal backslash followed by 'n' would read the same as
     * an escaped newline).
     */
    switch (c) {
      case '"': return "\\\"";
      case '\\': return "\\\\";
      case '\0': return "\\0";
      case '\n': return "\\n";
      case '\r': return "\\r";
      case '\t': return "\\t";
      case '\f': return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
    }
    return NULL;
}
6755
VALUE
rb_str_escape(VALUE str)
{
    /*
     * Returns a new string holding +str+ with special, non-printable and
     * invalid bytes replaced by escape sequences.  The result is tagged
     * US-ASCII / 7bit.
     *
     * Bytes that need no escaping are not copied one by one: +prev+ marks
     * the start of the pending unescaped run, which is flushed whenever an
     * escape has to be emitted and once more at the end.
     */
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    const char *p = RSTRING_PTR(str);
    const char *pend = RSTRING_END(str);
    const char *prev = p;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    while (p < pend) {
        unsigned int c;
        const char *cc;
        int n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /* no valid character here: emit up to mbminlen raw bytes as \xNN */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;
        cc = ruby_escaped_char(c);
        if (cc) {
            /* character with a dedicated escape sequence */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat(result, cc, strlen(cc));
            prev = p;
        }
        else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
            /* printable ASCII: stays in the pending run */
        }
        else {
            /* everything else gets a numeric \x or \u escape */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
        }
    }
    if (p > prev) str_buf_cat(result, prev, p - prev);
    ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);

    return result;
}
6807
6808 /*
6809  *  call-seq:
6810  *    inspect -> string
6811  *
6812  *  Returns a printable version of +self+, enclosed in double-quotes,
6813  *  and with special characters escaped:
6814  *
6815  *    s = "foo\tbar\tbaz\n"
6816  *    s.inspect
6817  *    # => "\"foo\\tbar\\tbaz\\n\""
6818  *
6819  */
6820
VALUE
rb_str_inspect(VALUE str)
{
    /*
     * Builds the double-quoted, escaped representation of +str+.
     *
     * The result encoding (+resenc+) is the default internal encoding,
     * falling back to the default external one, and to US-ASCII when that
     * is not ASCII compatible.  As in rb_str_escape, +prev+ marks the start
     * of the run of bytes that can be copied verbatim.
     */
    int encidx = ENCODING_GET(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    const char *p, *pend, *prev;
    char buf[CHAR_ESC_LEN + 1];
    VALUE result = rb_str_buf_new(0);
    rb_encoding *resenc = rb_default_internal_encoding();
    int unicode_p = rb_enc_unicode_p(enc);
    int asciicompat = rb_enc_asciicompat(enc);

    if (resenc == NULL) resenc = rb_default_external_encoding();
    if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
    rb_enc_associate(result, resenc);
    str_buf_cat2(result, "\"");

    p = RSTRING_PTR(str); pend = RSTRING_END(str);
    prev = p;
    while (p < pend) {
        unsigned int c, cc;
        int n;

        n = rb_enc_precise_mbclen(p, pend, enc);
        if (!MBCLEN_CHARFOUND_P(n)) {
            /* invalid byte sequence: dump up to mbminlen bytes as \xNN */
            if (p > prev) str_buf_cat(result, prev, p - prev);
            n = rb_enc_mbminlen(enc);
            if (pend < p + n)
                n = (int)(pend - p);
            while (n--) {
                snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
                str_buf_cat(result, buf, strlen(buf));
                prev = ++p;
            }
            continue;
        }
        n = MBCLEN_CHARFOUND_LEN(n);
        c = rb_enc_mbc_to_codepoint(p, pend, enc);
        p += n;
        /* '"', '\\' and a '#' that is followed by '$', '@' or '{' get a
         * backslash in front of them */
        if ((asciicompat || unicode_p) &&
          (c == '"'|| c == '\\' ||
            (c == '#' &&
             p < pend &&
             MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
             (cc = rb_enc_codepoint(p,pend,enc),
              (cc == '$' || cc == '@' || cc == '{'))))) {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            str_buf_cat2(result, "\\");
            if (asciicompat || enc == resenc) {
                /* the character itself becomes the start of the next verbatim run */
                prev = p - n;
                continue;
            }
        }
        switch (c) {
          case '\n': cc = 'n'; break;
          case '\r': cc = 'r'; break;
          case '\t': cc = 't'; break;
          case '\f': cc = 'f'; break;
          case '\013': cc = 'v'; break;
          case '\010': cc = 'b'; break;
          case '\007': cc = 'a'; break;
          case 033: cc = 'e'; break;
          default: cc = 0; break;
        }
        if (cc) {
            /* control character with a one-letter escape */
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            buf[0] = '\\';
            buf[1] = (char)cc;
            str_buf_cat(result, buf, 2);
            prev = p;
            continue;
        }
        /* The special casing of 0x85 (NEXT_LINE) here is because
         * Oniguruma historically treats it as printable, but it
         * doesn't match the print POSIX bracket class or character
         * property in regexps.
         *
         * See Ruby Bug #16842 for details:
         * https://bugs.ruby-lang.org/issues/16842
         */
        if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
            (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
            /* printable: leave it in the pending verbatim run */
            continue;
        }
        else {
            if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
            rb_str_buf_cat_escaped_char(result, c, unicode_p);
            prev = p;
            continue;
        }
    }
    if (p > prev) str_buf_cat(result, prev, p - prev);
    str_buf_cat2(result, "\"");

    return result;
}
6917
/*
 * True when +p+ is still before +e+ and points at '$', '@' or '{'.
 * Used right after a '#' to decide whether that '#' must be escaped.
 */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6919
6920 /*
6921  *  call-seq:
6922  *    dump -> string
6923  *
6924  *  Returns a printable version of +self+, enclosed in double-quotes,
6925  *  with special characters escaped, and with non-printing characters
6926  *  replaced by hexadecimal notation:
6927  *
6928  *    "hello \n ''".dump    # => "\"hello \\n ''\""
6929  *    "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6930  *
6931  *  Related: String#undump (inverse of String#dump).
6932  *
6933  */
6934
VALUE
rb_str_dump(VALUE str)
{
    /*
     * Works in two passes over the bytes of +str+:
     *   1. compute the exact length of the dumped form,
     *   2. allocate the result once and write the escapes into it.
     * Both passes must classify every byte the same way, otherwise the
     * buffer size computed in pass 1 would not match what pass 2 writes.
     */
    int encidx = rb_enc_get_index(str);
    rb_encoding *enc = rb_enc_from_index(encidx);
    long len;
    const char *p, *pend;
    char *q, *qend;
    VALUE result;
    int u8 = (encidx == rb_utf8_encindex());
    static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";

    len = 2;                    /* "" */
    if (!rb_enc_asciicompat(enc)) {
        /* room for the suffix with "%s" replaced by the encoding name */
        len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
        len += strlen(enc->name);
    }

    /* pass 1: size calculation */
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    while (p < pend) {
        int clen;
        unsigned char c = *p++;

        switch (c) {
          case '"':  case '\\':
          case '\n': case '\r':
          case '\t': case '\f':
          case '\013': case '\010': case '\007': case '\033':
            clen = 2;
            break;

          case '#':
            clen = IS_EVSTR(p, pend) ? 2 : 1;
            break;

          default:
            if (ISPRINT(c)) {
                clen = 1;
            }
            else {
                if (u8 && c > 0x7F) {   /* \u notation */
                    int n = rb_enc_precise_mbclen(p-1, pend, enc);
                    if (MBCLEN_CHARFOUND_P(n)) {
                        unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                        if (cc <= 0xFFFF)
                            clen = 6;  /* \uXXXX */
                        else if (cc <= 0xFFFFF)
                            clen = 9;  /* \u{XXXXX} */
                        else
                            clen = 10; /* \u{XXXXXX} */
                        /* skip the remaining bytes of this character */
                        p += MBCLEN_CHARFOUND_LEN(n)-1;
                        break;
                    }
                }
                clen = 4;       /* \xNN */
            }
            break;
        }

        /* guard the running total against overflowing a long */
        if (clen > LONG_MAX - len) {
            rb_raise(rb_eRuntimeError, "string size too big");
        }
        len += clen;
    }

    /* pass 2: write the dumped form */
    result = rb_str_new(0, len);
    p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
    q = RSTRING_PTR(result); qend = q + len + 1;

    *q++ = '"';
    while (p < pend) {
        unsigned char c = *p++;

        if (c == '"' || c == '\\') {
            *q++ = '\\';
            *q++ = c;
        }
        else if (c == '#') {
            if (IS_EVSTR(p, pend)) *q++ = '\\';
            *q++ = '#';
        }
        else if (c == '\n') {
            *q++ = '\\';
            *q++ = 'n';
        }
        else if (c == '\r') {
            *q++ = '\\';
            *q++ = 'r';
        }
        else if (c == '\t') {
            *q++ = '\\';
            *q++ = 't';
        }
        else if (c == '\f') {
            *q++ = '\\';
            *q++ = 'f';
        }
        else if (c == '\013') {
            *q++ = '\\';
            *q++ = 'v';
        }
        else if (c == '\010') {
            *q++ = '\\';
            *q++ = 'b';
        }
        else if (c == '\007') {
            *q++ = '\\';
            *q++ = 'a';
        }
        else if (c == '\033') {
            *q++ = '\\';
            *q++ = 'e';
        }
        else if (ISPRINT(c)) {
            *q++ = c;
        }
        else {
            *q++ = '\\';
            if (u8) {
                /* n is the number of bytes following the lead byte; it is
                 * positive only for a valid multibyte character, so
                 * single-byte and invalid input fall through to \xNN */
                int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
                if (MBCLEN_CHARFOUND_P(n)) {
                    int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
                    p += n;
                    if (cc <= 0xFFFF)
                        snprintf(q, qend-q, "u%04X", cc);    /* \uXXXX */
                    else
                        snprintf(q, qend-q, "u{%X}", cc);  /* \u{XXXXX} or \u{XXXXXX} */
                    q += strlen(q);
                    continue;
                }
            }
            snprintf(q, qend-q, "x%02X", c);
            q += 3;
        }
    }
    *q++ = '"';
    *q = '\0';
    if (!rb_enc_asciicompat(enc)) {
        /* append .dup.force_encoding("NAME") for ASCII-incompatible encodings */
        snprintf(q, qend-q, nonascii_suffix, enc->name);
        encidx = rb_ascii8bit_encindex();
    }
    /* result from dump is ASCII */
    rb_enc_associate_index(result, encidx);
    ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
    return result;
}
7081
/*
 * Translates the letter of a one-character backslash escape into the
 * control character it denotes:
 *   n, r, t, f, v, b, a, e  ->  LF, CR, TAB, FF, VT, BS, BEL, ESC
 * Any other value is not expected here and is treated as unreachable.
 */
static int
unescape_ascii(unsigned int c)
{
    /* letters[i] is decoded to controls[i] */
    static const char letters[]  = {'n', 'r', 't', 'f', 'v', 'b', 'a', 'e'};
    static const char controls[] = {'\n', '\r', '\t', '\f', '\013', '\010', '\007', '\033'};
    size_t i;

    for (i = 0; i < sizeof(letters); i++) {
        if (c == (unsigned int)(unsigned char)letters[i]) {
            return controls[i];
        }
    }
    UNREACHABLE_RETURN(-1);
}
7105
7106 static void
7107 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7108 {
7109     const char *s = *ss;
7110     unsigned int c;
7111     int codelen;
7112     size_t hexlen;
7113     unsigned char buf[6];
7114     static rb_encoding *enc_utf8 = NULL;
7115
7116     switch (*s) {
7117       case '\\':
7118       case '"':
7119       case '#':
7120         rb_str_cat(undumped, s, 1); /* cat itself */
7121         s++;
7122         break;
7123       case 'n':
7124       case 'r':
7125       case 't':
7126       case 'f':
7127       case 'v':
7128       case 'b':
7129       case 'a':
7130       case 'e':
7131         *buf = unescape_ascii(*s);
7132         rb_str_cat(undumped, (char *)buf, 1);
7133         s++;
7134         break;
7135       case 'u':
7136         if (*binary) {
7137             rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7138         }
7139         *utf8 = true;
7140         if (++s >= s_end) {
7141             rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7142         }
7143         if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7144         if (*penc != enc_utf8) {
7145             *penc = enc_utf8;
7146             rb_enc_associate(undumped, enc_utf8);
7147         }
7148         if (*s == '{') { /* handle \u{...} form */
7149             s++;
7150             for (;;) {
7151                 if (s >= s_end) {
7152                     rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7153                 }
7154                 if (*s == '}') {
7155                     s++;
7156                     break;
7157                 }
7158                 if (ISSPACE(*s)) {
7159                     s++;
7160                     continue;
7161                 }
7162                 c = scan_hex(s, s_end-s, &hexlen);
7163                 if (hexlen == 0 || hexlen > 6) {
7164                     rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7165                 }
7166                 if (c > 0x10ffff) {
7167                     rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7168                 }
7169                 if (0xd800 <= c && c <= 0xdfff) {
7170                     rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7171                 }
7172                 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7173                 rb_str_cat(undumped, (char *)buf, codelen);
7174                 s += hexlen;
7175             }
7176         }
7177         else { /* handle \uXXXX form */
7178             c = scan_hex(s, 4, &hexlen);
7179             if (hexlen != 4) {
7180                 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7181             }
7182             if (0xd800 <= c && c <= 0xdfff) {
7183                 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7184             }
7185             codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7186             rb_str_cat(undumped, (char *)buf, codelen);
7187             s += hexlen;
7188         }
7189         break;
7190       case 'x':
7191         if (*utf8) {
7192             rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7193         }
7194         *binary = true;
7195         if (++s >= s_end) {
7196             rb_raise(rb_eRuntimeError, "invalid hex escape");
7197         }
7198         *buf = scan_hex(s, 2, &hexlen);
7199         if (hexlen != 2) {
7200             rb_raise(rb_eRuntimeError, "invalid hex escape");
7201         }
7202         rb_str_cat(undumped, (char *)buf, 1);
7203         s += hexlen;
7204         break;
7205       default:
7206         rb_str_cat(undumped, s-1, 2);
7207         s++;
7208     }
7209
7210     *ss = s;
7211 }
7212
7213 static VALUE rb_str_is_ascii_only_p(VALUE str);
7214
7215 /*
7216  *  call-seq:
7217  *    undump -> string
7218  *
7219  *  Returns an unescaped version of +self+:
7220  *
7221  *    s_orig = "\f\x00\xff\\\""    # => "\f\u0000\xFF\\\""
7222  *    s_dumped = s_orig.dump       # => "\"\\f\\x00\\xFF\\\\\\\"\""
7223  *    s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7224  *    s_undumped == s_orig         # => true
7225  *
7226  *  Related: String#dump (inverse of String#undump).
7227  *
7228  */
7229
7230 static VALUE
7231 str_undump(VALUE str)
7232 {
7233     const char *s = RSTRING_PTR(str);
7234     const char *s_end = RSTRING_END(str);
7235     rb_encoding *enc = rb_enc_get(str);
7236     VALUE undumped = rb_enc_str_new(s, 0L, enc);
7237     bool utf8 = false;
7238     bool binary = false;
7239     int w;
7240
7241     rb_must_asciicompat(str);
7242     if (rb_str_is_ascii_only_p(str) == Qfalse) {
7243         rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7244     }
7245     if (!str_null_check(str, &w)) {
7246         rb_raise(rb_eRuntimeError, "string contains null byte");
7247     }
7248     if (RSTRING_LEN(str) < 2) goto invalid_format;
7249     if (*s != '"') goto invalid_format;
7250
7251     /* strip '"' at the start */
7252     s++;
7253
7254     for (;;) {
7255         if (s >= s_end) {
7256             rb_raise(rb_eRuntimeError, "unterminated dumped string");
7257         }
7258
7259         if (*s == '"') {
7260             /* epilogue */
7261             s++;
7262             if (s == s_end) {
7263                 /* ascii compatible dumped string */
7264                 break;
7265             }
7266             else {
7267                 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7268                 static const char dup_suffix[] = ".dup";
7269                 const char *encname;
7270                 int encidx;
7271                 ptrdiff_t size;
7272
7273                 /* check separately for strings dumped by older versions */
7274                 size = sizeof(dup_suffix) - 1;
7275                 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7276
7277                 size = sizeof(force_encoding_suffix) - 1;
7278                 if (s_end - s <= size) goto invalid_format;
7279                 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7280                 s += size;
7281
7282                 if (utf8) {
7283                     rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7284                 }
7285
7286                 encname = s;
7287                 s = memchr(s, '"', s_end-s);
7288                 size = s - encname;
7289                 if (!s) goto invalid_format;
7290                 if (s_end - s != 2) goto invalid_format;
7291                 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7292
7293                 encidx = rb_enc_find_index2(encname, (long)size);
7294                 if (encidx < 0) {
7295                     rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7296                 }
7297                 rb_enc_associate_index(undumped, encidx);
7298             }
7299             break;
7300         }
7301
7302         if (*s == '\\') {
7303             s++;
7304             if (s >= s_end) {
7305                 rb_raise(rb_eRuntimeError, "invalid escape");
7306             }
7307             undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7308         }
7309         else {
7310             rb_str_cat(undumped, s++, 1);
7311         }
7312     }
7313
7314     RB_GC_GUARD(str);
7315
7316     return undumped;
7317 invalid_format:
7318     rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7319 }
7320
7321 static void
7322 rb_str_check_dummy_enc(rb_encoding *enc)
7323 {
7324     if (rb_enc_dummy_p(enc)) {
7325         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7326                  rb_enc_name(enc));
7327     }
7328 }
7329
7330 static rb_encoding *
7331 str_true_enc(VALUE str)
7332 {
7333     rb_encoding *enc = STR_ENC_GET(str);
7334     rb_str_check_dummy_enc(enc);
7335     return enc;
7336 }
7337
7338 static OnigCaseFoldType
7339 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7340 {
7341     if (argc==0)
7342         return flags;
7343     if (argc>2)
7344         rb_raise(rb_eArgError, "too many options");
7345     if (argv[0]==sym_turkic) {
7346         flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7347         if (argc==2) {
7348             if (argv[1]==sym_lithuanian)
7349                 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7350             else
7351                 rb_raise(rb_eArgError, "invalid second option");
7352         }
7353     }
7354     else if (argv[0]==sym_lithuanian) {
7355         flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7356         if (argc==2) {
7357             if (argv[1]==sym_turkic)
7358                 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7359             else
7360                 rb_raise(rb_eArgError, "invalid second option");
7361         }
7362     }
7363     else if (argc>1)
7364         rb_raise(rb_eArgError, "too many options");
7365     else if (argv[0]==sym_ascii)
7366         flags |= ONIGENC_CASE_ASCII_ONLY;
7367     else if (argv[0]==sym_fold) {
7368         if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7369             flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7370         else
7371             rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7372     }
7373     else
7374         rb_raise(rb_eArgError, "invalid option");
7375     return flags;
7376 }
7377
7378 static inline bool
7379 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7380 {
7381     if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7382         return true;
7383     return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7384 }
7385
/* 16 should be long enough to absorb any kind of single character length increase; the value actually used below is 20 */
7387 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7388 #ifndef CASEMAP_DEBUG
7389 # define CASEMAP_DEBUG 0
7390 #endif
7391
struct mapping_buffer;
/*
 * One node in a singly linked chain of output buffers filled by
 * rb_str_casemap.  The node header and its data area are allocated
 * together as a single block.
 */
typedef struct mapping_buffer {
    size_t capa; /* number of bytes available in space[] */
    size_t used; /* number of bytes of space[] written by the case mapper */
    struct mapping_buffer *next; /* following node in the chain, or NULL for the last one */
    OnigUChar space[FLEX_ARY_LEN]; /* output bytes (trailing variable-length area) */
} mapping_buffer;
7399
7400 static void
7401 mapping_buffer_free(void *p)
7402 {
7403     mapping_buffer *previous_buffer;
7404     mapping_buffer *current_buffer = p;
7405     while (current_buffer) {
7406         previous_buffer = current_buffer;
7407         current_buffer  = current_buffer->next;
7408         ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7409     }
7410 }
7411
/*
 * Typed-data descriptor for the object that anchors a mapping_buffer
 * chain in rb_str_casemap; its free callback releases the whole chain.
 */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",
    {0, mapping_buffer_free,}, /* first callback slot left empty; mapping_buffer_free frees the chain */
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
};
7417
7418 static VALUE
7419 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7420 {
7421     VALUE target;
7422
7423     const OnigUChar *source_current, *source_end;
7424     int target_length = 0;
7425     VALUE buffer_anchor;
7426     mapping_buffer *current_buffer = 0;
7427     mapping_buffer **pre_buffer;
7428     size_t buffer_count = 0;
7429     int buffer_length_or_invalid;
7430
7431     if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7432
7433     source_current = (OnigUChar*)RSTRING_PTR(source);
7434     source_end = (OnigUChar*)RSTRING_END(source);
7435
7436     buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7437     pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7438     while (source_current < source_end) {
7439         /* increase multiplier using buffer count to converge quickly */
7440         size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7441         if (CASEMAP_DEBUG) {
7442             fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7443         }
7444         current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7445         *pre_buffer = current_buffer;
7446         pre_buffer = &current_buffer->next;
7447         current_buffer->next = NULL;
7448         current_buffer->capa = capa;
7449         buffer_length_or_invalid = enc->case_map(flags,
7450                                    &source_current, source_end,
7451                                    current_buffer->space,
7452                                    current_buffer->space+current_buffer->capa,
7453                                    enc);
7454         if (buffer_length_or_invalid < 0) {
7455             current_buffer = DATA_PTR(buffer_anchor);
7456             DATA_PTR(buffer_anchor) = 0;
7457             mapping_buffer_free(current_buffer);
7458             rb_raise(rb_eArgError, "input string invalid");
7459         }
7460         target_length  += current_buffer->used = buffer_length_or_invalid;
7461     }
7462     if (CASEMAP_DEBUG) {
7463         fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7464     }
7465
7466     if (buffer_count==1) {
7467         target = rb_str_new((const char*)current_buffer->space, target_length);
7468     }
7469     else {
7470         char *target_current;
7471
7472         target = rb_str_new(0, target_length);
7473         target_current = RSTRING_PTR(target);
7474         current_buffer = DATA_PTR(buffer_anchor);
7475         while (current_buffer) {
7476             memcpy(target_current, current_buffer->space, current_buffer->used);
7477             target_current += current_buffer->used;
7478             current_buffer  = current_buffer->next;
7479         }
7480     }
7481     current_buffer = DATA_PTR(buffer_anchor);
7482     DATA_PTR(buffer_anchor) = 0;
7483     mapping_buffer_free(current_buffer);
7484
7485     RB_GC_GUARD(buffer_anchor);
7486
7487     /* TODO: check about string terminator character */
7488     str_enc_copy_direct(target, source);
7489     /*ENC_CODERANGE_SET(mapped, cr);*/
7490
7491     return target;
7492 }
7493
7494 static VALUE
7495 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7496 {
7497     const OnigUChar *source_current, *source_end;
7498     OnigUChar *target_current, *target_end;
7499     long old_length = RSTRING_LEN(source);
7500     int length_or_invalid;
7501
7502     if (old_length == 0) return Qnil;
7503
7504     source_current = (OnigUChar*)RSTRING_PTR(source);
7505     source_end = (OnigUChar*)RSTRING_END(source);
7506     if (source == target) {
7507         target_current = (OnigUChar*)source_current;
7508         target_end = (OnigUChar*)source_end;
7509     }
7510     else {
7511         target_current = (OnigUChar*)RSTRING_PTR(target);
7512         target_end = (OnigUChar*)RSTRING_END(target);
7513     }
7514
7515     length_or_invalid = onigenc_ascii_only_case_map(flags,
7516                                &source_current, source_end,
7517                                target_current, target_end, enc);
7518     if (length_or_invalid < 0)
7519         rb_raise(rb_eArgError, "input string invalid");
7520     if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7521         fprintf(stderr, "problem with rb_str_ascii_casemap"
7522                 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7523         rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7524                  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7525     }
7526
7527     str_enc_copy(target, source);
7528
7529     return target;
7530 }
7531
7532 static bool
7533 upcase_single(VALUE str)
7534 {
7535     char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7536     bool modified = false;
7537
7538     while (s < send) {
7539         unsigned int c = *(unsigned char*)s;
7540
7541         if ('a' <= c && c <= 'z') {
7542             *s = 'A' + (c - 'a');
7543             modified = true;
7544         }
7545         s++;
7546     }
7547     return modified;
7548 }
7549
7550 /*
7551  *  call-seq:
7552  *    upcase!(*options) -> self or nil
7553  *
7554  *  Upcases the characters in +self+;
7555  *  returns +self+ if any changes were made, +nil+ otherwise:
7556  *
7557  *    s = 'Hello World!' # => "Hello World!"
7558  *    s.upcase!          # => "HELLO WORLD!"
7559  *    s                  # => "HELLO WORLD!"
7560  *    s.upcase!          # => nil
7561  *
7562  *  The casing may be affected by the given +options+;
7563  *  see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7564  *
7565  *  Related: String#upcase, String#downcase, String#downcase!.
7566  *
7567  */
7568
7569 static VALUE
7570 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7571 {
7572     rb_encoding *enc;
7573     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7574
7575     flags = check_case_options(argc, argv, flags);
7576     str_modify_keep_cr(str);
7577     enc = str_true_enc(str);
7578     if (case_option_single_p(flags, enc, str)) {
7579         if (upcase_single(str))
7580             flags |= ONIGENC_CASE_MODIFIED;
7581     }
7582     else if (flags&ONIGENC_CASE_ASCII_ONLY)
7583         rb_str_ascii_casemap(str, str, &flags, enc);
7584     else
7585         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7586
7587     if (ONIGENC_CASE_MODIFIED&flags) return str;
7588     return Qnil;
7589 }
7590
7591
7592 /*
7593  *  call-seq:
7594  *    upcase(*options) -> string
7595  *
7596  *  Returns a string containing the upcased characters in +self+:
7597  *
7598  *     s = 'Hello World!' # => "Hello World!"
7599  *     s.upcase           # => "HELLO WORLD!"
7600  *
7601  *  The casing may be affected by the given +options+;
7602  *  see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7603  *
7604  *  Related: String#upcase!, String#downcase, String#downcase!.
7605  *
7606  */
7607
7608 static VALUE
7609 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7610 {
7611     rb_encoding *enc;
7612     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7613     VALUE ret;
7614
7615     flags = check_case_options(argc, argv, flags);
7616     enc = str_true_enc(str);
7617     if (case_option_single_p(flags, enc, str)) {
7618         ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7619         str_enc_copy_direct(ret, str);
7620         upcase_single(ret);
7621     }
7622     else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7623         ret = rb_str_new(0, RSTRING_LEN(str));
7624         rb_str_ascii_casemap(str, ret, &flags, enc);
7625     }
7626     else {
7627         ret = rb_str_casemap(str, &flags, enc);
7628     }
7629
7630     return ret;
7631 }
7632
7633 static bool
7634 downcase_single(VALUE str)
7635 {
7636     char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7637     bool modified = false;
7638
7639     while (s < send) {
7640         unsigned int c = *(unsigned char*)s;
7641
7642         if ('A' <= c && c <= 'Z') {
7643             *s = 'a' + (c - 'A');
7644             modified = true;
7645         }
7646         s++;
7647     }
7648
7649     return modified;
7650 }
7651
7652 /*
7653  *  call-seq:
7654  *    downcase!(*options) -> self or nil
7655  *
7656  *  Downcases the characters in +self+;
7657  *  returns +self+ if any changes were made, +nil+ otherwise:
7658  *
7659  *    s = 'Hello World!' # => "Hello World!"
7660  *    s.downcase!        # => "hello world!"
7661  *    s                  # => "hello world!"
7662  *    s.downcase!        # => nil
7663  *
7664  *  The casing may be affected by the given +options+;
7665  *  see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7666  *
7667  *  Related: String#downcase, String#upcase, String#upcase!.
7668  *
7669  */
7670
7671 static VALUE
7672 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7673 {
7674     rb_encoding *enc;
7675     OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7676
7677     flags = check_case_options(argc, argv, flags);
7678     str_modify_keep_cr(str);
7679     enc = str_true_enc(str);
7680     if (case_option_single_p(flags, enc, str)) {
7681         if (downcase_single(str))
7682             flags |= ONIGENC_CASE_MODIFIED;
7683     }
7684     else if (flags&ONIGENC_CASE_ASCII_ONLY)
7685         rb_str_ascii_casemap(str, str, &flags, enc);
7686     else
7687         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7688
7689     if (ONIGENC_CASE_MODIFIED&flags) return str;
7690     return Qnil;
7691 }
7692
7693
7694 /*
7695  *  call-seq:
7696  *    downcase(*options) -> string
7697  *
7698  *  Returns a string containing the downcased characters in +self+:
7699  *
7700  *     s = 'Hello World!' # => "Hello World!"
7701  *     s.downcase         # => "hello world!"
7702  *
7703  *  The casing may be affected by the given +options+;
7704  *  see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7705  *
7706  *  Related: String#downcase!, String#upcase, String#upcase!.
7707  *
7708  */
7709
7710 static VALUE
7711 rb_str_downcase(int argc, VALUE *argv, VALUE str)
7712 {
7713     rb_encoding *enc;
7714     OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7715     VALUE ret;
7716
7717     flags = check_case_options(argc, argv, flags);
7718     enc = str_true_enc(str);
7719     if (case_option_single_p(flags, enc, str)) {
7720         ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7721         str_enc_copy_direct(ret, str);
7722         downcase_single(ret);
7723     }
7724     else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7725         ret = rb_str_new(0, RSTRING_LEN(str));
7726         rb_str_ascii_casemap(str, ret, &flags, enc);
7727     }
7728     else {
7729         ret = rb_str_casemap(str, &flags, enc);
7730     }
7731
7732     return ret;
7733 }
7734
7735
7736 /*
7737  *  call-seq:
7738  *    capitalize!(*options) -> self or nil
7739  *
7740  *  Upcases the first character in +self+;
7741  *  downcases the remaining characters;
7742  *  returns +self+ if any changes were made, +nil+ otherwise:
7743  *
7744  *    s = 'hello World!' # => "hello World!"
7745  *    s.capitalize!      # => "Hello world!"
7746  *    s                  # => "Hello world!"
7747  *    s.capitalize!      # => nil
7748  *
7749  *  The casing may be affected by the given +options+;
7750  *  see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7751  *
7752  *  Related: String#capitalize.
7753  *
7754  */
7755
7756 static VALUE
7757 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7758 {
7759     rb_encoding *enc;
7760     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7761
7762     flags = check_case_options(argc, argv, flags);
7763     str_modify_keep_cr(str);
7764     enc = str_true_enc(str);
7765     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7766     if (flags&ONIGENC_CASE_ASCII_ONLY)
7767         rb_str_ascii_casemap(str, str, &flags, enc);
7768     else
7769         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7770
7771     if (ONIGENC_CASE_MODIFIED&flags) return str;
7772     return Qnil;
7773 }
7774
7775
7776 /*
7777  *  call-seq:
7778  *    capitalize(*options) -> string
7779  *
7780  *  Returns a string containing the characters in +self+;
7781  *  the first character is upcased;
7782  *  the remaining characters are downcased:
7783  *
7784  *     s = 'hello World!' # => "hello World!"
7785  *     s.capitalize       # => "Hello world!"
7786  *
7787  *  The casing may be affected by the given +options+;
7788  *  see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7789  *
7790  *  Related: String#capitalize!.
7791  *
7792  */
7793
7794 static VALUE
7795 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7796 {
7797     rb_encoding *enc;
7798     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7799     VALUE ret;
7800
7801     flags = check_case_options(argc, argv, flags);
7802     enc = str_true_enc(str);
7803     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7804     if (flags&ONIGENC_CASE_ASCII_ONLY) {
7805         ret = rb_str_new(0, RSTRING_LEN(str));
7806         rb_str_ascii_casemap(str, ret, &flags, enc);
7807     }
7808     else {
7809         ret = rb_str_casemap(str, &flags, enc);
7810     }
7811     return ret;
7812 }
7813
7814
7815 /*
7816  *  call-seq:
7817  *    swapcase!(*options) -> self or nil
7818  *
7819  *  Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
7821  *  returns +self+ if any changes were made, +nil+ otherwise:
7822  *
7823  *    s = 'Hello World!' # => "Hello World!"
7824  *    s.swapcase!        # => "hELLO wORLD!"
7825  *    s                  # => "hELLO wORLD!"
7826  *    ''.swapcase!       # => nil
7827  *
7828  *  The casing may be affected by the given +options+;
7829  *  see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7830  *
7831  *  Related: String#swapcase.
7832  *
7833  */
7834
7835 static VALUE
7836 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7837 {
7838     rb_encoding *enc;
7839     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7840
7841     flags = check_case_options(argc, argv, flags);
7842     str_modify_keep_cr(str);
7843     enc = str_true_enc(str);
7844     if (flags&ONIGENC_CASE_ASCII_ONLY)
7845         rb_str_ascii_casemap(str, str, &flags, enc);
7846     else
7847         str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7848
7849     if (ONIGENC_CASE_MODIFIED&flags) return str;
7850     return Qnil;
7851 }
7852
7853
7854 /*
7855  *  call-seq:
7856  *    swapcase(*options) -> string
7857  *
7858  *  Returns a string containing the characters in +self+, with cases reversed;
7859  *  each uppercase character is downcased;
7860  *  each lowercase character is upcased:
7861  *
7862  *     s = 'Hello World!' # => "Hello World!"
7863  *     s.swapcase         # => "hELLO wORLD!"
7864  *
7865  *  The casing may be affected by the given +options+;
7866  *  see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7867  *
7868  *  Related: String#swapcase!.
7869  *
7870  */
7871
7872 static VALUE
7873 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7874 {
7875     rb_encoding *enc;
7876     OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7877     VALUE ret;
7878
7879     flags = check_case_options(argc, argv, flags);
7880     enc = str_true_enc(str);
7881     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7882     if (flags&ONIGENC_CASE_ASCII_ONLY) {
7883         ret = rb_str_new(0, RSTRING_LEN(str));
7884         rb_str_ascii_casemap(str, ret, &flags, enc);
7885     }
7886     else {
7887         ret = rb_str_casemap(str, &flags, enc);
7888     }
7889     return ret;
7890 }
7891
7892 typedef unsigned char *USTR;
7893
/* Iteration state used by trnext() while walking a tr-style pattern. */
struct tr {
    int gen; /* nonzero while a range such as "a-z" is being expanded */
    unsigned int now, max; /* now: code point most recently produced; max: upper end of the range being expanded */
    char *p, *pend; /* p: read position in the pattern text; pend: its end */
};
7899
7900 static unsigned int
7901 trnext(struct tr *t, rb_encoding *enc)
7902 {
7903     int n;
7904
7905     for (;;) {
7906       nextpart:
7907         if (!t->gen) {
7908             if (t->p == t->pend) return -1;
7909             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7910                 t->p += n;
7911             }
7912             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7913             t->p += n;
7914             if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7915                 t->p += n;
7916                 if (t->p < t->pend) {
7917                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7918                     t->p += n;
7919                     if (t->now > c) {
7920                         if (t->now < 0x80 && c < 0x80) {
7921                             rb_raise(rb_eArgError,
7922                                      "invalid range \"%c-%c\" in string transliteration",
7923                                      t->now, c);
7924                         }
7925                         else {
7926                             rb_raise(rb_eArgError, "invalid range in string transliteration");
7927                         }
7928                         continue; /* not reached */
7929                     }
7930                     else if (t->now < c) {
7931                         t->gen = 1;
7932                         t->max = c;
7933                     }
7934                 }
7935             }
7936             return t->now;
7937         }
7938         else {
7939             while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7940                 if (t->now == t->max) {
7941                     t->gen = 0;
7942                     goto nextpart;
7943                 }
7944             }
7945             if (t->now < t->max) {
7946                 return t->now;
7947             }
7948             else {
7949                 t->gen = 0;
7950                 return t->max;
7951             }
7952         }
7953     }
7954 }
7955
7956 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7957
7958 static VALUE
7959 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7960 {
7961     const unsigned int errc = -1;
7962     unsigned int trans[256];
7963     rb_encoding *enc, *e1, *e2;
7964     struct tr trsrc, trrepl;
7965     int cflag = 0;
7966     unsigned int c, c0, last = 0;
7967     int modify = 0, i, l;
7968     unsigned char *s, *send;
7969     VALUE hash = 0;
7970     int singlebyte = single_byte_optimizable(str);
7971     int termlen;
7972     int cr;
7973
7974 #define CHECK_IF_ASCII(c) \
7975     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7976            (cr = ENC_CODERANGE_VALID) : 0)
7977
7978     StringValue(src);
7979     StringValue(repl);
7980     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7981     if (RSTRING_LEN(repl) == 0) {
7982         return rb_str_delete_bang(1, &src, str);
7983     }
7984
7985     cr = ENC_CODERANGE(str);
7986     e1 = rb_enc_check(str, src);
7987     e2 = rb_enc_check(str, repl);
7988     if (e1 == e2) {
7989         enc = e1;
7990     }
7991     else {
7992         enc = rb_enc_check(src, repl);
7993     }
7994     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7995     if (RSTRING_LEN(src) > 1 &&
7996         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7997         trsrc.p + l < trsrc.pend) {
7998         cflag = 1;
7999         trsrc.p += l;
8000     }
8001     trrepl.p = RSTRING_PTR(repl);
8002     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8003     trsrc.gen = trrepl.gen = 0;
8004     trsrc.now = trrepl.now = 0;
8005     trsrc.max = trrepl.max = 0;
8006
8007     if (cflag) {
8008         for (i=0; i<256; i++) {
8009             trans[i] = 1;
8010         }
8011         while ((c = trnext(&trsrc, enc)) != errc) {
8012             if (c < 256) {
8013                 trans[c] = errc;
8014             }
8015             else {
8016                 if (!hash) hash = rb_hash_new();
8017                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8018             }
8019         }
8020         while ((c = trnext(&trrepl, enc)) != errc)
8021             /* retrieve last replacer */;
8022         last = trrepl.now;
8023         for (i=0; i<256; i++) {
8024             if (trans[i] != errc) {
8025                 trans[i] = last;
8026             }
8027         }
8028     }
8029     else {
8030         unsigned int r;
8031
8032         for (i=0; i<256; i++) {
8033             trans[i] = errc;
8034         }
8035         while ((c = trnext(&trsrc, enc)) != errc) {
8036             r = trnext(&trrepl, enc);
8037             if (r == errc) r = trrepl.now;
8038             if (c < 256) {
8039                 trans[c] = r;
8040                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8041             }
8042             else {
8043                 if (!hash) hash = rb_hash_new();
8044                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8045             }
8046         }
8047     }
8048
8049     if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8050         cr = ENC_CODERANGE_7BIT;
8051     str_modify_keep_cr(str);
8052     s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8053     termlen = rb_enc_mbminlen(enc);
8054     if (sflag) {
8055         int clen, tlen;
8056         long offset, max = RSTRING_LEN(str);
8057         unsigned int save = -1;
8058         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8059
8060         while (s < send) {
8061             int may_modify = 0;
8062
8063             int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8064             if (!MBCLEN_CHARFOUND_P(r)) {
8065                 xfree(buf);
8066                 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8067             }
8068             clen = MBCLEN_CHARFOUND_LEN(r);
8069             c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8070
8071             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8072
8073             s += clen;
8074             if (c < 256) {
8075                 c = trans[c];
8076             }
8077             else if (hash) {
8078                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8079                 if (NIL_P(tmp)) {
8080                     if (cflag) c = last;
8081                     else c = errc;
8082                 }
8083                 else if (cflag) c = errc;
8084                 else c = NUM2INT(tmp);
8085             }
8086             else {
8087                 c = errc;
8088             }
8089             if (c != (unsigned int)-1) {
8090                 if (save == c) {
8091                     CHECK_IF_ASCII(c);
8092                     continue;
8093                 }
8094                 save = c;
8095                 tlen = rb_enc_codelen(c, enc);
8096                 modify = 1;
8097             }
8098             else {
8099                 save = -1;
8100                 c = c0;
8101                 if (enc != e1) may_modify = 1;
8102             }
8103             if ((offset = t - buf) + tlen > max) {
8104                 size_t MAYBE_UNUSED(old) = max + termlen;
8105                 max = offset + tlen + (send - s);
8106                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8107                 t = buf + offset;
8108             }
8109             rb_enc_mbcput(c, t, enc);
8110             if (may_modify && memcmp(s, t, tlen) != 0) {
8111                 modify = 1;
8112             }
8113             CHECK_IF_ASCII(c);
8114             t += tlen;
8115         }
8116         if (!STR_EMBED_P(str)) {
8117             ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8118         }
8119         TERM_FILL((char *)t, termlen);
8120         RSTRING(str)->as.heap.ptr = (char *)buf;
8121         STR_SET_LEN(str, t - buf);
8122         STR_SET_NOEMBED(str);
8123         RSTRING(str)->as.heap.aux.capa = max;
8124     }
8125     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8126         while (s < send) {
8127             c = (unsigned char)*s;
8128             if (trans[c] != errc) {
8129                 if (!cflag) {
8130                     c = trans[c];
8131                     *s = c;
8132                     modify = 1;
8133                 }
8134                 else {
8135                     *s = last;
8136                     modify = 1;
8137                 }
8138             }
8139             CHECK_IF_ASCII(c);
8140             s++;
8141         }
8142     }
8143     else {
8144         int clen, tlen;
8145         long offset, max = (long)((send - s) * 1.2);
8146         unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8147
8148         while (s < send) {
8149             int may_modify = 0;
8150
8151             int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8152             if (!MBCLEN_CHARFOUND_P(r)) {
8153                 xfree(buf);
8154                 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8155             }
8156             clen = MBCLEN_CHARFOUND_LEN(r);
8157             c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8158
8159             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8160
8161             if (c < 256) {
8162                 c = trans[c];
8163             }
8164             else if (hash) {
8165                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8166                 if (NIL_P(tmp)) {
8167                     if (cflag) c = last;
8168                     else c = errc;
8169                 }
8170                 else if (cflag) c = errc;
8171                 else c = NUM2INT(tmp);
8172             }
8173             else {
8174                 c = cflag ? last : errc;
8175             }
8176             if (c != errc) {
8177                 tlen = rb_enc_codelen(c, enc);
8178                 modify = 1;
8179             }
8180             else {
8181                 c = c0;
8182                 if (enc != e1) may_modify = 1;
8183             }
8184             if ((offset = t - buf) + tlen > max) {
8185                 size_t MAYBE_UNUSED(old) = max + termlen;
8186                 max = offset + tlen + (long)((send - s) * 1.2);
8187                 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8188                 t = buf + offset;
8189             }
8190             if (s != t) {
8191                 rb_enc_mbcput(c, t, enc);
8192                 if (may_modify && memcmp(s, t, tlen) != 0) {
8193                     modify = 1;
8194                 }
8195             }
8196             CHECK_IF_ASCII(c);
8197             s += clen;
8198             t += tlen;
8199         }
8200         if (!STR_EMBED_P(str)) {
8201             ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8202         }
8203         TERM_FILL((char *)t, termlen);
8204         RSTRING(str)->as.heap.ptr = (char *)buf;
8205         STR_SET_LEN(str, t - buf);
8206         STR_SET_NOEMBED(str);
8207         RSTRING(str)->as.heap.aux.capa = max;
8208     }
8209
8210     if (modify) {
8211         if (cr != ENC_CODERANGE_BROKEN)
8212             ENC_CODERANGE_SET(str, cr);
8213         rb_enc_associate(str, enc);
8214         return str;
8215     }
8216     return Qnil;
8217 }
8218
8219
8220 /*
8221  *  call-seq:
8222  *    tr!(selector, replacements) -> self or nil
8223  *
8224  *  Like String#tr, but modifies +self+ in place.
8225  *  Returns +self+ if any changes were made, +nil+ otherwise.
8226  *
8227  */
8228
8229 static VALUE
8230 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8231 {
8232     return tr_trans(str, src, repl, 0);
8233 }
8234
8235
8236 /*
8237  *  call-seq:
8238  *    tr(selector, replacements) -> new_string
8239  *
8240  *  Returns a copy of +self+ with each character specified by string +selector+
8241  *  translated to the corresponding character in string +replacements+.
8242  *  The correspondence is _positional_:
8243  *
8244  *  - Each occurrence of the first character specified by +selector+
8245  *    is translated to the first character in +replacements+.
8246  *  - Each occurrence of the second character specified by +selector+
8247  *    is translated to the second character in +replacements+.
8248  *  - And so on.
8249  *
8250  *  Example:
8251  *
8252  *    'hello'.tr('el', 'ip') #=> "hippo"
8253  *
8254  *  If +replacements+ is shorter than +selector+,
8255  *  it is implicitly padded with its own last character:
8256  *
8257  *    'hello'.tr('aeiou', '-')   # => "h-ll-"
8258  *    'hello'.tr('aeiou', 'AA-') # => "hAll-"
8259  *
8260  *  Arguments +selector+ and +replacements+ must be valid character selectors
8261  *  (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8262  *  and may use any of its valid forms, including negation, ranges, and escaping:
8263  *
8264  *    # Negation.
8265  *    'hello'.tr('^aeiou', '-') # => "-e--o"
8266  *    # Ranges.
8267  *    'ibm'.tr('b-z', 'a-z') # => "hal"
8268  *    # Escapes.
8269  *    'hel^lo'.tr('\^aeiou', '-')     # => "h-l-l-"    # Escaped leading caret.
8270  *    'i-b-m'.tr('b\-z', 'a-z')       # => "ibabm"     # Escaped embedded hyphen.
8271  *    'foo\\bar'.tr('ab\\', 'XYZ')    # => "fooZYXr"   # Escaped backslash.
8272  *
8273  */
8274
8275 static VALUE
8276 rb_str_tr(VALUE str, VALUE src, VALUE repl)
8277 {
8278     str = str_duplicate(rb_cString, str);
8279     tr_trans(str, src, repl, 0);
8280     return str;
8281 }
8282
/* Number of codepoints tracked directly by the flag table: one slot for each
 * value an unsigned char can take. */
#define TR_TABLE_MAX (UCHAR_MAX+1)
/* Full table length: the TR_TABLE_MAX per-codepoint flags plus one trailing
 * slot, stable[TR_TABLE_MAX], which tr_find returns as the fallback answer
 * for codepoints >= TR_TABLE_MAX. */
#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds one selector string into the selection state shared by delete!,
 * squeeze! and count.
 *
 *   str     - the selector; a leading '^' (when the selector is longer than
 *             one byte) marks it as negated.
 *   stable  - flag table for codepoints < TR_TABLE_MAX plus the fallback
 *             slot; on return each flag is the AND of its previous value and
 *             this selector's flag, i.e. selectors are intersected.
 *   first   - nonzero for the first selector of a call; resets stable.
 *   tablep  - in/out hash of codepoints >= TR_TABLE_MAX named by non-negated
 *             selectors (callers pass &del).
 *   ctablep - in/out hash of codepoints >= TR_TABLE_MAX named by negated
 *             selectors (callers pass &nodel).
 *   enc     - encoding used to decode str.
 */
static void
tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
               VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
{
    /* Value trnext returns once the selector is exhausted (see the loop
     * condition below). */
    const unsigned int errc = -1;
    /* Flags contributed by this selector alone, before merging into stable. */
    char buf[TR_TABLE_MAX];
    struct tr tr;
    unsigned int c;
    /* table: hash receiving this selector's large codepoints;
     * ptable: the hash that was in place before, consulted for filtering. */
    VALUE table = 0, ptable = 0;
    int i, l, cflag = 0;

    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
    tr.gen = tr.now = tr.max = 0;

    /* A lone "^" is not a negation: the selector must be longer than one
     * byte for the caret to be consumed. */
    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
        cflag = 1;
        tr.p += l;
    }
    if (first) {
        /* Start from "everything selected" so the AND below reduces to this
         * selector's own flags. */
        for (i=0; i<TR_TABLE_MAX; i++) {
            stable[i] = 1;
        }
        stable[TR_TABLE_MAX] = cflag;
    }
    else if (stable[TR_TABLE_MAX] && !cflag) {
        /* The fallback slot stays set only while every selector seen so far
         * has been negated. */
        stable[TR_TABLE_MAX] = 0;
    }
    /* Default for codepoints the selector does not mention: selected when
     * negated, unselected otherwise. */
    for (i=0; i<TR_TABLE_MAX; i++) {
        buf[i] = cflag;
    }

    /* Walk the codepoints produced by trnext (presumably with ranges and
     * escapes already expanded -- trnext is defined elsewhere). */
    while ((c = trnext(&tr, enc)) != errc) {
        if (c < TR_TABLE_MAX) {
            buf[(unsigned char)c] = !cflag;
        }
        else {
            VALUE key = UINT2NUM(c);

            /* Pick the destination hash lazily, on the first large codepoint
             * of this selector. */
            if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
                if (cflag) {
                    /* Negated selectors accumulate into the existing
                     * *ctablep hash when there is one. */
                    ptable = *ctablep;
                    table = ptable ? ptable : rb_hash_new();
                    *ctablep = table;
                }
                else {
                    /* Non-negated selectors replace *tablep with a fresh
                     * hash; the previous one is kept in ptable as a filter. */
                    table = rb_hash_new();
                    ptable = *tablep;
                    *tablep = table;
                }
            }
            /* Record the key when there is no previous hash, or when its
             * presence in the previous hash differs from cflag (present for
             * a non-negated selector, absent for a negated one). */
            if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
                rb_hash_aset(table, key, Qtrue);
            }
        }
    }
    /* Intersect this selector's flags with the accumulated ones. */
    for (i=0; i<TR_TABLE_MAX; i++) {
        stable[i] = stable[i] && buf[i];
    }
    /* A non-negated selector that stored no large codepoint clears *tablep. */
    if (!table && !cflag) {
        *tablep = 0;
    }
}
8347
8348
8349 static int
8350 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8351 {
8352     if (c < TR_TABLE_MAX) {
8353         return table[c] != 0;
8354     }
8355     else {
8356         VALUE v = UINT2NUM(c);
8357
8358         if (del) {
8359             if (!NIL_P(rb_hash_lookup(del, v)) &&
8360                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8361                 return TRUE;
8362             }
8363         }
8364         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8365             return FALSE;
8366         }
8367         return table[TR_TABLE_MAX] ? TRUE : FALSE;
8368     }
8369 }
8370
8371 /*
8372  *  call-seq:
8373  *    delete!(*selectors) -> self or nil
8374  *
8375  *  Like String#delete, but modifies +self+ in place.
8376  *  Returns +self+ if any changes were made, +nil+ otherwise.
8377  *
8378  */
8379
8380 static VALUE
8381 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8382 {
8383     char squeez[TR_TABLE_SIZE];
8384     rb_encoding *enc = 0;
8385     char *s, *send, *t;
8386     VALUE del = 0, nodel = 0;
8387     int modify = 0;
8388     int i, ascompat, cr;
8389
8390     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8391     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8392     for (i=0; i<argc; i++) {
8393         VALUE s = argv[i];
8394
8395         StringValue(s);
8396         enc = rb_enc_check(str, s);
8397         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8398     }
8399
8400     str_modify_keep_cr(str);
8401     ascompat = rb_enc_asciicompat(enc);
8402     s = t = RSTRING_PTR(str);
8403     send = RSTRING_END(str);
8404     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8405     while (s < send) {
8406         unsigned int c;
8407         int clen;
8408
8409         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8410             if (squeez[c]) {
8411                 modify = 1;
8412             }
8413             else {
8414                 if (t != s) *t = c;
8415                 t++;
8416             }
8417             s++;
8418         }
8419         else {
8420             c = rb_enc_codepoint_len(s, send, &clen, enc);
8421
8422             if (tr_find(c, squeez, del, nodel)) {
8423                 modify = 1;
8424             }
8425             else {
8426                 if (t != s) rb_enc_mbcput(c, t, enc);
8427                 t += clen;
8428                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8429             }
8430             s += clen;
8431         }
8432     }
8433     TERM_FILL(t, TERM_LEN(str));
8434     STR_SET_LEN(str, t - RSTRING_PTR(str));
8435     ENC_CODERANGE_SET(str, cr);
8436
8437     if (modify) return str;
8438     return Qnil;
8439 }
8440
8441
8442 /*
8443  *  call-seq:
8444  *    delete(*selectors) -> new_string
8445  *
8446  *  Returns a copy of +self+ with characters specified by +selectors+ removed
8447  *  (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8448  *
8449  *     "hello".delete "l","lo"        #=> "heo"
8450  *     "hello".delete "lo"            #=> "he"
8451  *     "hello".delete "aeiou", "^e"   #=> "hell"
8452  *     "hello".delete "ej-m"          #=> "ho"
8453  *
8454  */
8455
8456 static VALUE
8457 rb_str_delete(int argc, VALUE *argv, VALUE str)
8458 {
8459     str = str_duplicate(rb_cString, str);
8460     rb_str_delete_bang(argc, argv, str);
8461     return str;
8462 }
8463
8464
8465 /*
8466  *  call-seq:
8467  *    squeeze!(*selectors) -> self or nil
8468  *
8469  *  Like String#squeeze, but modifies +self+ in place.
8470  *  Returns +self+ if any changes were made, +nil+ otherwise.
8471  */
8472
8473 static VALUE
8474 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8475 {
8476     char squeez[TR_TABLE_SIZE];
8477     rb_encoding *enc = 0;
8478     VALUE del = 0, nodel = 0;
8479     unsigned char *s, *send, *t;
8480     int i, modify = 0;
8481     int ascompat, singlebyte = single_byte_optimizable(str);
8482     unsigned int save;
8483
8484     if (argc == 0) {
8485         enc = STR_ENC_GET(str);
8486     }
8487     else {
8488         for (i=0; i<argc; i++) {
8489             VALUE s = argv[i];
8490
8491             StringValue(s);
8492             enc = rb_enc_check(str, s);
8493             if (singlebyte && !single_byte_optimizable(s))
8494                 singlebyte = 0;
8495             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8496         }
8497     }
8498
8499     str_modify_keep_cr(str);
8500     s = t = (unsigned char *)RSTRING_PTR(str);
8501     if (!s || RSTRING_LEN(str) == 0) return Qnil;
8502     send = (unsigned char *)RSTRING_END(str);
8503     save = -1;
8504     ascompat = rb_enc_asciicompat(enc);
8505
8506     if (singlebyte) {
8507         while (s < send) {
8508             unsigned int c = *s++;
8509             if (c != save || (argc > 0 && !squeez[c])) {
8510                 *t++ = save = c;
8511             }
8512         }
8513     }
8514     else {
8515         while (s < send) {
8516             unsigned int c;
8517             int clen;
8518
8519             if (ascompat && (c = *s) < 0x80) {
8520                 if (c != save || (argc > 0 && !squeez[c])) {
8521                     *t++ = save = c;
8522                 }
8523                 s++;
8524             }
8525             else {
8526                 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8527
8528                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8529                     if (t != s) rb_enc_mbcput(c, t, enc);
8530                     save = c;
8531                     t += clen;
8532                 }
8533                 s += clen;
8534             }
8535         }
8536     }
8537
8538     TERM_FILL((char *)t, TERM_LEN(str));
8539     if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8540         STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8541         modify = 1;
8542     }
8543
8544     if (modify) return str;
8545     return Qnil;
8546 }
8547
8548
8549 /*
8550  *  call-seq:
8551  *    squeeze(*selectors) -> new_string
8552  *
8553  *  Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8554  *  (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8555  *
8556  *  "Squeezed" means that each multiple-character run of a selected character
8557  *  is squeezed down to a single character;
8558  *  with no arguments given, squeezes all characters:
8559  *
8560  *     "yellow moon".squeeze                  #=> "yelow mon"
8561  *     "  now   is  the".squeeze(" ")         #=> " now is the"
8562  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
8563  *
8564  */
8565
8566 static VALUE
8567 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8568 {
8569     str = str_duplicate(rb_cString, str);
8570     rb_str_squeeze_bang(argc, argv, str);
8571     return str;
8572 }
8573
8574
8575 /*
8576  *  call-seq:
8577  *    tr_s!(selector, replacements) -> self or nil
8578  *
8579  *  Like String#tr_s, but modifies +self+ in place.
8580  *  Returns +self+ if any changes were made, +nil+ otherwise.
8581  *
8582  *  Related: String#squeeze!.
8583  */
8584
8585 static VALUE
8586 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8587 {
8588     return tr_trans(str, src, repl, 1);
8589 }
8590
8591
8592 /*
8593  *  call-seq:
8594  *    tr_s(selector, replacements) -> string
8595  *
8596  *  Like String#tr, but also squeezes the modified portions of the translated string;
8597  *  returns a new string (translated and squeezed).
8598  *
8599  *    'hello'.tr_s('l', 'r')   #=> "hero"
8600  *    'hello'.tr_s('el', '-')  #=> "h-o"
8601  *    'hello'.tr_s('el', 'hx') #=> "hhxo"
8602  *
8603  *  Related: String#squeeze.
8604  *
8605  */
8606
8607 static VALUE
8608 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8609 {
8610     str = str_duplicate(rb_cString, str);
8611     tr_trans(str, src, repl, 1);
8612     return str;
8613 }
8614
8615
8616 /*
8617  *  call-seq:
8618  *    count(*selectors) -> integer
8619  *
8620  *  Returns the total number of characters in +self+
8621  *  that are specified by the given +selectors+
8622  *  (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8623  *
8624  *     a = "hello world"
8625  *     a.count "lo"                   #=> 5
8626  *     a.count "lo", "o"              #=> 2
8627  *     a.count "hello", "^l"          #=> 4
8628  *     a.count "ej-m"                 #=> 4
8629  *
8630  *     "hello^world".count "\\^aeiou" #=> 4
8631  *     "hello-world".count "a\\-eo"   #=> 4
8632  *
8633  *     c = "hello world\\r\\n"
8634  *     c.count "\\"                   #=> 2
8635  *     c.count "\\A"                  #=> 0
8636  *     c.count "X-\\w"                #=> 3
8637  */
8638
8639 static VALUE
8640 rb_str_count(int argc, VALUE *argv, VALUE str)
8641 {
8642     char table[TR_TABLE_SIZE];
8643     rb_encoding *enc = 0;
8644     VALUE del = 0, nodel = 0, tstr;
8645     char *s, *send;
8646     int i;
8647     int ascompat;
8648     size_t n = 0;
8649
8650     rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8651
8652     tstr = argv[0];
8653     StringValue(tstr);
8654     enc = rb_enc_check(str, tstr);
8655     if (argc == 1) {
8656         const char *ptstr;
8657         if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8658             (ptstr = RSTRING_PTR(tstr),
8659              ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8660             !is_broken_string(str)) {
8661             int clen;
8662             unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8663
8664             s = RSTRING_PTR(str);
8665             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8666             send = RSTRING_END(str);
8667             while (s < send) {
8668                 if (*(unsigned char*)s++ == c) n++;
8669             }
8670             return SIZET2NUM(n);
8671         }
8672     }
8673
8674     tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8675     for (i=1; i<argc; i++) {
8676         tstr = argv[i];
8677         StringValue(tstr);
8678         enc = rb_enc_check(str, tstr);
8679         tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8680     }
8681
8682     s = RSTRING_PTR(str);
8683     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8684     send = RSTRING_END(str);
8685     ascompat = rb_enc_asciicompat(enc);
8686     while (s < send) {
8687         unsigned int c;
8688
8689         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8690             if (table[c]) {
8691                 n++;
8692             }
8693             s++;
8694         }
8695         else {
8696             int clen;
8697             c = rb_enc_codepoint_len(s, send, &clen, enc);
8698             if (tr_find(c, table, del, nodel)) {
8699                 n++;
8700             }
8701             s += clen;
8702         }
8703     }
8704
8705     return SIZET2NUM(n);
8706 }
8707
8708 static VALUE
8709 rb_fs_check(VALUE val)
8710 {
8711     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8712         val = rb_check_string_type(val);
8713         if (NIL_P(val)) return 0;
8714     }
8715     return val;
8716 }
8717
/* Lookup table for ASCII whitespace, indexed by byte value.  Only the six
 * entries listed here are 1; every other entry is zero-initialized. */
static const char isspacetable[256] = {
    [0x09] = 1, /* '\t' horizontal tab */
    [0x0a] = 1, /* '\n' line feed */
    [0x0b] = 1, /* '\v' vertical tab */
    [0x0c] = 1, /* '\f' form feed */
    [0x0d] = 1, /* '\r' carriage return */
    [0x20] = 1, /* ' '  space */
};

/* Nonzero when +c+ is one of the whitespace bytes above.  The cast to
 * unsigned char keeps a negative char value from indexing below the table. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8738
8739 static long
8740 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8741 {
8742     if (empty_count >= 0 && len == 0) {
8743         return empty_count + 1;
8744     }
8745     if (empty_count > 0) {
8746         /* make different substrings */
8747         if (result) {
8748             do {
8749                 rb_ary_push(result, str_new_empty_String(str));
8750             } while (--empty_count > 0);
8751         }
8752         else {
8753             do {
8754                 rb_yield(str_new_empty_String(str));
8755             } while (--empty_count > 0);
8756         }
8757     }
8758     str = rb_str_subseq(str, beg, len);
8759     if (result) {
8760         rb_ary_push(result, str);
8761     }
8762     else {
8763         rb_yield(str);
8764     }
8765     return empty_count;
8766 }
8767
/* Splitting strategy chosen by rb_str_split_m for a given separator. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace, skipping leading ones */
    SPLIT_TYPE_STRING,  /* split on a literal separator found by byte search */
    SPLIT_TYPE_REGEXP,  /* split on matches of a regular expression */
    SPLIT_TYPE_CHARS    /* empty separator: split into single characters */
} split_type_t;
8771
8772 static split_type_t
8773 literal_split_pattern(VALUE spat, split_type_t default_type)
8774 {
8775     rb_encoding *enc = STR_ENC_GET(spat);
8776     const char *ptr;
8777     long len;
8778     RSTRING_GETMEM(spat, ptr, len);
8779     if (len == 0) {
8780         /* Special case - split into chars */
8781         return SPLIT_TYPE_CHARS;
8782     }
8783     else if (rb_enc_asciicompat(enc)) {
8784         if (len == 1 && ptr[0] == ' ') {
8785             return SPLIT_TYPE_AWK;
8786         }
8787     }
8788     else {
8789         int l;
8790         if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8791             return SPLIT_TYPE_AWK;
8792         }
8793     }
8794     return default_type;
8795 }
8796
8797 /*
8798  *  call-seq:
8799  *    split(field_sep = $;, limit = nil) -> array
8800  *    split(field_sep = $;, limit = nil) {|substring| ... } -> self
8801  *
8802  *  :include: doc/string/split.rdoc
8803  *
8804  */
8805
static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    /*
     * Implementation of String#split.  Picks one of four strategies from the
     * separator (see split_type_t), walks the string emitting fields through
     * SPLIT_STR, and finally emits whatever is left after the last separator.
     * Fields are collected in an array, or yielded when a block is given, in
     * which case str itself is returned.
     */
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    split_type_t split_type;
    /* beg: byte offset where the pending field starts; i: number of fields
     * counted against lim; empty_count: deferred-empty-field counter handed
     * to split_string (negative disables the deferral). */
    long beg, end, i = 0, empty_count = -1;
    int lim = 0;
    VALUE result, tmp;

    /* Qfalse marks "yield to the block"; otherwise result is replaced by a
     * fresh array inside the chosen branch.  The C truth tests on result
     * below rely on Qfalse being zero and Qnil being non-zero (presumably;
     * see the VALUE special constants). */
    result = rb_block_given_p() ? Qfalse : Qnil;
    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        /* A zero or negative limit means "no limit"; lim keeps its sign so
         * the two cases can still be told apart further down. */
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            /* Limit 1: the whole string is the only field. */
            if (RSTRING_LEN(str) == 0)
                return result ? rb_ary_new2(0) : str;
            tmp = str_duplicate(rb_cString, str);
            if (!result) {
                rb_yield(tmp);
                return str;
            }
            return rb_ary_new3(1, tmp);
        }
        i = 1;
    }
    /* Empty fields are deferred (and so dropped at the end) only when no
     * limit was given or the limit was exactly 0. */
    if (NIL_P(limit) && !lim) empty_count = 0;

    enc = STR_ENC_GET(str);
    split_type = SPLIT_TYPE_REGEXP;
    if (!NIL_P(spat)) {
        /* NOTE(review): get_pat_quoted is defined elsewhere; the switch
         * below expects it to return a String or a Regexp. */
        spat = get_pat_quoted(spat, 0);
    }
    else if (NIL_P(spat = rb_fs)) {
        /* No separator argument and $; is nil: whitespace splitting. */
        split_type = SPLIT_TYPE_AWK;
    }
    else if (!(spat = rb_fs_check(spat))) {
        rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
    }
    else {
        rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
    }
    if (split_type != SPLIT_TYPE_AWK) {
        switch (BUILTIN_TYPE(spat)) {
          case T_REGEXP:
            rb_reg_options(spat); /* check if uninitialized */
            tmp = RREGEXP_SRC(spat);
            split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
            /* A regexp whose source is a single space is split as that
             * literal string, not in awk mode. */
            if (split_type == SPLIT_TYPE_AWK) {
                spat = tmp;
                split_type = SPLIT_TYPE_STRING;
            }
            break;

          case T_STRING:
            mustnot_broken(spat);
            split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
            break;

          default:
            UNREACHABLE_RETURN(Qnil);
        }
    }

/* Emit the field str[beg, len] and keep the deferred-empties counter current. */
#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))

    beg = 0;
    /* NOTE(review): ptr and eptr are read once here.  If a yielded block
     * modifies str these pointers may go stale -- confirm whether that is
     * guarded against elsewhere. */
    char *ptr = RSTRING_PTR(str);
    char *eptr = RSTRING_END(str);
    if (split_type == SPLIT_TYPE_AWK) {
        char *bptr = ptr;
        /* skip is set while still before a field (leading whitespace or
         * whitespace following a separator). */
        int skip = 1;
        unsigned int c;

        if (result) result = rb_ary_new();
        end = beg;
        if (is_ascii_string(str)) {
            /* Byte-wise scan using the ASCII whitespace table. */
            while (ptr < eptr) {
                c = (unsigned char)*ptr++;
                if (skip) {
                    if (ascii_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        /* Limit reached: the rest of the string, starting at
                         * beg, becomes the final field. */
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (ascii_isspace(c)) {
                    SPLIT_STR(beg, end-beg);
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
        else {
            /* Same state machine, decoding one character at a time. */
            while (ptr < eptr) {
                int n;

                c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
                ptr += n;
                if (skip) {
                    if (rb_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (rb_isspace(c)) {
                    SPLIT_STR(beg, end-beg);
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else if (split_type == SPLIT_TYPE_STRING) {
        char *str_start = ptr;
        char *substr_start = ptr;
        char *sptr = RSTRING_PTR(spat);
        long slen = RSTRING_LEN(spat);

        if (result) result = rb_ary_new();
        mustnot_broken(str);
        enc = rb_enc_check(str, spat);
        /* end is the separator's offset relative to ptr, or negative when
         * it does not occur in the remainder. */
        while (ptr < eptr &&
               (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
            /* Check we are at the start of a char */
            char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
            if (t != ptr + end) {
                /* The hit is inside a multibyte character: resume the search
                 * from the next character head. */
                ptr = t;
                continue;
            }
            SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
            ptr += end + slen;
            substr_start = ptr;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - str_start;
    }
    else if (split_type == SPLIT_TYPE_CHARS) {
        char *str_start = ptr;
        int n;

        if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
        mustnot_broken(str);
        enc = rb_enc_get(str);
        /* One field per character. */
        while (ptr < eptr &&
               (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
            SPLIT_STR(ptr - str_start, n);
            ptr += n;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - str_start;
    }
    else {
        if (result) result = rb_ary_new();
        long len = RSTRING_LEN(str);
        long start = beg;
        long idx;
        /* Set once an empty match has been seen at the current position and
         * the search has been retried one character further on. */
        int last_null = 0;
        struct re_registers *regs;
        VALUE match = 0;

        /* After each pass that reaches the loop's increment step, the match
         * is un-busied and installed again as the backref. */
        for (; rb_reg_search(spat, str, start, 0) >= 0;
             (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
            match = rb_backref_get();
            /* result is an array by now unless a block was given. */
            if (!result) rb_match_busy(match);
            regs = RMATCH_REGS(match);
            end = BEG(0);
            if (start == end && BEG(0) == END(0)) {
                /* Empty match exactly at the search position. */
                if (!ptr) {
                    SPLIT_STR(0, 0);
                    break;
                }
                else if (last_null == 1) {
                    /* Second empty match in a row: emit one character. */
                    SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
                    beg = start;
                }
                else {
                    /* First empty match: step over one character (or past
                     * the end) and search again before emitting anything. */
                    if (start == len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                SPLIT_STR(beg, end-beg);
                beg = start = END(0);
            }
            last_null = 0;

            /* Capture groups that took part in the match are emitted too. */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        if (match) rb_match_unbusy(match);
    }
    /* Emit the remainder after the last separator: always when a positive
     * or negative limit was given, otherwise only when bytes are left. */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        SPLIT_STR(beg, RSTRING_LEN(str)-beg);
    }

    return result ? result : str;
}
9027
9028 VALUE
9029 rb_str_split(VALUE str, const char *sep0)
9030 {
9031     VALUE sep;
9032
9033     StringValue(str);
9034     sep = rb_str_new_cstr(sep0);
9035     return rb_str_split_m(1, &sep, str);
9036 }
9037
9038 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9039
9040 static inline int
9041 enumerator_element(VALUE ary, VALUE e)
9042 {
9043     if (ary) {
9044         rb_ary_push(ary, e);
9045         return 0;
9046     }
9047     else {
9048         rb_yield(e);
9049         return 1;
9050     }
9051 }
9052
9053 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9054
9055 static const char *
9056 chomp_newline(const char *p, const char *e, rb_encoding *enc)
9057 {
9058     const char *prev = rb_enc_prev_char(p, e, e, enc);
9059     if (rb_enc_is_newline(prev, e, enc)) {
9060         e = prev;
9061         prev = rb_enc_prev_char(p, e, e, enc);
9062         if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9063             e = prev;
9064     }
9065     return e;
9066 }
9067
9068 static VALUE
9069 get_rs(void)
9070 {
9071     VALUE rs = rb_rs;
9072     if (!NIL_P(rs) &&
9073         (!RB_TYPE_P(rs, T_STRING) ||
9074          RSTRING_LEN(rs) != 1 ||
9075          RSTRING_PTR(rs)[0] != '\n')) {
9076         rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9077     }
9078     return rs;
9079 }
9080
9081 #define rb_rs get_rs()
9082
/*
 * Shared worker for String#each_line and String#lines.
 *
 * argc/argv hold an optional separator followed by optional keywords, of
 * which only +chomp+ is looked up.  Without a separator argument the value
 * of rb_rs is used.  Every line is passed to ENUM_ELEM(): appended to +ary+
 * when +ary+ is non-zero, yielded to the block otherwise.
 *
 * Returns +ary+ when it is non-zero, otherwise the original receiver.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    /* no separator argument given: fall back to rb_rs */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        /* the keyword ID is interned once and kept in a function-local static */
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* collapse the option to a plain true/false flag */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* nil separator: the whole string is the one and only "line" */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* from here on work on a frozen copy; `orig` keeps the receiver */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    /* the default separator skips the compatibility check against str */
    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        /* end of the most recent newline that directly followed paragraph text */
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n = byte length of a CR at subend, or 0 when there is none */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                /* rslen temporarily holds the length of [CR +] the next character */
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* a second newline right after the first one ends the paragraph;
                     * rslen is left non-zero so that newline is included below */
                    if (eol == subend) break;
                    subend += rslen;
                    /* only a newline following paragraph text is remembered;
                     * newlines seen before any text are simply skipped */
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    /* first non-newline character starts a new paragraph */
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            /* only newlines were left: nothing more to emit */
            if (!subptr) break;
            /* loop ran off the end of the string rather than hitting a blank
             * line: nothing is trimmed from this last paragraph */
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            /* ENUM_ELEM() returns non-zero only when a block was run */
            if (ENUM_ELEM(ary, line)) {
                /* NOTE(review): presumably raises if str was changed by the block — confirm */
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* separator is exactly one minimum-length character and it is a newline */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* default separator with a non-ASCII-compatible string: search for the
     * separator re-encoded into the string's encoding instead */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* ignore matches that do not start on a character boundary and
         * resume the search from the next character head */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator; the line ends there unless chomped */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                /* newline separator: chomp_newline() also drops a preceding CR */
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* whatever follows the last separator found is the final line */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9222
9223 /*
9224  *  call-seq:
9225  *    each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9226  *    each_line(line_sep = $/, chomp: false)                    -> enumerator
9227  *
9228  *  :include: doc/string/each_line.rdoc
9229  *
9230  */
9231
9232 static VALUE
9233 rb_str_each_line(int argc, VALUE *argv, VALUE str)
9234 {
9235     RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9236     return rb_str_enumerate_lines(argc, argv, str, 0);
9237 }
9238
9239 /*
9240  *  call-seq:
9241  *    lines(Line_sep = $/, chomp: false) -> array_of_strings
9242  *
9243  *  Forms substrings ("lines") of +self+ according to the given arguments
9244  *  (see String#each_line for details); returns the lines in an array.
9245  *
9246  */
9247
9248 static VALUE
9249 rb_str_lines(int argc, VALUE *argv, VALUE str)
9250 {
9251     VALUE ary = WANTARRAY("lines", 0);
9252     return rb_str_enumerate_lines(argc, argv, str, ary);
9253 }
9254
9255 static VALUE
9256 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9257 {
9258     return LONG2FIX(RSTRING_LEN(str));
9259 }
9260
9261 static VALUE
9262 rb_str_enumerate_bytes(VALUE str, VALUE ary)
9263 {
9264     long i;
9265
9266     for (i=0; i<RSTRING_LEN(str); i++) {
9267         ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9268     }
9269     if (ary)
9270         return ary;
9271     else
9272         return str;
9273 }
9274
9275 /*
9276  *  call-seq:
9277  *    each_byte {|byte| ... } -> self
9278  *    each_byte               -> enumerator
9279  *
9280  *  :include: doc/string/each_byte.rdoc
9281  *
9282  */
9283
9284 static VALUE
9285 rb_str_each_byte(VALUE str)
9286 {
9287     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9288     return rb_str_enumerate_bytes(str, 0);
9289 }
9290
9291 /*
9292  *  call-seq:
9293  *    bytes -> array_of_bytes
9294  *
9295  *  :include: doc/string/bytes.rdoc
9296  *
9297  */
9298
9299 static VALUE
9300 rb_str_bytes(VALUE str)
9301 {
9302     VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9303     return rb_str_enumerate_bytes(str, ary);
9304 }
9305
9306 static VALUE
9307 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9308 {
9309     return rb_str_length(str);
9310 }
9311
9312 static VALUE
9313 rb_str_enumerate_chars(VALUE str, VALUE ary)
9314 {
9315     VALUE orig = str;
9316     long i, len, n;
9317     const char *ptr;
9318     rb_encoding *enc;
9319
9320     str = rb_str_new_frozen(str);
9321     ptr = RSTRING_PTR(str);
9322     len = RSTRING_LEN(str);
9323     enc = rb_enc_get(str);
9324
9325     if (ENC_CODERANGE_CLEAN_P(ENC_CODERANGE(str))) {
9326         for (i = 0; i < len; i += n) {
9327             n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9328             ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9329         }
9330     }
9331     else {
9332         for (i = 0; i < len; i += n) {
9333             n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9334             ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9335         }
9336     }
9337     RB_GC_GUARD(str);
9338     if (ary)
9339         return ary;
9340     else
9341         return orig;
9342 }
9343
9344 /*
9345  *  call-seq:
9346  *    each_char {|c| ... } -> self
9347  *    each_char            -> enumerator
9348  *
9349  *  :include: doc/string/each_char.rdoc
9350  *
9351  */
9352
9353 static VALUE
9354 rb_str_each_char(VALUE str)
9355 {
9356     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9357     return rb_str_enumerate_chars(str, 0);
9358 }
9359
9360 /*
9361  *  call-seq:
9362  *    chars -> array_of_characters
9363  *
9364  *  :include: doc/string/chars.rdoc
9365  *
9366  */
9367
9368 static VALUE
9369 rb_str_chars(VALUE str)
9370 {
9371     VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9372     return rb_str_enumerate_chars(str, ary);
9373 }
9374
9375 static VALUE
9376 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9377 {
9378     VALUE orig = str;
9379     int n;
9380     unsigned int c;
9381     const char *ptr, *end;
9382     rb_encoding *enc;
9383
9384     if (single_byte_optimizable(str))
9385         return rb_str_enumerate_bytes(str, ary);
9386
9387     str = rb_str_new_frozen(str);
9388     ptr = RSTRING_PTR(str);
9389     end = RSTRING_END(str);
9390     enc = STR_ENC_GET(str);
9391
9392     while (ptr < end) {
9393         c = rb_enc_codepoint_len(ptr, end, &n, enc);
9394         ENUM_ELEM(ary, UINT2NUM(c));
9395         ptr += n;
9396     }
9397     RB_GC_GUARD(str);
9398     if (ary)
9399         return ary;
9400     else
9401         return orig;
9402 }
9403
9404 /*
9405  *  call-seq:
9406  *    each_codepoint {|integer| ... } -> self
9407  *    each_codepoint                  -> enumerator
9408  *
9409  *  :include: doc/string/each_codepoint.rdoc
9410  *
9411  */
9412
9413 static VALUE
9414 rb_str_each_codepoint(VALUE str)
9415 {
9416     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9417     return rb_str_enumerate_codepoints(str, 0);
9418 }
9419
9420 /*
9421  *  call-seq:
9422  *    codepoints -> array_of_integers
9423  *
9424  *  :include: doc/string/codepoints.rdoc
9425  *
9426  */
9427
9428 static VALUE
9429 rb_str_codepoints(VALUE str)
9430 {
9431     VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9432     return rb_str_enumerate_codepoints(str, ary);
9433 }
9434
/*
 * Compiles and returns a new regex for the two-character pattern
 * backslash-X in encoding +enc+ (the pattern used here to match one
 * grapheme cluster).
 *
 * The pattern text is the ASCII bytes of that pattern by default; for the
 * UTF-16BE/LE and UTF-32BE/LE encoding indexes it is spelled out in that
 * encoding's code units instead.  A compilation failure is reported through
 * rb_fatal().  Nothing in this function caches or frees the result.
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);

    const OnigUChar source_ascii[] = "\\X";
    const OnigUChar *source = source_ascii;
    /* sizeof counts the terminating NUL of the literal; drop it */
    size_t source_len = sizeof(source_ascii) - 1;

    switch (encidx) {
/* CHARS_* expand one character into its bytes in the named byte order */
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
/* CASE_UTF(e) emits a case label that switches `source` to a static table
 * holding the pattern encoded for ENCINDEX_UTF_<e>; that table has no
 * terminator, so its full sizeof is the pattern length */
#define CASE_UTF(e) \
      case ENCINDEX_UTF_##e: { \
        static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
        source = source_UTF_##e; \
        source_len = sizeof(source_UTF_##e); \
        break; \
      }
        CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
    }

    regex_t *reg_grapheme_cluster;
    OnigErrorInfo einfo;
    int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                        ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
    /* a non-zero result is treated as a compilation error */
    if (r) {
        UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str(message, r, &einfo);
        rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
    }

    return reg_grapheme_cluster;
}
9476
9477 static regex_t *
9478 get_cached_reg_grapheme_cluster(rb_encoding *enc)
9479 {
9480     int encidx = rb_enc_to_index(enc);
9481     static regex_t *reg_grapheme_cluster_utf8 = NULL;
9482
9483     if (encidx == rb_utf8_encindex()) {
9484         if (!reg_grapheme_cluster_utf8) {
9485             reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9486         }
9487
9488         return reg_grapheme_cluster_utf8;
9489     }
9490
9491     return NULL;
9492 }
9493
9494 static VALUE
9495 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9496 {
9497     size_t grapheme_cluster_count = 0;
9498     rb_encoding *enc = get_encoding(str);
9499     const char *ptr, *end;
9500
9501     if (!rb_enc_unicode_p(enc)) {
9502         return rb_str_length(str);
9503     }
9504
9505     bool cached_reg_grapheme_cluster = true;
9506     regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9507     if (!reg_grapheme_cluster) {
9508         reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9509         cached_reg_grapheme_cluster = false;
9510     }
9511
9512     ptr = RSTRING_PTR(str);
9513     end = RSTRING_END(str);
9514
9515     while (ptr < end) {
9516         OnigPosition len = onig_match(reg_grapheme_cluster,
9517                                       (const OnigUChar *)ptr, (const OnigUChar *)end,
9518                                       (const OnigUChar *)ptr, NULL, 0);
9519         if (len <= 0) break;
9520         grapheme_cluster_count++;
9521         ptr += len;
9522     }
9523
9524     if (!cached_reg_grapheme_cluster) {
9525         onig_free(reg_grapheme_cluster);
9526     }
9527
9528     return SIZET2NUM(grapheme_cluster_count);
9529 }
9530
9531 static VALUE
9532 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9533 {
9534     VALUE orig = str;
9535     rb_encoding *enc = get_encoding(str);
9536     const char *ptr0, *ptr, *end;
9537
9538     if (!rb_enc_unicode_p(enc)) {
9539         return rb_str_enumerate_chars(str, ary);
9540     }
9541
9542     if (!ary) str = rb_str_new_frozen(str);
9543
9544     bool cached_reg_grapheme_cluster = true;
9545     regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9546     if (!reg_grapheme_cluster) {
9547         reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9548         cached_reg_grapheme_cluster = false;
9549     }
9550
9551     ptr0 = ptr = RSTRING_PTR(str);
9552     end = RSTRING_END(str);
9553
9554     while (ptr < end) {
9555         OnigPosition len = onig_match(reg_grapheme_cluster,
9556                                       (const OnigUChar *)ptr, (const OnigUChar *)end,
9557                                       (const OnigUChar *)ptr, NULL, 0);
9558         if (len <= 0) break;
9559         ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9560         ptr += len;
9561     }
9562
9563     if (!cached_reg_grapheme_cluster) {
9564         onig_free(reg_grapheme_cluster);
9565     }
9566
9567     RB_GC_GUARD(str);
9568     if (ary)
9569         return ary;
9570     else
9571         return orig;
9572 }
9573
9574 /*
9575  *  call-seq:
9576  *    each_grapheme_cluster {|gc| ... } -> self
9577  *    each_grapheme_cluster             -> enumerator
9578  *
9579  *  :include: doc/string/each_grapheme_cluster.rdoc
9580  *
9581  */
9582
9583 static VALUE
9584 rb_str_each_grapheme_cluster(VALUE str)
9585 {
9586     RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9587     return rb_str_enumerate_grapheme_clusters(str, 0);
9588 }
9589
9590 /*
9591  *  call-seq:
9592  *    grapheme_clusters -> array_of_grapheme_clusters
9593  *
9594  *  :include: doc/string/grapheme_clusters.rdoc
9595  *
9596  */
9597
9598 static VALUE
9599 rb_str_grapheme_clusters(VALUE str)
9600 {
9601     VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9602     return rb_str_enumerate_grapheme_clusters(str, ary);
9603 }
9604
9605 static long
9606 chopped_length(VALUE str)
9607 {
9608     rb_encoding *enc = STR_ENC_GET(str);
9609     const char *p, *p2, *beg, *end;
9610
9611     beg = RSTRING_PTR(str);
9612     end = beg + RSTRING_LEN(str);
9613     if (beg >= end) return 0;
9614     p = rb_enc_prev_char(beg, end, end, enc);
9615     if (!p) return 0;
9616     if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9617         p2 = rb_enc_prev_char(beg, p, end, enc);
9618         if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9619     }
9620     return p - beg;
9621 }
9622
9623 /*
9624  *  call-seq:
9625  *    chop! -> self or nil
9626  *
9627  *  Like String#chop, but modifies +self+ in place;
9628  *  returns +nil+ if +self+ is empty, +self+ otherwise.
9629  *
9630  *  Related: String#chomp!.
9631  */
9632
9633 static VALUE
9634 rb_str_chop_bang(VALUE str)
9635 {
9636     str_modify_keep_cr(str);
9637     if (RSTRING_LEN(str) > 0) {
9638         long len;
9639         len = chopped_length(str);
9640         STR_SET_LEN(str, len);
9641         TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9642         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9643             ENC_CODERANGE_CLEAR(str);
9644         }
9645         return str;
9646     }
9647     return Qnil;
9648 }
9649
9650
9651 /*
9652  *  call-seq:
9653  *    chop -> new_string
9654  *
9655  *  :include: doc/string/chop.rdoc
9656  *
9657  */
9658
9659 static VALUE
9660 rb_str_chop(VALUE str)
9661 {
9662     return rb_str_subseq(str, 0, chopped_length(str));
9663 }
9664
9665 static long
9666 smart_chomp(VALUE str, const char *e, const char *p)
9667 {
9668     rb_encoding *enc = rb_enc_get(str);
9669     if (rb_enc_mbminlen(enc) > 1) {
9670         const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9671         if (rb_enc_is_newline(pp, e, enc)) {
9672             e = pp;
9673         }
9674         pp = e - rb_enc_mbminlen(enc);
9675         if (pp >= p) {
9676             pp = rb_enc_left_char_head(p, pp, e, enc);
9677             if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9678                 e = pp;
9679             }
9680         }
9681     }
9682     else {
9683         switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9684           case '\n':
9685             if (--e > p && *(e-1) == '\r') {
9686                 --e;
9687             }
9688             break;
9689           case '\r':
9690             --e;
9691             break;
9692         }
9693     }
9694     return e - p;
9695 }
9696
/*
 * Computes the byte length +str+ would have after removing a trailing
 * record separator +rs+.  Nothing is modified here.
 *
 * - empty +str+: 0.
 * - +rs+ is rb_default_rs: delegated to smart_chomp().
 * - empty +rs+: every trailing newline is removed, each together with a CR
 *   directly in front of it.
 * - +rs+ longer than +str+: the full length (nothing to remove).
 * - +rs+ is a single minimum-length character that is a newline:
 *   delegated to smart_chomp().
 * - otherwise: +rs+ is removed only when it equals the tail of +str+ byte
 *   for byte and at_char_boundary() accepts its start position.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    /* here enc is the encoding of str */
    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* empty separator: strip all trailing newlines */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                /* look one minimum-length unit further back for a CR */
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    if (rslen > len) return len;

    /* from here enc is the encoding of rs */
    enc = rb_enc_get(rs);
    /* last byte of the separator; used below as a cheap pre-check */
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    /* finally enc is the result of the compatibility check of str and rs */
    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    /* for a one-byte separator the last-byte comparison is already the
     * complete comparison, so memcmp() is skipped */
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        /* NOTE(review): presumably rejects a match that starts in the
         * middle of a multibyte character -- confirm in at_char_boundary() */
        if (at_char_boundary(p, pp, e, enc))
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
9768
9769 /*!
9770  * Returns the separator for arguments of rb_str_chomp.
9771  *
9772  * @return returns rb_rs ($/) as default, the default value of rb_rs ($/) is "\n".
9773  */
9774 static VALUE
9775 chomp_rs(int argc, const VALUE *argv)
9776 {
9777     rb_check_arity(argc, 0, 1);
9778     if (argc > 0) {
9779         VALUE rs = argv[0];
9780         if (!NIL_P(rs)) StringValue(rs);
9781         return rs;
9782     }
9783     else {
9784         return rb_rs;
9785     }
9786 }
9787
9788 VALUE
9789 rb_str_chomp_string(VALUE str, VALUE rs)
9790 {
9791     long olen = RSTRING_LEN(str);
9792     long len = chompped_length(str, rs);
9793     if (len >= olen) return Qnil;
9794     str_modify_keep_cr(str);
9795     STR_SET_LEN(str, len);
9796     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9797     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9798         ENC_CODERANGE_CLEAR(str);
9799     }
9800     return str;
9801 }
9802
9803 /*
9804  *  call-seq:
9805  *    chomp!(line_sep = $/) -> self or nil
9806  *
9807  *  Like String#chomp, but modifies +self+ in place;
9808  *  returns +nil+ if no modification made, +self+ otherwise.
9809  *
9810  */
9811
9812 static VALUE
9813 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9814 {
9815     VALUE rs;
9816     str_modifiable(str);
9817     if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
9818     rs = chomp_rs(argc, argv);
9819     if (NIL_P(rs)) return Qnil;
9820     return rb_str_chomp_string(str, rs);
9821 }
9822
9823
9824 /*
9825  *  call-seq:
9826  *    chomp(line_sep = $/) -> new_string
9827  *
9828  *  :include: doc/string/chomp.rdoc
9829  *
9830  */
9831
9832 static VALUE
9833 rb_str_chomp(int argc, VALUE *argv, VALUE str)
9834 {
9835     VALUE rs = chomp_rs(argc, argv);
9836     if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9837     return rb_str_subseq(str, 0, chompped_length(str, rs));
9838 }
9839
9840 static long
9841 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9842 {
9843     const char *const start = s;
9844
9845     if (!s || s >= e) return 0;
9846
9847     /* remove spaces at head */
9848     if (single_byte_optimizable(str)) {
9849         while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9850     }
9851     else {
9852         while (s < e) {
9853             int n;
9854             unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9855
9856             if (cc && !rb_isspace(cc)) break;
9857             s += n;
9858         }
9859     }
9860     return s - start;
9861 }
9862
9863 /*
9864  *  call-seq:
9865  *    lstrip! -> self or nil
9866  *
9867  *  Like String#lstrip, except that any modifications are made in +self+;
9868  *  returns +self+ if any modification are made, +nil+ otherwise.
9869  *
9870  *  Related: String#rstrip!, String#strip!.
9871  */
9872
9873 static VALUE
9874 rb_str_lstrip_bang(VALUE str)
9875 {
9876     rb_encoding *enc;
9877     char *start, *s;
9878     long olen, loffset;
9879
9880     str_modify_keep_cr(str);
9881     enc = STR_ENC_GET(str);
9882     RSTRING_GETMEM(str, start, olen);
9883     loffset = lstrip_offset(str, start, start+olen, enc);
9884     if (loffset > 0) {
9885         long len = olen-loffset;
9886         s = start + loffset;
9887         memmove(start, s, len);
9888         STR_SET_LEN(str, len);
9889         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9890         return str;
9891     }
9892     return Qnil;
9893 }
9894
9895
9896 /*
9897  *  call-seq:
9898  *    lstrip -> new_string
9899  *
9900  *  Returns a copy of +self+ with leading whitespace removed;
9901  *  see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9902  *
9903  *    whitespace = "\x00\t\n\v\f\r "
9904  *    s = whitespace + 'abc' + whitespace
9905  *    s        # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9906  *    s.lstrip # => "abc\u0000\t\n\v\f\r "
9907  *
9908  *  Related: String#rstrip, String#strip.
9909  */
9910
9911 static VALUE
9912 rb_str_lstrip(VALUE str)
9913 {
9914     char *start;
9915     long len, loffset;
9916     RSTRING_GETMEM(str, start, len);
9917     loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9918     if (loffset <= 0) return str_duplicate(rb_cString, str);
9919     return rb_str_subseq(str, loffset, len - loffset);
9920 }
9921
9922 static long
9923 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9924 {
9925     const char *t;
9926
9927     rb_str_check_dummy_enc(enc);
9928     if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
9929         rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9930     }
9931     if (!s || s >= e) return 0;
9932     t = e;
9933
9934     /* remove trailing spaces or '\0's */
9935     if (single_byte_optimizable(str)) {
9936         unsigned char c;
9937         while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9938     }
9939     else {
9940         char *tp;
9941
9942         while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9943             unsigned int c = rb_enc_codepoint(tp, e, enc);
9944             if (c && !rb_isspace(c)) break;
9945             t = tp;
9946         }
9947     }
9948     return e - t;
9949 }
9950
9951 /*
9952  *  call-seq:
9953  *    rstrip! -> self or nil
9954  *
9955  *  Like String#rstrip, except that any modifications are made in +self+;
9956  *  returns +self+ if any modification are made, +nil+ otherwise.
9957  *
9958  *  Related: String#lstrip!, String#strip!.
9959  */
9960
9961 static VALUE
9962 rb_str_rstrip_bang(VALUE str)
9963 {
9964     rb_encoding *enc;
9965     char *start;
9966     long olen, roffset;
9967
9968     str_modify_keep_cr(str);
9969     enc = STR_ENC_GET(str);
9970     RSTRING_GETMEM(str, start, olen);
9971     roffset = rstrip_offset(str, start, start+olen, enc);
9972     if (roffset > 0) {
9973         long len = olen - roffset;
9974
9975         STR_SET_LEN(str, len);
9976         TERM_FILL(start+len, rb_enc_mbminlen(enc));
9977         return str;
9978     }
9979     return Qnil;
9980 }
9981
9982
9983 /*
9984  *  call-seq:
9985  *    rstrip -> new_string
9986  *
9987  *  Returns a copy of the receiver with trailing whitespace removed;
9988  *  see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9989  *
9990  *    whitespace = "\x00\t\n\v\f\r "
9991  *    s = whitespace + 'abc' + whitespace
9992  *    s        # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9993  *    s.rstrip # => "\u0000\t\n\v\f\r abc"
9994  *
9995  *  Related: String#lstrip, String#strip.
9996  */
9997
9998 static VALUE
9999 rb_str_rstrip(VALUE str)
10000 {
10001     rb_encoding *enc;
10002     char *start;
10003     long olen, roffset;
10004
10005     enc = STR_ENC_GET(str);
10006     RSTRING_GETMEM(str, start, olen);
10007     roffset = rstrip_offset(str, start, start+olen, enc);
10008
10009     if (roffset <= 0) return str_duplicate(rb_cString, str);
10010     return rb_str_subseq(str, 0, olen-roffset);
10011 }
10012
10013
10014 /*
10015  *  call-seq:
10016  *    strip! -> self or nil
10017  *
10018  *  Like String#strip, except that any modifications are made in +self+;
10019  *  returns +self+ if any modification are made, +nil+ otherwise.
10020  *
10021  *  Related: String#lstrip!, String#strip!.
10022  */
10023
10024 static VALUE
10025 rb_str_strip_bang(VALUE str)
10026 {
10027     char *start;
10028     long olen, loffset, roffset;
10029     rb_encoding *enc;
10030
10031     str_modify_keep_cr(str);
10032     enc = STR_ENC_GET(str);
10033     RSTRING_GETMEM(str, start, olen);
10034     loffset = lstrip_offset(str, start, start+olen, enc);
10035     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10036
10037     if (loffset > 0 || roffset > 0) {
10038         long len = olen-roffset;
10039         if (loffset > 0) {
10040             len -= loffset;
10041             memmove(start, start + loffset, len);
10042         }
10043         STR_SET_LEN(str, len);
10044         TERM_FILL(start+len, rb_enc_mbminlen(enc));
10045         return str;
10046     }
10047     return Qnil;
10048 }
10049
10050
10051 /*
10052  *  call-seq:
10053  *    strip -> new_string
10054  *
10055  *  Returns a copy of the receiver with leading and trailing whitespace removed;
10056  *  see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10057  *
10058  *    whitespace = "\x00\t\n\v\f\r "
10059  *    s = whitespace + 'abc' + whitespace
10060  *    s       # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10061  *    s.strip # => "abc"
10062  *
10063  *  Related: String#lstrip, String#rstrip.
10064  */
10065
10066 static VALUE
10067 rb_str_strip(VALUE str)
10068 {
10069     char *start;
10070     long olen, loffset, roffset;
10071     rb_encoding *enc = STR_ENC_GET(str);
10072
10073     RSTRING_GETMEM(str, start, olen);
10074     loffset = lstrip_offset(str, start, start+olen, enc);
10075     roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10076
10077     if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10078     return rb_str_subseq(str, loffset, olen-loffset-roffset);
10079 }
10080
10081 static VALUE
10082 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10083 {
10084     VALUE result = Qnil;
10085     long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10086     if (pos >= 0) {
10087         VALUE match;
10088         struct re_registers *regs;
10089         if (BUILTIN_TYPE(pat) == T_STRING) {
10090             regs = NULL;
10091             end = pos + RSTRING_LEN(pat);
10092         }
10093         else {
10094             match = rb_backref_get();
10095             regs = RMATCH_REGS(match);
10096             pos = BEG(0);
10097             end = END(0);
10098         }
10099
10100         if (pos == end) {
10101             rb_encoding *enc = STR_ENC_GET(str);
10102             /*
10103              * Always consume at least one character of the input string
10104              */
10105             if (RSTRING_LEN(str) > end)
10106                 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10107                                                   RSTRING_END(str), enc);
10108             else
10109                 *start = end + 1;
10110         }
10111         else {
10112             *start = end;
10113         }
10114
10115         if (!regs || regs->num_regs == 1) {
10116             result = rb_str_subseq(str, pos, end - pos);
10117             return result;
10118         }
10119         else {
10120             result = rb_ary_new2(regs->num_regs);
10121             for (int i = 1; i < regs->num_regs; i++) {
10122                 VALUE s = Qnil;
10123                 if (BEG(i) >= 0) {
10124                     s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10125                 }
10126
10127                 rb_ary_push(result, s);
10128             }
10129         }
10130
10131         RB_GC_GUARD(match);
10132     }
10133
10134     return result;
10135 }
10136
10137
10138 /*
10139  *  call-seq:
10140  *    scan(string_or_regexp) -> array
10141  *    scan(string_or_regexp) {|matches| ... } -> self
10142  *
10143  *  Matches a pattern against +self+; the pattern is:
10144  *
10145  *  - +string_or_regexp+ itself, if it is a Regexp.
10146  *  - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10147  *
10148  *  Iterates through +self+, generating a collection of matching results:
10149  *
10150  *  - If the pattern contains no groups, each result is the
10151  *    matched string, <code>$&</code>.
10152  *  - If the pattern contains groups, each result is an array
10153  *    containing one entry per group.
10154  *
10155  *  With no block given, returns an array of the results:
10156  *
10157  *    s = 'cruel world'
10158  *    s.scan(/\w+/)      # => ["cruel", "world"]
10159  *    s.scan(/.../)      # => ["cru", "el ", "wor"]
10160  *    s.scan(/(...)/)    # => [["cru"], ["el "], ["wor"]]
10161  *    s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10162  *
10163  *  With a block given, calls the block with each result; returns +self+:
10164  *
10165  *    s.scan(/\w+/) {|w| print "<<#{w}>> " }
10166  *    print "\n"
10167  *    s.scan(/(.)(.)/) {|x,y| print y, x }
10168  *    print "\n"
10169  *
10170  *  Output:
10171  *
10172  *     <<cruel>> <<world>>
10173  *     rceu lowlr
10174  *
10175  */
10176
10177 static VALUE
10178 rb_str_scan(VALUE str, VALUE pat)
10179 {
10180     VALUE result;
10181     long start = 0;
10182     long last = -1, prev = 0;
10183     char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10184
10185     pat = get_pat_quoted(pat, 1);
10186     mustnot_broken(str);
10187     if (!rb_block_given_p()) {
10188         VALUE ary = rb_ary_new();
10189
10190         while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10191             last = prev;
10192             prev = start;
10193             rb_ary_push(ary, result);
10194         }
10195         if (last >= 0) rb_pat_search(pat, str, last, 1);
10196         else rb_backref_set(Qnil);
10197         return ary;
10198     }
10199
10200     while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10201         last = prev;
10202         prev = start;
10203         rb_yield(result);
10204         str_mod_check(str, p, len);
10205     }
10206     if (last >= 0) rb_pat_search(pat, str, last, 1);
10207     return str;
10208 }
10209
10210
10211 /*
10212  *  call-seq:
10213  *    hex -> integer
10214  *
10215  *  Interprets the leading substring of +self+ as a string of hexadecimal digits
10216  *  (with an optional sign and an optional <code>0x</code>) and returns the
10217  *  corresponding number;
10218  *  returns zero if there is no such leading substring:
10219  *
10220  *    '0x0a'.hex        # => 10
10221  *    '-1234'.hex       # => -4660
10222  *    '0'.hex           # => 0
10223  *    'non-numeric'.hex # => 0
10224  *
10225  *  Related: String#oct.
10226  *
10227  */
10228
10229 static VALUE
10230 rb_str_hex(VALUE str)
10231 {
10232     return rb_str_to_inum(str, 16, FALSE);
10233 }
10234
10235
10236 /*
10237  *  call-seq:
10238  *    oct -> integer
10239  *
10240  *  Interprets the leading substring of +self+ as a string of octal digits
10241  *  (with an optional sign) and returns the corresponding number;
10242  *  returns zero if there is no such leading substring:
10243  *
10244  *    '123'.oct             # => 83
10245  *    '-377'.oct            # => -255
10246  *    '0377non-numeric'.oct # => 255
10247  *    'non-numeric'.oct     # => 0
10248  *
10249  *  If +self+ starts with <tt>0</tt>, radix indicators are honored;
10250  *  see Kernel#Integer.
10251  *
10252  *  Related: String#hex.
10253  *
10254  */
10255
10256 static VALUE
10257 rb_str_oct(VALUE str)
10258 {
10259     return rb_str_to_inum(str, -8, FALSE);
10260 }
10261
10262 #ifndef HAVE_CRYPT_R
10263 # include "ruby/thread_native.h"
10264 # include "ruby/atomic.h"
10265
10266 static struct {
10267     rb_nativethread_lock_t lock;
10268 } crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10269
10270 static void
10271 crypt_mutex_initialize(void)
10272 {
10273 }
10274 #endif
10275
10276 /*
10277  *  call-seq:
10278  *    crypt(salt_str) -> new_string
10279  *
10280  *  Returns the string generated by calling <code>crypt(3)</code>
10281  *  standard library function with <code>str</code> and
10282  *  <code>salt_str</code>, in this order, as its arguments.  Please do
10283  *  not use this method any longer.  It is legacy; provided only for
10284  *  backward compatibility with ruby scripts in earlier days.  It is
10285  *  bad to use in contemporary programs for several reasons:
10286  *
10287  *  * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10288  *    run.  The generated string lacks data portability.
10289  *
10290  *  * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10291  *    (i.e. silently ends up in unexpected results).
10292  *
10293  *  * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10294  *    thread safe.
10295  *
10296  *  * So-called "traditional" usage of <code>crypt(3)</code> is very
10297  *    very very weak.  According to its manpage, Linux's traditional
10298  *    <code>crypt(3)</code> output has only 2**56 variations; too
10299  *    easy to brute force today.  And this is the default behaviour.
10300  *
10301  *  * In order to make things robust some OSes implement so-called
10302  *    "modular" usage. To go through, you have to do a complex
10303  *    build-up of the <code>salt_str</code> parameter, by hand.
10304  *    Failure in generation of a proper salt string tends not to
10305  *    yield any errors; typos in parameters are normally not
10306  *    detectable.
10307  *
10308  *    * For instance, in the following example, the second invocation
10309  *      of String#crypt is wrong; it has a typo in "round=" (lacks
10310  *      "s").  However the call does not fail and something unexpected
10311  *      is generated.
10312  *
10313  *         "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10314  *         "foo".crypt("$5$round=1000$salt$")  # Typo not detected
10315  *
10316  *  * Even in the "modular" mode, some hash functions are considered
10317  *    archaic and no longer recommended at all; for instance module
10318  *    <code>$1$</code> is officially abandoned by its author: see
10319  *    http://phk.freebsd.dk/sagas/md5crypt_eol/ .  For another
10320  *    instance module <code>$3$</code> is considered completely
10321  *    broken: see the manpage of FreeBSD.
10322  *
10323  *  * On some OS such as Mac OS, there is no modular mode. Yet, as
10324  *    written above, <code>crypt(3)</code> on Mac OS never fails.
10325  *    This means even if you build up a proper salt string it
10326  *    generates a traditional DES hash anyways, and there is no way
10327  *    for you to be aware of.
10328  *
10329  *        "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10330  *
10331  *  If for some reason you cannot migrate to other secure contemporary
10332  *  password hashing algorithms, install the string-crypt gem and
10333  *  <code>require 'string/crypt'</code> to continue using it.
10334  */
10335
10336 static VALUE
10337 rb_str_crypt(VALUE str, VALUE salt)
10338 {
10339 #ifdef HAVE_CRYPT_R
10340     VALUE databuf;
10341     struct crypt_data *data;
10342 #   define CRYPT_END() ALLOCV_END(databuf)
10343 #else
10344     extern char *crypt(const char *, const char *);
10345 #   define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10346 #endif
10347     VALUE result;
10348     const char *s, *saltp;
10349     char *res;
10350 #ifdef BROKEN_CRYPT
10351     char salt_8bit_clean[3];
10352 #endif
10353
10354     StringValue(salt);
10355     mustnot_wchar(str);
10356     mustnot_wchar(salt);
10357     s = StringValueCStr(str);
10358     saltp = RSTRING_PTR(salt);
10359     if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10360         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10361     }
10362
10363 #ifdef BROKEN_CRYPT
10364     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10365         salt_8bit_clean[0] = saltp[0] & 0x7f;
10366         salt_8bit_clean[1] = saltp[1] & 0x7f;
10367         salt_8bit_clean[2] = '\0';
10368         saltp = salt_8bit_clean;
10369     }
10370 #endif
10371 #ifdef HAVE_CRYPT_R
10372     data = ALLOCV(databuf, sizeof(struct crypt_data));
10373 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10374     data->initialized = 0;
10375 # endif
10376     res = crypt_r(s, saltp, data);
10377 #else
10378     crypt_mutex_initialize();
10379     rb_nativethread_lock_lock(&crypt_mutex.lock);
10380     res = crypt(s, saltp);
10381 #endif
10382     if (!res) {
10383         int err = errno;
10384         CRYPT_END();
10385         rb_syserr_fail(err, "crypt");
10386     }
10387     result = rb_str_new_cstr(res);
10388     CRYPT_END();
10389     return result;
10390 }
10391
10392
10393 /*
10394  *  call-seq:
10395  *    ord -> integer
10396  *
10397  *  :include: doc/string/ord.rdoc
10398  *
10399  */
10400
10401 static VALUE
10402 rb_str_ord(VALUE s)
10403 {
10404     unsigned int c;
10405
10406     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10407     return UINT2NUM(c);
10408 }
10409 /*
10410  *  call-seq:
10411  *    sum(n = 16) -> integer
10412  *
10413  *  :include: doc/string/sum.rdoc
10414  *
10415  */
10416
10417 static VALUE
10418 rb_str_sum(int argc, VALUE *argv, VALUE str)
10419 {
10420     int bits = 16;
10421     char *ptr, *p, *pend;
10422     long len;
10423     VALUE sum = INT2FIX(0);
10424     unsigned long sum0 = 0;
10425
10426     if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10427         bits = 0;
10428     }
10429     ptr = p = RSTRING_PTR(str);
10430     len = RSTRING_LEN(str);
10431     pend = p + len;
10432
10433     while (p < pend) {
10434         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10435             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10436             str_mod_check(str, ptr, len);
10437             sum0 = 0;
10438         }
10439         sum0 += (unsigned char)*p;
10440         p++;
10441     }
10442
10443     if (bits == 0) {
10444         if (sum0) {
10445             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10446         }
10447     }
10448     else {
10449         if (sum == INT2FIX(0)) {
10450             if (bits < (int)sizeof(long)*CHAR_BIT) {
10451                 sum0 &= (((unsigned long)1)<<bits)-1;
10452             }
10453             sum = LONG2FIX(sum0);
10454         }
10455         else {
10456             VALUE mod;
10457
10458             if (sum0) {
10459                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10460             }
10461
10462             mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10463             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10464             sum = rb_funcall(sum, '&', 1, mod);
10465         }
10466     }
10467     return sum;
10468 }
10469
10470 static VALUE
10471 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10472 {
10473     rb_encoding *enc;
10474     VALUE w;
10475     long width, len, flen = 1, fclen = 1;
10476     VALUE res;
10477     char *p;
10478     const char *f = " ";
10479     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10480     VALUE pad;
10481     int singlebyte = 1, cr;
10482     int termlen;
10483
10484     rb_scan_args(argc, argv, "11", &w, &pad);
10485     enc = STR_ENC_GET(str);
10486     termlen = rb_enc_mbminlen(enc);
10487     width = NUM2LONG(w);
10488     if (argc == 2) {
10489         StringValue(pad);
10490         enc = rb_enc_check(str, pad);
10491         f = RSTRING_PTR(pad);
10492         flen = RSTRING_LEN(pad);
10493         fclen = str_strlen(pad, enc); /* rb_enc_check */
10494         singlebyte = single_byte_optimizable(pad);
10495         if (flen == 0 || fclen == 0) {
10496             rb_raise(rb_eArgError, "zero width padding");
10497         }
10498     }
10499     len = str_strlen(str, enc); /* rb_enc_check */
10500     if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10501     n = width - len;
10502     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10503     rlen = n - llen;
10504     cr = ENC_CODERANGE(str);
10505     if (flen > 1) {
10506        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10507        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10508     }
10509     size = RSTRING_LEN(str);
10510     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10511        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10512        (len += llen2 + rlen2) >= LONG_MAX - size) {
10513        rb_raise(rb_eArgError, "argument too big");
10514     }
10515     len += size;
10516     res = str_new0(rb_cString, 0, len, termlen);
10517     p = RSTRING_PTR(res);
10518     if (flen <= 1) {
10519        memset(p, *f, llen);
10520        p += llen;
10521     }
10522     else {
10523        while (llen >= fclen) {
10524             memcpy(p,f,flen);
10525             p += flen;
10526             llen -= fclen;
10527         }
10528        if (llen > 0) {
10529            memcpy(p, f, llen2);
10530            p += llen2;
10531         }
10532     }
10533     memcpy(p, RSTRING_PTR(str), size);
10534     p += size;
10535     if (flen <= 1) {
10536        memset(p, *f, rlen);
10537        p += rlen;
10538     }
10539     else {
10540        while (rlen >= fclen) {
10541             memcpy(p,f,flen);
10542             p += flen;
10543             rlen -= fclen;
10544         }
10545        if (rlen > 0) {
10546            memcpy(p, f, rlen2);
10547            p += rlen2;
10548         }
10549     }
10550     TERM_FILL(p, termlen);
10551     STR_SET_LEN(res, p-RSTRING_PTR(res));
10552     rb_enc_associate(res, enc);
10553     if (argc == 2)
10554         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10555     if (cr != ENC_CODERANGE_BROKEN)
10556         ENC_CODERANGE_SET(res, cr);
10557
10558     RB_GC_GUARD(pad);
10559     return res;
10560 }
10561
10562
10563 /*
10564  *  call-seq:
10565  *    ljust(size, pad_string = ' ') -> new_string
10566  *
10567  *  :include: doc/string/ljust.rdoc
10568  *
10569  *  Related: String#rjust, String#center.
10570  *
10571  */
10572
10573 static VALUE
10574 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10575 {
10576     return rb_str_justify(argc, argv, str, 'l');
10577 }
10578
10579 /*
10580  *  call-seq:
10581  *    rjust(size, pad_string = ' ') -> new_string
10582  *
10583  *  :include: doc/string/rjust.rdoc
10584  *
10585  *  Related: String#ljust, String#center.
10586  *
10587  */
10588
10589 static VALUE
10590 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10591 {
10592     return rb_str_justify(argc, argv, str, 'r');
10593 }
10594
10595
10596 /*
10597  *  call-seq:
10598  *    center(size, pad_string = ' ') -> new_string
10599  *
10600  *  :include: doc/string/center.rdoc
10601  *
10602  *  Related: String#ljust, String#rjust.
10603  *
10604  */
10605
10606 static VALUE
10607 rb_str_center(int argc, VALUE *argv, VALUE str)
10608 {
10609     return rb_str_justify(argc, argv, str, 'c');
10610 }
10611
10612 /*
10613  *  call-seq:
10614  *    partition(string_or_regexp) -> [head, match, tail]
10615  *
10616  *  :include: doc/string/partition.rdoc
10617  *
10618  */
10619
10620 static VALUE
10621 rb_str_partition(VALUE str, VALUE sep)
10622 {
10623     long pos;
10624
10625     sep = get_pat_quoted(sep, 0);
10626     if (RB_TYPE_P(sep, T_REGEXP)) {
10627         if (rb_reg_search(sep, str, 0, 0) < 0) {
10628             goto failed;
10629         }
10630         VALUE match = rb_backref_get();
10631         struct re_registers *regs = RMATCH_REGS(match);
10632
10633         pos = BEG(0);
10634         sep = rb_str_subseq(str, pos, END(0) - pos);
10635     }
10636     else {
10637         pos = rb_str_index(str, sep, 0);
10638         if (pos < 0) goto failed;
10639     }
10640     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10641                           sep,
10642                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
10643                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10644
10645   failed:
10646     return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10647 }
10648
10649 /*
10650  *  call-seq:
10651  *    rpartition(sep) -> [head, match, tail]
10652  *
10653  *  :include: doc/string/rpartition.rdoc
10654  *
10655  */
10656
10657 static VALUE
10658 rb_str_rpartition(VALUE str, VALUE sep)
10659 {
10660     long pos = RSTRING_LEN(str);
10661
10662     sep = get_pat_quoted(sep, 0);
10663     if (RB_TYPE_P(sep, T_REGEXP)) {
10664         if (rb_reg_search(sep, str, pos, 1) < 0) {
10665             goto failed;
10666         }
10667         VALUE match = rb_backref_get();
10668         struct re_registers *regs = RMATCH_REGS(match);
10669
10670         pos = BEG(0);
10671         sep = rb_str_subseq(str, pos, END(0) - pos);
10672     }
10673     else {
10674         pos = rb_str_sublen(str, pos);
10675         pos = rb_str_rindex(str, sep, pos);
10676         if (pos < 0) {
10677             goto failed;
10678         }
10679     }
10680
10681     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10682                           sep,
10683                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
10684                                         RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10685   failed:
10686     return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10687 }
10688
10689 /*
10690  *  call-seq:
10691  *    start_with?(*string_or_regexp) -> true or false
10692  *
10693  *  :include: doc/string/start_with_p.rdoc
10694  *
10695  */
10696
10697 static VALUE
10698 rb_str_start_with(int argc, VALUE *argv, VALUE str)
10699 {
10700     int i;
10701
10702     for (i=0; i<argc; i++) {
10703         VALUE tmp = argv[i];
10704         if (RB_TYPE_P(tmp, T_REGEXP)) {
10705             if (rb_reg_start_with_p(tmp, str))
10706                 return Qtrue;
10707         }
10708         else {
10709             const char *p, *s, *e;
10710             long slen, tlen;
10711             rb_encoding *enc;
10712
10713             StringValue(tmp);
10714             enc = rb_enc_check(str, tmp);
10715             if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10716             if ((slen = RSTRING_LEN(str)) < tlen) continue;
10717             p = RSTRING_PTR(str);
10718             e = p + slen;
10719             s = p + tlen;
10720             if (!at_char_right_boundary(p, s, e, enc))
10721                 continue;
10722             if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
10723                 return Qtrue;
10724         }
10725     }
10726     return Qfalse;
10727 }
10728
10729 /*
10730  *  call-seq:
10731  *    end_with?(*strings) -> true or false
10732  *
10733  *  :include: doc/string/end_with_p.rdoc
10734  *
10735  */
10736
10737 static VALUE
10738 rb_str_end_with(int argc, VALUE *argv, VALUE str)
10739 {
10740     int i;
10741
10742     for (i=0; i<argc; i++) {
10743         VALUE tmp = argv[i];
10744         const char *p, *s, *e;
10745         long slen, tlen;
10746         rb_encoding *enc;
10747
10748         StringValue(tmp);
10749         enc = rb_enc_check(str, tmp);
10750         if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10751         if ((slen = RSTRING_LEN(str)) < tlen) continue;
10752         p = RSTRING_PTR(str);
10753         e = p + slen;
10754         s = e - tlen;
10755         if (!at_char_boundary(p, s, e, enc))
10756             continue;
10757         if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
10758             return Qtrue;
10759     }
10760     return Qfalse;
10761 }
10762
10763 /*!
10764  * Returns the length of the <i>prefix</i> to be deleted in the given <i>str</i>,
10765  * returning 0 if <i>str</i> does not start with the <i>prefix</i>.
10766  *
10767  * @param str the target
10768  * @param prefix the prefix
10769  * @retval 0 if the given <i>str</i> does not start with the given <i>prefix</i>
10770  * @retval Positive-Integer otherwise
10771  */
10772 static long
10773 deleted_prefix_length(VALUE str, VALUE prefix)
10774 {
10775     const char *strptr, *prefixptr;
10776     long olen, prefixlen;
10777     rb_encoding *enc = rb_enc_get(str);
10778
10779     StringValue(prefix);
10780
10781     if (!is_broken_string(prefix) ||
10782         !rb_enc_asciicompat(enc) ||
10783         !rb_enc_asciicompat(rb_enc_get(prefix))) {
10784         enc = rb_enc_check(str, prefix);
10785     }
10786
10787     /* return 0 if not start with prefix */
10788     prefixlen = RSTRING_LEN(prefix);
10789     if (prefixlen <= 0) return 0;
10790     olen = RSTRING_LEN(str);
10791     if (olen < prefixlen) return 0;
10792     strptr = RSTRING_PTR(str);
10793     prefixptr = RSTRING_PTR(prefix);
10794     if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10795     if (is_broken_string(prefix)) {
10796         if (!is_broken_string(str)) {
10797             /* prefix in a valid string cannot be broken */
10798             return 0;
10799         }
10800         const char *strend = strptr + olen;
10801         const char *after_prefix = strptr + prefixlen;
10802         if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
10803             /* prefix does not end at char-boundary */
10804             return 0;
10805         }
10806     }
10807     /* prefix part in `str` also should be valid. */
10808
10809     return prefixlen;
10810 }
10811
10812 /*
10813  *  call-seq:
10814  *    delete_prefix!(prefix) -> self or nil
10815  *
10816  *  Like String#delete_prefix, except that +self+ is modified in place.
10817  *  Returns +self+ if the prefix is removed, +nil+ otherwise.
10818  *
10819  */
10820
10821 static VALUE
10822 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10823 {
10824     long prefixlen;
10825     str_modify_keep_cr(str);
10826
10827     prefixlen = deleted_prefix_length(str, prefix);
10828     if (prefixlen <= 0) return Qnil;
10829
10830     return rb_str_drop_bytes(str, prefixlen);
10831 }
10832
10833 /*
10834  *  call-seq:
10835  *    delete_prefix(prefix) -> new_string
10836  *
10837  *  :include: doc/string/delete_prefix.rdoc
10838  *
10839  */
10840
10841 static VALUE
10842 rb_str_delete_prefix(VALUE str, VALUE prefix)
10843 {
10844     long prefixlen;
10845
10846     prefixlen = deleted_prefix_length(str, prefix);
10847     if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10848
10849     return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10850 }
10851
10852 /*!
10853  * Returns the length of the <i>suffix</i> to be deleted in the given <i>str</i>,
10854  * returning 0 if <i>str</i> does not end with the <i>suffix</i>.
10855  *
10856  * @param str the target
10857  * @param suffix the suffix
10858  * @retval 0 if the given <i>str</i> does not end with the given <i>suffix</i>
10859  * @retval Positive-Integer otherwise
10860  */
10861 static long
10862 deleted_suffix_length(VALUE str, VALUE suffix)
10863 {
10864     const char *strptr, *suffixptr;
10865     long olen, suffixlen;
10866     rb_encoding *enc;
10867
10868     StringValue(suffix);
10869     if (is_broken_string(suffix)) return 0;
10870     enc = rb_enc_check(str, suffix);
10871
10872     /* return 0 if not start with suffix */
10873     suffixlen = RSTRING_LEN(suffix);
10874     if (suffixlen <= 0) return 0;
10875     olen = RSTRING_LEN(str);
10876     if (olen < suffixlen) return 0;
10877     strptr = RSTRING_PTR(str);
10878     suffixptr = RSTRING_PTR(suffix);
10879     const char *strend = strptr + olen;
10880     const char *before_suffix = strend - suffixlen;
10881     if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
10882     if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
10883
10884     return suffixlen;
10885 }
10886
10887 /*
10888  *  call-seq:
10889  *    delete_suffix!(suffix) -> self or nil
10890  *
10891  *  Like String#delete_suffix, except that +self+ is modified in place.
10892  *  Returns +self+ if the suffix is removed, +nil+ otherwise.
10893  *
10894  */
10895
10896 static VALUE
10897 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10898 {
10899     long olen, suffixlen, len;
10900     str_modifiable(str);
10901
10902     suffixlen = deleted_suffix_length(str, suffix);
10903     if (suffixlen <= 0) return Qnil;
10904
10905     olen = RSTRING_LEN(str);
10906     str_modify_keep_cr(str);
10907     len = olen - suffixlen;
10908     STR_SET_LEN(str, len);
10909     TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10910     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10911         ENC_CODERANGE_CLEAR(str);
10912     }
10913     return str;
10914 }
10915
10916 /*
10917  *  call-seq:
10918  *    delete_suffix(suffix) -> new_string
10919  *
10920  *  :include: doc/string/delete_suffix.rdoc
10921  *
10922  */
10923
10924 static VALUE
10925 rb_str_delete_suffix(VALUE str, VALUE suffix)
10926 {
10927     long suffixlen;
10928
10929     suffixlen = deleted_suffix_length(str, suffix);
10930     if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10931
10932     return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10933 }
10934
10935 void
10936 rb_str_setter(VALUE val, ID id, VALUE *var)
10937 {
10938     if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10939         rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10940     }
10941     *var = val;
10942 }
10943
10944 static void
10945 rb_fs_setter(VALUE val, ID id, VALUE *var)
10946 {
10947     val = rb_fs_check(val);
10948     if (!val) {
10949         rb_raise(rb_eTypeError,
10950                  "value of %"PRIsVALUE" must be String or Regexp",
10951                  rb_id2str(id));
10952     }
10953     if (!NIL_P(val)) {
10954         rb_warn_deprecated("'$;'", NULL);
10955     }
10956     *var = val;
10957 }
10958
10959
10960 /*
10961  *  call-seq:
10962  *    force_encoding(encoding) -> self
10963  *
10964  *  :include: doc/string/force_encoding.rdoc
10965  *
10966  */
10967
10968 static VALUE
10969 rb_str_force_encoding(VALUE str, VALUE enc)
10970 {
10971     str_modifiable(str);
10972
10973     rb_encoding *encoding = rb_to_encoding(enc);
10974     int idx = rb_enc_to_index(encoding);
10975
10976     // If the encoding is unchanged, we do nothing.
10977     if (ENCODING_GET(str) == idx) {
10978         return str;
10979     }
10980
10981     rb_enc_associate_index(str, idx);
10982
10983     // If the coderange was 7bit and the new encoding is ASCII-compatible
10984     // we can keep the coderange.
10985     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
10986         return str;
10987     }
10988
10989     ENC_CODERANGE_CLEAR(str);
10990     return str;
10991 }
10992
10993 /*
10994  *  call-seq:
10995  *    b -> string
10996  *
10997  *  :include: doc/string/b.rdoc
10998  *
10999  */
11000
11001 static VALUE
11002 rb_str_b(VALUE str)
11003 {
11004     VALUE str2;
11005     if (STR_EMBED_P(str)) {
11006         str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11007     }
11008     else {
11009         str2 = str_alloc_heap(rb_cString);
11010     }
11011     str_replace_shared_without_enc(str2, str);
11012
11013     if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11014         // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11015         // If we know the receiver's code range then we know the result's code range.
11016         int cr = ENC_CODERANGE(str);
11017         switch (cr) {
11018           case ENC_CODERANGE_7BIT:
11019             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
11020             break;
11021           case ENC_CODERANGE_BROKEN:
11022           case ENC_CODERANGE_VALID:
11023             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
11024             break;
11025           default:
11026             ENC_CODERANGE_CLEAR(str2);
11027             break;
11028         }
11029     }
11030
11031     return str2;
11032 }
11033
11034 /*
11035  *  call-seq:
11036  *    valid_encoding? -> true or false
11037  *
11038  *  Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11039  *
11040  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
11041  *    "\xc2".force_encoding("UTF-8").valid_encoding?     # => false
11042  *    "\x80".force_encoding("UTF-8").valid_encoding?     # => false
11043  */
11044
11045 static VALUE
11046 rb_str_valid_encoding_p(VALUE str)
11047 {
11048     int cr = rb_enc_str_coderange(str);
11049
11050     return RBOOL(cr != ENC_CODERANGE_BROKEN);
11051 }
11052
11053 /*
11054  *  call-seq:
11055  *    ascii_only? -> true or false
11056  *
11057  *  Returns +true+ if +self+ contains only ASCII characters,
11058  *  +false+ otherwise:
11059  *
11060  *    'abc'.ascii_only?         # => true
11061  *    "abc\u{6666}".ascii_only? # => false
11062  *
11063  */
11064
11065 static VALUE
11066 rb_str_is_ascii_only_p(VALUE str)
11067 {
11068     int cr = rb_enc_str_coderange(str);
11069
11070     return RBOOL(cr == ENC_CODERANGE_7BIT);
11071 }
11072
11073 VALUE
11074 rb_str_ellipsize(VALUE str, long len)
11075 {
11076     static const char ellipsis[] = "...";
11077     const long ellipsislen = sizeof(ellipsis) - 1;
11078     rb_encoding *const enc = rb_enc_get(str);
11079     const long blen = RSTRING_LEN(str);
11080     const char *const p = RSTRING_PTR(str), *e = p + blen;
11081     VALUE estr, ret = 0;
11082
11083     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11084     if (len * rb_enc_mbminlen(enc) >= blen ||
11085         (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11086         ret = str;
11087     }
11088     else if (len <= ellipsislen ||
11089              !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11090         if (rb_enc_asciicompat(enc)) {
11091             ret = rb_str_new(ellipsis, len);
11092             rb_enc_associate(ret, enc);
11093         }
11094         else {
11095             estr = rb_usascii_str_new(ellipsis, len);
11096             ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11097         }
11098     }
11099     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11100         rb_str_cat(ret, ellipsis, ellipsislen);
11101     }
11102     else {
11103         estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11104                              rb_enc_from_encoding(enc), 0, Qnil);
11105         rb_str_append(ret, estr);
11106     }
11107     return ret;
11108 }
11109
11110 static VALUE
11111 str_compat_and_valid(VALUE str, rb_encoding *enc)
11112 {
11113     int cr;
11114     str = StringValue(str);
11115     cr = rb_enc_str_coderange(str);
11116     if (cr == ENC_CODERANGE_BROKEN) {
11117         rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11118     }
11119     else {
11120         rb_encoding *e = STR_ENC_GET(str);
11121         if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11122             rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11123                      rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11124         }
11125     }
11126     return str;
11127 }
11128
11129 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11130
11131 VALUE
11132 rb_str_scrub(VALUE str, VALUE repl)
11133 {
11134     rb_encoding *enc = STR_ENC_GET(str);
11135     return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11136 }
11137
11138 VALUE
11139 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11140 {
11141     int cr = ENC_CODERANGE_UNKNOWN;
11142     if (enc == STR_ENC_GET(str)) {
11143         /* cached coderange makes sense only when enc equals the
11144          * actual encoding of str */
11145         cr = ENC_CODERANGE(str);
11146     }
11147     return enc_str_scrub(enc, str, repl, cr);
11148 }
11149
/*
 * Shared implementation behind rb_str_scrub() and rb_enc_str_scrub().
 *
 * Scans the bytes of +str+ interpreted in +enc+ and builds a new string in
 * which every invalid byte sequence is replaced.  The replacement is, in
 * order of precedence: the result of the given block (called with the
 * offending bytes), the string +repl+, or a built-in default chosen by
 * encoding.
 *
 * +cr+ is the code range the caller believes +str+ has.
 *
 * Returns the new string, or Qnil when no new string was built: +cr+ is
 * already clean, +enc+ is a dummy encoding, or the scan found nothing
 * invalid (in which case the computed code range is stored on +str+).
 * Raises ArgumentError when both a block and +repl+ are given.
 */
static VALUE
enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
{
    int encidx;
    VALUE buf = Qnil;
    /* rep:  replacement bytes (NULL in block mode)
     * p:    scan cursor, e: end of str
     * p1:   start of the pending run of good bytes not yet copied to buf
     * sp:   original start pointer, passed to str_mod_check() after yields */
    const char *rep, *p, *e, *p1, *sp;
    /* -1: not decided yet; 0: block mode; otherwise length of rep */
    long replen = -1;
    long slen;

    if (rb_block_given_p()) {
        if (!NIL_P(repl))
            rb_raise(rb_eArgError, "both of block and replacement given");
        replen = 0;
    }

    /* The supplied code range already says there is nothing to replace. */
    if (ENC_CODERANGE_CLEAN_P(cr))
        return Qnil;

    if (!NIL_P(repl)) {
        repl = str_compat_and_valid(repl, enc);
    }

    if (rb_enc_dummy_p(enc)) {
        return Qnil;
    }
    encidx = rb_enc_to_index(enc);

/* Points rep/replen at a static byte array holding the literal without
 * its terminating NUL. */
#define DEFAULT_REPLACE_CHAR(str) do { \
        static const char replace[sizeof(str)-1] = str; \
        rep = replace; replen = (int)sizeof(replace); \
    } while (0)

    slen = RSTRING_LEN(str);
    p = RSTRING_PTR(str);
    e = RSTRING_END(str);
    p1 = p;
    sp = p;

    if (rb_enc_asciicompat(enc)) {
        /* Whether inserting the replacement keeps the result 7-bit. */
        int rep7bit_p;
        if (!replen) {
            /* block mode: replacement is obtained per invalid sequence */
            rep = NULL;
            rep7bit_p = FALSE;
        }
        else if (!NIL_P(repl)) {
            rep = RSTRING_PTR(repl);
            replen = RSTRING_LEN(repl);
            rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
        }
        else if (encidx == rb_utf8_encindex()) {
            DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
            rep7bit_p = FALSE;
        }
        else {
            DEFAULT_REPLACE_CHAR("?");
            rep7bit_p = TRUE;
        }
        /* Start from 7BIT and upgrade to VALID as soon as a multibyte
         * character or a non-7-bit replacement ends up in the result. */
        cr = ENC_CODERANGE_7BIT;

        /* Jump over the leading ASCII-only run; a NULL result is treated
         * as "no non-ASCII byte left". */
        p = search_nonascii(p, e);
        if (!p) {
            p = e;
        }
        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_NEEDMORE_P(ret)) {
                /* truncated character at the end; handled after the loop */
                break;
            }
            else if (MBCLEN_CHARFOUND_P(ret)) {
                cr = ENC_CODERANGE_VALID;
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else if (MBCLEN_INVALID_P(ret)) {
                /*
                 * p1~p: valid ascii/multibyte chars
                 * p ~e: invalid bytes + unknown bytes
                 */
                long clen = rb_enc_mbmaxlen(enc);
                if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
                if (p > p1) {
                    rb_str_buf_cat(buf, p1, p - p1);
                }

                /* Decide how many bytes (clen) form this one invalid unit:
                 * at most mbmaxlen and at most what is left.  With two or
                 * fewer candidate bytes a single byte is consumed;
                 * otherwise the candidate is shortened until it is an
                 * incomplete-character prefix, bottoming out at 1. */
                if (e - p < clen) clen = e - p;
                if (clen <= 2) {
                    clen = 1;
                }
                else {
                    const char *q = p;
                    clen--;
                    for (; clen > 1; clen--) {
                        ret = rb_enc_precise_mbclen(q, q + clen, enc);
                        if (MBCLEN_NEEDMORE_P(ret)) break;
                        if (MBCLEN_INVALID_P(ret)) continue;
                        UNREACHABLE;
                    }
                }
                if (rep) {
                    rb_str_buf_cat(buf, rep, replen);
                    if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
                }
                else {
                    repl = rb_yield(rb_enc_str_new(p, clen, enc));
                    /* NOTE(review): presumably raises if the block changed
                     * str's buffer or length -- str_mod_check() is defined
                     * outside this view. */
                    str_mod_check(str, sp, slen);
                    repl = str_compat_and_valid(repl, enc);
                    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                    if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
                        cr = ENC_CODERANGE_VALID;
                }
                p += clen;
                p1 = p;
                /* Skip the following ASCII run before resuming the scan. */
                p = search_nonascii(p, e);
                if (!p) {
                    p = e;
                    break;
                }
            }
            else {
                UNREACHABLE;
            }
        }
        if (NIL_P(buf)) {
            if (p == e) {
                /* Nothing invalid was found: remember the computed code
                 * range on str and report that no copy was made. */
                ENC_CODERANGE_SET(str, cr);
                return Qnil;
            }
            buf = rb_str_buf_new(RSTRING_LEN(str));
        }
        /* Flush the trailing run of good bytes. */
        if (p1 < p) {
            rb_str_buf_cat(buf, p1, p - p1);
        }
        /* The loop stopped on an incomplete trailing character: replace
         * the remaining bytes as a single unit. */
        if (p < e) {
            if (rep) {
                rb_str_buf_cat(buf, rep, replen);
                if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
            }
            else {
                repl = rb_yield(rb_enc_str_new(p, e-p, enc));
                str_mod_check(str, sp, slen);
                repl = str_compat_and_valid(repl, enc);
                rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
                    cr = ENC_CODERANGE_VALID;
            }
        }
    }
    else {
        /* ASCII incompatible */
        long mbminlen = rb_enc_mbminlen(enc);
        if (!replen) {
            /* block mode */
            rep = NULL;
        }
        else if (!NIL_P(repl)) {
            rep = RSTRING_PTR(repl);
            replen = RSTRING_LEN(repl);
        }
        /* Default replacements: the bytes FF FD / 00 00 FF FD in the byte
         * order of the respective UTF-16/UTF-32 encoding, "?" otherwise. */
        else if (encidx == ENCINDEX_UTF_16BE) {
            DEFAULT_REPLACE_CHAR("\xFF\xFD");
        }
        else if (encidx == ENCINDEX_UTF_16LE) {
            DEFAULT_REPLACE_CHAR("\xFD\xFF");
        }
        else if (encidx == ENCINDEX_UTF_32BE) {
            DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
        }
        else if (encidx == ENCINDEX_UTF_32LE) {
            DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
        }
        else {
            DEFAULT_REPLACE_CHAR("?");
        }

        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_NEEDMORE_P(ret)) {
                /* truncated character at the end; handled after the loop */
                break;
            }
            else if (MBCLEN_CHARFOUND_P(ret)) {
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else if (MBCLEN_INVALID_P(ret)) {
                const char *q = p;
                long clen = rb_enc_mbmaxlen(enc);
                if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
                if (p > p1) rb_str_buf_cat(buf, p1, p - p1);

                /* Same sizing of the invalid unit as in the ASCII
                 * compatible branch, but stepping in units of mbminlen
                 * bytes instead of single bytes. */
                if (e - p < clen) clen = e - p;
                if (clen <= mbminlen * 2) {
                    clen = mbminlen;
                }
                else {
                    clen -= mbminlen;
                    for (; clen > mbminlen; clen-=mbminlen) {
                        ret = rb_enc_precise_mbclen(q, q + clen, enc);
                        if (MBCLEN_NEEDMORE_P(ret)) break;
                        if (MBCLEN_INVALID_P(ret)) continue;
                        UNREACHABLE;
                    }
                }
                if (rep) {
                    rb_str_buf_cat(buf, rep, replen);
                }
                else {
                    repl = rb_yield(rb_enc_str_new(p, clen, enc));
                    str_mod_check(str, sp, slen);
                    repl = str_compat_and_valid(repl, enc);
                    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                }
                p += clen;
                p1 = p;
            }
            else {
                UNREACHABLE;
            }
        }
        if (NIL_P(buf)) {
            if (p == e) {
                /* Nothing invalid was found: mark str VALID and report
                 * that no copy was made. */
                ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
                return Qnil;
            }
            buf = rb_str_buf_new(RSTRING_LEN(str));
        }
        /* Flush the trailing run of good bytes. */
        if (p1 < p) {
            rb_str_buf_cat(buf, p1, p - p1);
        }
        /* Replace an incomplete trailing character as a single unit. */
        if (p < e) {
            if (rep) {
                rb_str_buf_cat(buf, rep, replen);
            }
            else {
                repl = rb_yield(rb_enc_str_new(p, e-p, enc));
                str_mod_check(str, sp, slen);
                repl = str_compat_and_valid(repl, enc);
                rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
            }
        }
        cr = ENC_CODERANGE_VALID;
    }
    /* Tag the result with the target encoding and the code range
     * established while building it. */
    ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
    return buf;
}
11391
11392 /*
11393  *  call-seq:
11394  *    scrub(replacement_string = default_replacement) -> new_string
11395  *    scrub{|bytes| ... } -> new_string
11396  *
11397  *  :include: doc/string/scrub.rdoc
11398  *
11399  */
11400 static VALUE
11401 str_scrub(int argc, VALUE *argv, VALUE str)
11402 {
11403     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11404     VALUE new = rb_str_scrub(str, repl);
11405     return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11406 }
11407
11408 /*
11409  *  call-seq:
11410  *    scrub! -> self
11411  *    scrub!(replacement_string = default_replacement) -> self
11412  *    scrub!{|bytes| ... } -> self
11413  *
11414  *  Like String#scrub, except that any replacements are made in +self+.
11415  *
11416  */
11417 static VALUE
11418 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11419 {
11420     VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11421     VALUE new = rb_str_scrub(str, repl);
11422     if (!NIL_P(new)) rb_str_replace(str, new);
11423     return str;
11424 }
11425
11426 static ID id_normalize;
11427 static ID id_normalized_p;
11428 static VALUE mUnicodeNormalize;
11429
11430 static VALUE
11431 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11432 {
11433     static int UnicodeNormalizeRequired = 0;
11434     VALUE argv2[2];
11435
11436     if (!UnicodeNormalizeRequired) {
11437         rb_require("unicode_normalize/normalize.rb");
11438         UnicodeNormalizeRequired = 1;
11439     }
11440     argv2[0] = str;
11441     if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11442     return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11443 }
11444
11445 /*
11446  *  call-seq:
11447  *    unicode_normalize(form = :nfc) -> string
11448  *
11449  *  Returns a copy of +self+ with
11450  *  {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11451  *
11452  *  Argument +form+ must be one of the following symbols
11453  *  (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11454  *
11455  *  - +:nfc+: Canonical decomposition, followed by canonical composition.
11456  *  - +:nfd+: Canonical decomposition.
11457  *  - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11458  *  - +:nfkd+: Compatibility decomposition.
11459  *
11460  *  The encoding of +self+ must be one of:
11461  *
11462  *  - Encoding::UTF_8
11463  *  - Encoding::UTF_16BE
11464  *  - Encoding::UTF_16LE
11465  *  - Encoding::UTF_32BE
11466  *  - Encoding::UTF_32LE
11467  *  - Encoding::GB18030
11468  *  - Encoding::UCS_2BE
11469  *  - Encoding::UCS_4BE
11470  *
11471  *  Examples:
11472  *
 *    "a\u0300".unicode_normalize      # => "à" (composed: U+00E0)
 *    "\u00E0".unicode_normalize(:nfd) # => "à" (decomposed: U+0061 U+0300)
11475  *
11476  *  Related: String#unicode_normalize!, String#unicode_normalized?.
11477  */
11478 static VALUE
11479 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11480 {
11481     return unicode_normalize_common(argc, argv, str, id_normalize);
11482 }
11483
11484 /*
11485  *  call-seq:
11486  *    unicode_normalize!(form = :nfc) -> self
11487  *
11488  *  Like String#unicode_normalize, except that the normalization
11489  *  is performed on +self+.
11490  *
 *  Related: String#unicode_normalized?.
11492  *
11493  */
11494 static VALUE
11495 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11496 {
11497     return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11498 }
11499
11500 /*  call-seq:
11501  *   unicode_normalized?(form = :nfc) -> true or false
11502  *
11503  *  Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11504  *  +false+ otherwise.
11505  *  The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11506  *
11507  *  Examples:
11508  *
11509  *    "a\u0300".unicode_normalized?       # => false
11510  *    "a\u0300".unicode_normalized?(:nfd) # => true
11511  *    "\u00E0".unicode_normalized?        # => true
11512  *    "\u00E0".unicode_normalized?(:nfd)  # => false
11513  *
11514  *
11515  *  Raises an exception if +self+ is not in a Unicode encoding:
11516  *
11517  *    s = "\xE0".force_encoding('ISO-8859-1')
11518  *    s.unicode_normalized? # Raises Encoding::CompatibilityError.
11519  *
11520  *  Related: String#unicode_normalize, String#unicode_normalize!.
11521  *
11522  */
11523 static VALUE
11524 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11525 {
11526     return unicode_normalize_common(argc, argv, str, id_normalized_p);
11527 }
11528
11529 /**********************************************************************
11530  * Document-class: Symbol
11531  *
11532  * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11533  *
11534  * You can create a +Symbol+ object explicitly with:
11535  *
11536  * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11537  *
11538  * The same +Symbol+ object will be
11539  * created for a given name or string for the duration of a program's
11540  * execution, regardless of the context or meaning of that name. Thus
11541  * if <code>Fred</code> is a constant in one context, a method in
11542  * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11543  * will be the same object in all three contexts.
11544  *
11545  *     module One
11546  *       class Fred
11547  *       end
11548  *       $f1 = :Fred
11549  *     end
11550  *     module Two
11551  *       Fred = 1
11552  *       $f2 = :Fred
11553  *     end
11554  *     def Fred()
11555  *     end
11556  *     $f3 = :Fred
11557  *     $f1.object_id   #=> 2514190
11558  *     $f2.object_id   #=> 2514190
11559  *     $f3.object_id   #=> 2514190
11560  *
11561  * Constant, method, and variable names are returned as symbols:
11562  *
11563  *     module One
11564  *       Two = 2
11565  *       def three; 3 end
11566  *       @four = 4
11567  *       @@five = 5
11568  *       $six = 6
11569  *     end
11570  *     seven = 7
11571  *
11572  *     One.constants
11573  *     # => [:Two]
11574  *     One.instance_methods(true)
11575  *     # => [:three]
11576  *     One.instance_variables
11577  *     # => [:@four]
11578  *     One.class_variables
11579  *     # => [:@@five]
11580  *     global_variables.grep(/six/)
11581  *     # => [:$six]
11582  *     local_variables
11583  *     # => [:seven]
11584  *
11585  * A +Symbol+ object differs from a String object in that
11586  * a +Symbol+ object represents an identifier, while a String object
11587  * represents text or data.
11588  *
11589  * == What's Here
11590  *
11591  * First, what's elsewhere. \Class +Symbol+:
11592  *
11593  * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11594  * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11595  *
11596  * Here, class +Symbol+ provides methods that are useful for:
11597  *
11598  * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11599  * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11600  * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11601  *
11602  * === Methods for Querying
11603  *
11604  * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11605  * - #=~: Returns the index of the first substring in symbol that matches a
11606  *   given Regexp or other object; returns +nil+ if no match is found.
11607  * - #[], #slice : Returns a substring of symbol
11608  *   determined by a given index, start/length, or range, or string.
11609  * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11610  * - #encoding: Returns the Encoding object that represents the encoding
11611  *   of symbol.
11612  * - #end_with?: Returns +true+ if symbol ends with
11613  *   any of the given strings.
11614  * - #match: Returns a MatchData object if symbol
11615  *   matches a given Regexp; +nil+ otherwise.
11616  * - #match?: Returns +true+ if symbol
11617  *   matches a given Regexp; +false+ otherwise.
11618  * - #length, #size: Returns the number of characters in symbol.
11619  * - #start_with?: Returns +true+ if symbol starts with
11620  *   any of the given strings.
11621  *
11622  * === Methods for Comparing
11623  *
11624  * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11625  *   or larger than symbol.
11626  * - #==, #===: Returns +true+ if a given symbol has the same content and
11627  *   encoding.
11628  * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11629  *   symbol is smaller than, equal to, or larger than symbol.
11630  * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11631  *   after Unicode case folding; +false+ otherwise.
11632  *
11633  * === Methods for Converting
11634  *
11635  * - #capitalize: Returns symbol with the first character upcased
11636  *   and all other characters downcased.
11637  * - #downcase: Returns symbol with all characters downcased.
11638  * - #inspect: Returns the string representation of +self+ as a symbol literal.
11639  * - #name: Returns the frozen string corresponding to symbol.
11640  * - #succ, #next: Returns the symbol that is the successor to symbol.
11641  * - #swapcase: Returns symbol with all upcase characters downcased
11642  *   and all downcase characters upcased.
11643  * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11644  * - #to_s, #id2name: Returns the string corresponding to +self+.
11645  * - #to_sym, #intern: Returns +self+.
11646  * - #upcase: Returns symbol with all characters upcased.
11647  *
11648  */
11649
11650
11651 /*
11652  *  call-seq:
11653  *    symbol == object -> true or false
11654  *
11655  *  Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11656  */
11657
/* sym_equal is just another name for rb_obj_equal; no Symbol-specific
 * comparison function is defined here. */
#define sym_equal rb_obj_equal
11659
11660 static int
11661 sym_printable(const char *s, const char *send, rb_encoding *enc)
11662 {
11663     while (s < send) {
11664         int n;
11665         int c = rb_enc_precise_mbclen(s, send, enc);
11666
11667         if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11668         n = MBCLEN_CHARFOUND_LEN(c);
11669         c = rb_enc_mbc_to_codepoint(s, send, enc);
11670         if (!rb_enc_isprint(c, enc)) return FALSE;
11671         s += n;
11672     }
11673     return TRUE;
11674 }
11675
11676 int
11677 rb_str_symname_p(VALUE sym)
11678 {
11679     rb_encoding *enc;
11680     const char *ptr;
11681     long len;
11682     rb_encoding *resenc = rb_default_internal_encoding();
11683
11684     if (resenc == NULL) resenc = rb_default_external_encoding();
11685     enc = STR_ENC_GET(sym);
11686     ptr = RSTRING_PTR(sym);
11687     len = RSTRING_LEN(sym);
11688     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11689         !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11690         return FALSE;
11691     }
11692     return TRUE;
11693 }
11694
11695 VALUE
11696 rb_str_quote_unprintable(VALUE str)
11697 {
11698     rb_encoding *enc;
11699     const char *ptr;
11700     long len;
11701     rb_encoding *resenc;
11702
11703     Check_Type(str, T_STRING);
11704     resenc = rb_default_internal_encoding();
11705     if (resenc == NULL) resenc = rb_default_external_encoding();
11706     enc = STR_ENC_GET(str);
11707     ptr = RSTRING_PTR(str);
11708     len = RSTRING_LEN(str);
11709     if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11710         !sym_printable(ptr, ptr + len, enc)) {
11711         return rb_str_escape(str);
11712     }
11713     return str;
11714 }
11715
11716 VALUE
11717 rb_id_quote_unprintable(ID id)
11718 {
11719     VALUE str = rb_id2str(id);
11720     if (!rb_str_symname_p(str)) {
11721         return rb_str_escape(str);
11722     }
11723     return str;
11724 }
11725
11726 /*
11727  *  call-seq:
11728  *    inspect -> string
11729  *
11730  *  Returns a string representation of +self+ (including the leading colon):
11731  *
11732  *    :foo.inspect # => ":foo"
11733  *
11734  *  Related:  Symbol#to_s, Symbol#name.
11735  *
11736  */
11737
11738 static VALUE
11739 sym_inspect(VALUE sym)
11740 {
11741     VALUE str = rb_sym2str(sym);
11742     const char *ptr;
11743     long len;
11744     char *dest;
11745
11746     if (!rb_str_symname_p(str)) {
11747         str = rb_str_inspect(str);
11748         len = RSTRING_LEN(str);
11749         rb_str_resize(str, len + 1);
11750         dest = RSTRING_PTR(str);
11751         memmove(dest + 1, dest, len);
11752     }
11753     else {
11754         rb_encoding *enc = STR_ENC_GET(str);
11755         VALUE orig_str = str;
11756
11757         len = RSTRING_LEN(orig_str);
11758         str = rb_enc_str_new(0, len + 1, enc);
11759
11760         // Get data pointer after allocation
11761         ptr = RSTRING_PTR(orig_str);
11762         dest = RSTRING_PTR(str);
11763         memcpy(dest + 1, ptr, len);
11764
11765         RB_GC_GUARD(orig_str);
11766     }
11767     dest[0] = ':';
11768
11769     RUBY_ASSERT_BUILTIN_TYPE(str, T_STRING);
11770
11771     return str;
11772 }
11773
11774 VALUE
11775 rb_sym_to_s(VALUE sym)
11776 {
11777     return str_new_shared(rb_cString, rb_sym2str(sym));
11778 }
11779
11780 VALUE
11781 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11782 {
11783     VALUE obj;
11784
11785     if (argc < 1) {
11786         rb_raise(rb_eArgError, "no receiver given");
11787     }
11788     obj = argv[0];
11789     return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11790 }
11791
11792 /*
11793  *  call-seq:
11794  *    succ
11795  *
11796  *  Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11797  *
11798  *    :foo.succ # => :fop
11799  *
11800  *  Related: String#succ.
11801  */
11802
11803 static VALUE
11804 sym_succ(VALUE sym)
11805 {
11806     return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11807 }
11808
11809 /*
11810  *  call-seq:
11811  *   symbol <=> object -> -1, 0, +1, or nil
11812  *
11813  *  If +object+ is a symbol,
11814  *  returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11815  *
11816  *    :bar <=> :foo # => -1
11817  *    :foo <=> :foo # => 0
11818  *    :foo <=> :bar # => 1
11819  *
11820  *  Otherwise, returns +nil+:
11821  *
11822  *   :foo <=> 'bar' # => nil
11823  *
11824  *  Related: String#<=>.
11825  */
11826
11827 static VALUE
11828 sym_cmp(VALUE sym, VALUE other)
11829 {
11830     if (!SYMBOL_P(other)) {
11831         return Qnil;
11832     }
11833     return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11834 }
11835
11836 /*
11837  *  call-seq:
11838  *    casecmp(object) -> -1, 0, 1, or nil
11839  *
11840  *  :include: doc/symbol/casecmp.rdoc
11841  *
11842  */
11843
11844 static VALUE
11845 sym_casecmp(VALUE sym, VALUE other)
11846 {
11847     if (!SYMBOL_P(other)) {
11848         return Qnil;
11849     }
11850     return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11851 }
11852
11853 /*
11854  *  call-seq:
11855  *    casecmp?(object) -> true, false, or nil
11856  *
11857  *  :include: doc/symbol/casecmp_p.rdoc
11858  *
11859  */
11860
11861 static VALUE
11862 sym_casecmp_p(VALUE sym, VALUE other)
11863 {
11864     if (!SYMBOL_P(other)) {
11865         return Qnil;
11866     }
11867     return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11868 }
11869
11870 /*
11871  *  call-seq:
11872  *    symbol =~ object -> integer or nil
11873  *
11874  *  Equivalent to <tt>symbol.to_s =~ object</tt>,
11875  *  including possible updates to global variables;
11876  *  see String#=~.
11877  *
11878  */
11879
11880 static VALUE
11881 sym_match(VALUE sym, VALUE other)
11882 {
11883     return rb_str_match(rb_sym2str(sym), other);
11884 }
11885
11886 /*
11887  *  call-seq:
11888  *    match(pattern, offset = 0) -> matchdata or nil
11889  *    match(pattern, offset = 0) {|matchdata| } -> object
11890  *
11891  *  Equivalent to <tt>self.to_s.match</tt>,
11892  *  including possible updates to global variables;
11893  *  see String#match.
11894  *
11895  */
11896
11897 static VALUE
11898 sym_match_m(int argc, VALUE *argv, VALUE sym)
11899 {
11900     return rb_str_match_m(argc, argv, rb_sym2str(sym));
11901 }
11902
11903 /*
11904  *  call-seq:
11905  *    match?(pattern, offset) -> true or false
11906  *
11907  *  Equivalent to <tt>sym.to_s.match?</tt>;
 *  see String#match?.
11909  *
11910  */
11911
11912 static VALUE
11913 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11914 {
11915     return rb_str_match_m_p(argc, argv, sym);
11916 }
11917
11918 /*
11919  *  call-seq:
11920  *    symbol[index] -> string or nil
11921  *    symbol[start, length] -> string or nil
11922  *    symbol[range] -> string or nil
11923  *    symbol[regexp, capture = 0] -> string or nil
11924  *    symbol[substring] -> string or nil
11925  *
11926  *  Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11927  *
11928  */
11929
11930 static VALUE
11931 sym_aref(int argc, VALUE *argv, VALUE sym)
11932 {
11933     return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11934 }
11935
11936 /*
11937  *  call-seq:
11938  *    length -> integer
11939  *
11940  *  Equivalent to <tt>self.to_s.length</tt>; see String#length.
11941  */
11942
11943 static VALUE
11944 sym_length(VALUE sym)
11945 {
11946     return rb_str_length(rb_sym2str(sym));
11947 }
11948
11949 /*
11950  *  call-seq:
11951  *    empty? -> true or false
11952  *
11953  *  Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11954  *
11955  */
11956
11957 static VALUE
11958 sym_empty(VALUE sym)
11959 {
11960     return rb_str_empty(rb_sym2str(sym));
11961 }
11962
11963 /*
11964  *  call-seq:
11965  *    upcase(*options) -> symbol
11966  *
11967  *  Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11968  *
11969  *  See String#upcase.
11970  *
11971  */
11972
11973 static VALUE
11974 sym_upcase(int argc, VALUE *argv, VALUE sym)
11975 {
11976     return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11977 }
11978
11979 /*
11980  *  call-seq:
11981  *    downcase(*options) -> symbol
11982  *
11983  *  Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11984  *
11985  *  See String#downcase.
11986  *
11987  *  Related: Symbol#upcase.
11988  *
11989  */
11990
11991 static VALUE
11992 sym_downcase(int argc, VALUE *argv, VALUE sym)
11993 {
11994     return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11995 }
11996
11997 /*
11998  *  call-seq:
11999  *    capitalize(*options) -> symbol
12000  *
12001  *  Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12002  *
12003  *  See String#capitalize.
12004  *
12005  */
12006
12007 static VALUE
12008 sym_capitalize(int argc, VALUE *argv, VALUE sym)
12009 {
12010     return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12011 }
12012
12013 /*
12014  *  call-seq:
12015  *    swapcase(*options) -> symbol
12016  *
12017  *  Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12018  *
12019  *  See String#swapcase.
12020  *
12021  */
12022
12023 static VALUE
12024 sym_swapcase(int argc, VALUE *argv, VALUE sym)
12025 {
12026     return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12027 }
12028
12029 /*
12030  *  call-seq:
12031  *    start_with?(*string_or_regexp) -> true or false
12032  *
12033  *  Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12034  *
12035  */
12036
12037 static VALUE
12038 sym_start_with(int argc, VALUE *argv, VALUE sym)
12039 {
12040     return rb_str_start_with(argc, argv, rb_sym2str(sym));
12041 }
12042
12043 /*
12044  *  call-seq:
12045  *    end_with?(*strings) -> true or false
12046  *
12047  *
12048  *  Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12049  *
12050  */
12051
12052 static VALUE
12053 sym_end_with(int argc, VALUE *argv, VALUE sym)
12054 {
12055     return rb_str_end_with(argc, argv, rb_sym2str(sym));
12056 }
12057
12058 /*
12059  *  call-seq:
12060  *    encoding -> encoding
12061  *
12062  *  Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12063  *
12064  */
12065
12066 static VALUE
12067 sym_encoding(VALUE sym)
12068 {
12069     return rb_obj_encoding(rb_sym2str(sym));
12070 }
12071
12072 static VALUE
12073 string_for_symbol(VALUE name)
12074 {
12075     if (!RB_TYPE_P(name, T_STRING)) {
12076         VALUE tmp = rb_check_string_type(name);
12077         if (NIL_P(tmp)) {
12078             rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
12079                      name);
12080         }
12081         name = tmp;
12082     }
12083     return name;
12084 }
12085
12086 ID
12087 rb_to_id(VALUE name)
12088 {
12089     if (SYMBOL_P(name)) {
12090         return SYM2ID(name);
12091     }
12092     name = string_for_symbol(name);
12093     return rb_intern_str(name);
12094 }
12095
12096 VALUE
12097 rb_to_symbol(VALUE name)
12098 {
12099     if (SYMBOL_P(name)) {
12100         return name;
12101     }
12102     name = string_for_symbol(name);
12103     return rb_str_intern(name);
12104 }
12105
12106 /*
12107  *  call-seq:
12108  *    Symbol.all_symbols -> array_of_symbols
12109  *
12110  *  Returns an array of all symbols currently in Ruby's symbol table:
12111  *
12112  *    Symbol.all_symbols.size    # => 9334
12113  *    Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12114  *
12115  */
12116
12117 static VALUE
12118 sym_all_symbols(VALUE _)
12119 {
12120     return rb_sym_all_symbols();
12121 }
12122
12123 VALUE
12124 rb_str_to_interned_str(VALUE str)
12125 {
12126     return rb_fstring(str);
12127 }
12128
12129 VALUE
12130 rb_interned_str(const char *ptr, long len)
12131 {
12132     struct RString fake_str;
12133     return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
12134 }
12135
12136 VALUE
12137 rb_interned_str_cstr(const char *ptr)
12138 {
12139     return rb_interned_str(ptr, strlen(ptr));
12140 }
12141
12142 VALUE
12143 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12144 {
12145     if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12146         rb_enc_autoload(enc);
12147     }
12148
12149     struct RString fake_str;
12150     return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
12151 }
12152
12153 VALUE
12154 rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
12155 {
12156     return rb_enc_interned_str(ptr, strlen(ptr), enc);
12157 }
12158
12159 void
12160 Init_String(void)
12161 {
12162     rb_cString  = rb_define_class("String", rb_cObject);
          /*
           * Init_String sets up both String and Symbol: it creates each class
           * and then binds every Ruby-visible method name to its C function.
           * The last argument of each rb_define_method() call is the declared
           * arity; -1 is used for functions taking (argc, argv, self).
           */
          /*
           * NOTE(review): presumably strings interned before String existed
           * have no class yet and fstring_set_class_i assigns rb_cString to
           * each entry of the VM fstring table -- confirm against its
           * definition.
           */
12163     RUBY_ASSERT(rb_vm_fstring_table());
12164     st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12165     rb_include_module(rb_cString, rb_mComparable);
          /* Core String API: construction, comparison, operators, indexing,
           * searching and byte-level access. */
12166     rb_define_alloc_func(rb_cString, empty_str_alloc);
12167     rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12168     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12169     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12170     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12171     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12172     rb_define_method(rb_cString, "==", rb_str_equal, 1);
12173     rb_define_method(rb_cString, "===", rb_str_equal, 1);
12174     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12175     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12176     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12177     rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12178     rb_define_method(rb_cString, "+", rb_str_plus, 1);
12179     rb_define_method(rb_cString, "*", rb_str_times, 1);
12180     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12181     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12182     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12183     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12184     rb_define_method(rb_cString, "length", rb_str_length, 0);
12185     rb_define_method(rb_cString, "size", rb_str_length, 0);
12186     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12187     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12188     rb_define_method(rb_cString, "=~", rb_str_match, 1);
12189     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12190     rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12191     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
12192     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12193     rb_define_method(rb_cString, "next", rb_str_succ, 0);
12194     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12195     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12196     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12197     rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12198     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12199     rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12200     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12201     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12202     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12203     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12204     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12205     rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12206     rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12207     rb_define_method(rb_cString, "scrub", str_scrub, -1);
12208     rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12209     rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12210     rb_define_method(rb_cString, "+@", str_uplus, 0);
12211     rb_define_method(rb_cString, "-@", str_uminus, 0);
12212     rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
          /* String#dedup is an alias of String#-@. */
12213     rb_define_alias(rb_cString, "dedup", "-@");
12214
          /* Conversions and printable representations. */
12215     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12216     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12217     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12218     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12219     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12220     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
12221     rb_define_method(rb_cString, "undump", str_undump, 0);
12222
          /*
           * Cache four symbols in file-level variables.
           * NOTE(review): these look like the option names accepted by the
           * case-mapping methods registered below -- confirm at their use
           * sites.
           */
12223     sym_ascii      = ID2SYM(rb_intern_const("ascii"));
12224     sym_turkic     = ID2SYM(rb_intern_const("turkic"));
12225     sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12226     sym_fold       = ID2SYM(rb_intern_const("fold"));
12227
          /* Case mapping; the bang (!) counterparts follow in the next group. */
12228     rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12229     rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12230     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12231     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12232
12233     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12234     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12235     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12236     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12237
          /* Parsing, decomposition, reversal, concatenation and other helpers. */
12238     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12239     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12240     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12241     rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12242     rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12243     rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12244     rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12245     rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12246     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12247     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12248     rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12249     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12250     rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12251     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12252     rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12253     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12254     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12255
          /* Substring, prefix and suffix predicates. */
12256     rb_define_method(rb_cString, "include?", rb_str_include, 1);
12257     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12258     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12259
12260     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12261
          /* Justification. */
12262     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12263     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12264     rb_define_method(rb_cString, "center", rb_str_center, -1);
12265
          /* Substitution and trimming; the bang (!) counterparts follow in the
           * next group. */
12266     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12267     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12268     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12269     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12270     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12271     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12272     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12273     rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12274     rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12275
12276     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12277     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12278     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12279     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12280     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12281     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12282     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12283     rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12284     rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12285
          /* tr/tr_s/delete/squeeze/count; the bang (!) counterparts follow in
           * the next group ("count" has none). */
12286     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12287     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12288     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12289     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12290     rb_define_method(rb_cString, "count", rb_str_count, -1);
12291
12292     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12293     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12294     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12295     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12296
          /* each_* iterators. */
12297     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12298     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12299     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12300     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12301     rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12302
12303     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12304
          /* "slice" shares its implementation with "[]" (rb_str_aref_m). */
12305     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12306     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12307
12308     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12309     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12310
          /* Encoding queries and manipulation. */
12311     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12312     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12313     rb_define_method(rb_cString, "b", rb_str_b, 0);
12314     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12315     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12316
12317     /* define UnicodeNormalize module here so that we don't have to look it up */
12318     mUnicodeNormalize          = rb_define_module("UnicodeNormalize");
12319     id_normalize               = rb_intern_const("normalize");
12320     id_normalized_p            = rb_intern_const("normalized?");
12321
12322     rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12323     rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12324     rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12325
          /*
           * rb_fs starts out as nil.  The global variables "$;" and "$-F" are
           * both backed by rb_fs and both use rb_fs_setter, so they are two
           * names for the same value.  The address of rb_fs is then registered
           * with the GC.
           */
12326     rb_fs = Qnil;
12327     rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12328     rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12329     rb_gc_register_address(&rb_fs);
12330
12331     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
          /*
           * Symbol includes Comparable like String does.  Its allocator is
           * undefined and "new" is undefined on CLASS_OF(rb_cSymbol), so
           * Symbol.new is not available.
           */
12332     rb_include_module(rb_cSymbol, rb_mComparable);
12333     rb_undef_alloc_func(rb_cSymbol);
12334     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
12335     rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12336
12337     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12338     rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12339     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12340     rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12341     rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12342     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12343     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12344
12345     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12346     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12347     rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12348     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12349
12350     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12351     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12352     rb_define_method(rb_cSymbol, "length", sym_length, 0);
12353     rb_define_method(rb_cSymbol, "size", sym_length, 0);
12354     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12355     rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12356     rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12357
          /*
           * The case-mapping, prefix/suffix and encoding methods below are
           * thin sym_* wrappers that obtain the symbol's name with
           * rb_sym2str() and forward to the corresponding String function.
           */
12358     rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12359     rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12360     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12361     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12362
12363     rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12364     rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12365
12366     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12367 }