transcode.c

   1 /**********************************************************************
   2
   3   transcode.c -
   4
   5   $Author$
   6   created at: Tue Oct 30 16:10:22 JST 2007
   7
   8   Copyright (C) 2007 Martin Duerst
   9
  10 **********************************************************************/
  11
  12 #include "ruby/internal/config.h"
  13
  14 #include <ctype.h>
  15
  16 #include "internal.h"
  17 #include "internal/array.h"
  18 #include "internal/inits.h"
  19 #include "internal/object.h"
  20 #include "internal/string.h"
  21 #include "internal/transcode.h"
  22 #include "ruby/encoding.h"
  23
  24 #include "transcode_data.h"
  25 #include "id.h"
  26
  /*
   * File-scope configuration and cached Ruby objects for this file.
   * Nothing in this part of the file assigns the VALUE/ID variables below;
   * presumably an init routine elsewhere in the file fills them in -- confirm.
   */
  /*
   * NOTE(review): the guard further down is an #ifdef, so it only tests that
   * this macro is defined.  Changing the value to 0 would not disable the
   * guarded declarations; the #define itself would have to be removed.
   */
  27 #define ENABLE_ECONV_NEWLINE_OPTION 1
  28
  29 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
     /* Presumably exception classes (rb_e* naming); private to this file. */
  30 static VALUE rb_eUndefinedConversionError;
  31 static VALUE rb_eInvalidByteSequenceError;
  32 static VALUE rb_eConverterNotFoundError;
  33
     /* Not static: this one has external linkage. */
  34 VALUE rb_cEncodingConverter;
  35
     /* Interned IDs; how they are used is not visible in this part of the file. */
  36 static ID id_destination_encoding;
  37 static ID id_destination_encoding_name;
  38 static ID id_error_bytes;
  39 static ID id_error_char;
  40 static ID id_incomplete_input;
  41 static ID id_readagain_bytes;
  42 static ID id_source_encoding;
  43 static ID id_source_encoding_name;
  44
     /* Cached VALUEs, presumably Symbols used as option keys/values (sym_* naming). */
  45 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
  46 static VALUE sym_xml, sym_text, sym_attr;
  47 static VALUE sym_universal_newline;
  48 static VALUE sym_crlf_newline;
  49 static VALUE sym_cr_newline;
  50 #ifdef ENABLE_ECONV_NEWLINE_OPTION
  51 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
  52 #endif
  53 static VALUE sym_partial_input;
  54
     /*
      * These names mirror the econv_* result codes returned by
      * transcode_restartable0(); the code that maps one to the other is not
      * in this part of the file.
      */
  55 static VALUE sym_invalid_byte_sequence;
  56 static VALUE sym_undefined_conversion;
  57 static VALUE sym_destination_buffer_full;
  58 static VALUE sym_source_buffer_empty;
  59 static VALUE sym_finished;
  60 static VALUE sym_after_output;
  61 static VALUE sym_incomplete_input;
  62
  /*
   * Forward declaration only; the definition is not in this part of the file.
   * NOTE(review): judging by the parameter names alone, it converts `len`
   * bytes at `str` from encoding `sname` to `dname`, may use the caller's
   * buffer, and reports the result length through dst_len_ptr -- confirm
   * against the definition.
   */
  63 static unsigned char *
  64 allocate_converted_string(const char *sname, const char *dname,
  65         const unsigned char *str, size_t len,
  66         unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
  67         size_t *dst_len_ptr);
  68
  69 /* dynamic structure, one per conversion (similar to iconv_t) */
  70 /* may carry conversion state (e.g. for iso-2022-jp) */
     /*
      * Created by rb_transcoding_open_by_transcoder() and released by
      * rb_transcoding_close().  Besides the buffers it holds everything
      * transcode_restartable0() needs in order to continue after it has
      * suspended.
      */
  71 typedef struct rb_transcoding {
         /* Transcoder description; max_input, max_output and state_size are read through it. */
  72     const rb_transcoder *transcoder;
  73
         /* Copied from the flags argument of rb_transcoding_open_by_transcoder(). */
  74     int flags;
  75
         /* Number of the resume label transcode_restartable0() jumps to on its next call; 0 = start from the top. */
  76     int resume_position;
         /* Working values of transcode_restartable0(), kept here so they survive a suspend/resume. */
  77     unsigned int next_table;
  78     VALUE next_info;
  79     unsigned char next_byte;
  80     unsigned int output_index;
  81
  82     ssize_t recognized_len; /* already interpreted */
  83     ssize_t readagain_len; /* not yet interpreted */
         /* Access through TRANSCODING_READBUF(): inline array or heap block, chosen by max_input. */
  84     union {
  85         unsigned char ary[8]; /* max_input <= sizeof(ary) */
  86         unsigned char *ptr; /* length: max_input */
  87     } readbuf; /* recognized_len + readagain_len used */
  88
         /* Next byte to emit from writebuf, and the number of valid bytes in it. */
  89     ssize_t writebuf_off;
  90     ssize_t writebuf_len;
         /* Access through TRANSCODING_WRITEBUF(): inline array or heap block, chosen by max_output. */
  91     union {
  92         unsigned char ary[8]; /* max_output <= sizeof(ary) */
  93         unsigned char *ptr; /* length: max_output */
  94     } writebuf;
  95
         /* Access through TRANSCODING_STATE(): embedded in ary when state_size fits, else a heap block in ptr. */
  96     union rb_transcoding_state_t { /* opaque data for stateful encoding */
  97         void *ptr;
  98         char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
  99         double dummy_for_alignment;
  100     } state;
  101 } rb_transcoding;
  /*
   * Accessors that hide the "embedded array vs. heap pointer" choice made in
   * rb_transcoding_open_by_transcoder().  Each one repeats the size test used
   * there, so the union member that is read is the one that was set up.
   * All of them evaluate `tc` more than once.
   */
  /* Read buffer: the inline array when max_input fits in it, else the heap block. */
  102 #define TRANSCODING_READBUF(tc) \
  103     ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
  104      (tc)->readbuf.ary : \
  105      (tc)->readbuf.ptr)
  /* Write buffer: the inline array when max_output fits in it, else the heap block. */
  106 #define TRANSCODING_WRITEBUF(tc) \
  107     ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
  108      (tc)->writebuf.ary : \
  109      (tc)->writebuf.ptr)
  /* Capacity of whatever TRANSCODING_WRITEBUF() yields: sizeof the array, or max_output. */
  110 #define TRANSCODING_WRITEBUF_SIZE(tc) \
  111     ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
  112      sizeof((tc)->writebuf.ary) : \
  113      (size_t)(tc)->transcoder->max_output)
  /* Largest state_size that is stored inside the struct instead of on the heap. */
  114 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
  /* State storage: embedded bytes or heap block.  sizeof((tc)->state) is the same size as TRANSCODING_STATE_EMBED_MAX. */
  115 #define TRANSCODING_STATE(tc) \
  116     ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
  117      (tc)->state.ary : \
  118      (tc)->state.ptr)
 119
  /*
   * One slot of rb_econv_t.elems: a transcoding plus its output buffer.
   * rb_econv_add_transcoder_at() sets out_buf_start/out_buf_end to the two
   * ends of a freshly allocated buffer, both out_data_* pointers to its
   * start, and last_result to econv_source_buffer_empty.
   */
  120 typedef struct {
  121     struct rb_transcoding *tc;
          /* out_buf_* delimit the allocation; out_data_* delimit a range inside it (presumably the pending bytes -- confirm). */
  122     unsigned char *out_buf_start;
  123     unsigned char *out_data_start;
  124     unsigned char *out_data_end;
  125     unsigned char *out_buf_end;
  126     rb_econv_result_t last_result;
  127 } rb_econv_elem_t;
 128
  /*
   * Converter object holding an array of rb_econv_elem_t slots.
   * rb_econv_alloc() creates it with every pointer NULL, every counter 0 and
   * last_error.result set to econv_source_buffer_empty.
   */
  129 struct rb_econv_t {
  130     int flags;
  131     int started; /* bool */
  132
  133     const char *source_encoding_name;
  134     const char *destination_encoding_name;
  135
          /* Replacement bytes, their length, and presumably the name of the encoding they are in -- usage not visible here. */
  136     const unsigned char *replacement_str;
  137     size_t replacement_len;
  138     const char *replacement_enc;
  139
          /* Same four-pointer layout as the out_* pointers of rb_econv_elem_t; all NULL after rb_econv_alloc(). */
  140     unsigned char *in_buf_start;
  141     unsigned char *in_data_start;
  142     unsigned char *in_data_end;
  143     unsigned char *in_buf_end;
          /* Array of num_allocated slots, of which the first num_trans are in use (see rb_econv_add_transcoder_at()). */
  144     rb_econv_elem_t *elems;
  145     int replacement_allocated; /* bool */
  146     int num_allocated;
  147     int num_trans;
  148     int num_finished;
  149     struct rb_transcoding *last_tc;
  150
  151     /* last error */
  152     struct {
  153         rb_econv_result_t result;
  154         struct rb_transcoding *error_tc;
  155         const char *source_encoding;
  156         const char *destination_encoding;
  157         const unsigned char *error_bytes_start;
  158         size_t error_bytes_len;
  159         size_t readagain_len;
  160     } last_error;
  161
  162     /* The following fields are only for Encoding::Converter.
  163      * rb_econv_open set them NULL. */
  164     rb_encoding *source_encoding;
  165     rb_encoding *destination_encoding;
  166 };
 167
 168 /*
 169  *  Dispatch data and logic
 170  */
 171
  /* True when the source name is the empty string; dname is accepted but not examined. */
  172 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
  173
  /*
   * Registry record for one (source, destination) pair.  A fresh entry from
   * make_transcoder_entry() has lib and transcoder both NULL; lib is set by
   * declare_transcoder() and transcoder by rb_register_transcoder().
   */
  174 typedef struct {
  175     const char *sname;
  176     const char *dname;
  177     const char *lib; /* null means no need to load a library */
  178     const rb_transcoder *transcoder;
  179 } transcoder_entry_t;
  180
  /*
   * Two-level registry: source name -> (st_table of destination name ->
   * transcoder_entry_t *).  The second-level tables are created with
   * st_init_strcasetable(); the creation of this top-level table is not
   * visible in this part of the file.
   */
  181 static st_table *transcoder_table;
 182
 183 static transcoder_entry_t *
 184 make_transcoder_entry(const char *sname, const char *dname)
 185 {
 186     st_data_t val;
 187     st_table *table2;
 188
 189     if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
 190         val = (st_data_t)st_init_strcasetable();
 191         st_add_direct(transcoder_table, (st_data_t)sname, val);
 192     }
 193     table2 = (st_table *)val;
 194     if (!st_lookup(table2, (st_data_t)dname, &val)) {
 195         transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
 196         entry->sname = sname;
 197         entry->dname = dname;
 198         entry->lib = NULL;
 199         entry->transcoder = NULL;
 200         val = (st_data_t)entry;
 201         st_add_direct(table2, (st_data_t)dname, val);
 202     }
 203     return (transcoder_entry_t *)val;
 204 }
 205
 206 static transcoder_entry_t *
 207 get_transcoder_entry(const char *sname, const char *dname)
 208 {
 209     st_data_t val;
 210     st_table *table2;
 211
 212     if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
 213         return NULL;
 214     }
 215     table2 = (st_table *)val;
 216     if (!st_lookup(table2, (st_data_t)dname, &val)) {
 217         return NULL;
 218     }
 219     return (transcoder_entry_t *)val;
 220 }
 221
 222 void
 223 rb_register_transcoder(const rb_transcoder *tr)
 224 {
 225     const char *const sname = tr->src_encoding;
 226     const char *const dname = tr->dst_encoding;
 227
 228     transcoder_entry_t *entry;
 229
 230     entry = make_transcoder_entry(sname, dname);
 231     if (entry->transcoder) {
 232         rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
 233                  sname, dname);
 234     }
 235
 236     entry->transcoder = tr;
 237 }
 238
 239 static void
 240 declare_transcoder(const char *sname, const char *dname, const char *lib)
 241 {
 242     transcoder_entry_t *entry;
 243
 244     entry = make_transcoder_entry(sname, dname);
 245     entry->lib = lib;
 246 }
 247
  /* Prefix that load_transcoder_entry() puts in front of an entry's lib name
   * to build the path it passes to rb_require_internal_silent(). */
  248 static const char transcoder_lib_prefix[] = "enc/trans/";
 249
 250 void
 251 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
 252 {
 253     if (!lib) {
 254         rb_raise(rb_eArgError, "invalid library name - (null)");
 255     }
 256     declare_transcoder(enc1, enc2, lib);
 257 }
 258
  /* Encoding names compare equal when STRCASECMP() reports no difference. */
  259 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
  260
  /* Singly linked work-queue node used by transcode_search_path(). */
  261 typedef struct search_path_queue_tag {
  262     struct search_path_queue_tag *next;
  263     const char *enc;
  264 } search_path_queue_t;
  265
  /*
   * Search state shared between transcode_search_path() and its st_foreach
   * callback transcode_search_path_i().
   */
  266 typedef struct {
          /* encoding name -> name it was reached from (NULL for the start name) */
  267     st_table *visited;
          /* head of the pending queue */
  268     search_path_queue_t *queue;
          /* where the next node gets linked in: &queue when empty, else &last->next */
  269     search_path_queue_t **queue_last_ptr;
          /* encoding whose neighbours are currently being enumerated */
  270     const char *base_enc;
  271 } search_path_bfs_t;
 272
 273 static int
 274 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
 275 {
 276     const char *dname = (const char *)key;
 277     search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
 278     search_path_queue_t *q;
 279
 280     if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
 281         return ST_CONTINUE;
 282     }
 283
 284     q = ALLOC(search_path_queue_t);
 285     q->enc = dname;
 286     q->next = NULL;
 287     *bfs->queue_last_ptr = q;
 288     bfs->queue_last_ptr = &q->next;
 289
 290     st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
 291     return ST_CONTINUE;
 292 }
 293
 294 static int
 295 transcode_search_path(const char *sname, const char *dname,
 296     void (*callback)(const char *sname, const char *dname, int depth, void *arg),
 297     void *arg)
 298 {
 299     search_path_bfs_t bfs;
 300     search_path_queue_t *q;
 301     st_data_t val;
 302     st_table *table2;
 303     int found;
 304     int pathlen = -1;
 305
 306     if (encoding_equal(sname, dname))
 307         return -1;
 308
 309     q = ALLOC(search_path_queue_t);
 310     q->enc = sname;
 311     q->next = NULL;
 312     bfs.queue_last_ptr = &q->next;
 313     bfs.queue = q;
 314
 315     bfs.visited = st_init_strcasetable();
 316     st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
 317
 318     while (bfs.queue) {
 319         q = bfs.queue;
 320         bfs.queue = q->next;
 321         if (!bfs.queue)
 322             bfs.queue_last_ptr = &bfs.queue;
 323
 324         if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
 325             xfree(q);
 326             continue;
 327         }
 328         table2 = (st_table *)val;
 329
 330         if (st_lookup(table2, (st_data_t)dname, &val)) {
 331             st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
 332             xfree(q);
 333             found = 1;
 334             goto cleanup;
 335         }
 336
 337         bfs.base_enc = q->enc;
 338         st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
 339         bfs.base_enc = NULL;
 340
 341         xfree(q);
 342     }
 343     found = 0;
 344
 345   cleanup:
 346     while (bfs.queue) {
 347         q = bfs.queue;
 348         bfs.queue = q->next;
 349         xfree(q);
 350     }
 351
 352     if (found) {
 353         const char *enc = dname;
 354         int depth;
 355         pathlen = 0;
 356         while (1) {
 357             st_lookup(bfs.visited, (st_data_t)enc, &val);
 358             if (!val)
 359                 break;
 360             pathlen++;
 361             enc = (const char *)val;
 362         }
 363         depth = pathlen;
 364         enc = dname;
 365         while (1) {
 366             st_lookup(bfs.visited, (st_data_t)enc, &val);
 367             if (!val)
 368                 break;
 369             callback((const char *)val, enc, --depth, arg);
 370             enc = (const char *)val;
 371         }
 372     }
 373
 374     st_free_table(bfs.visited);
 375
 376     return pathlen; /* is -1 if not found */
 377 }
 378
 379 int rb_require_internal_silent(VALUE fname);
 380
 381 static const rb_transcoder *
 382 load_transcoder_entry(transcoder_entry_t *entry)
 383 {
 384     if (entry->transcoder)
 385         return entry->transcoder;
 386
 387     if (entry->lib) {
 388         const char *const lib = entry->lib;
 389         const size_t len = strlen(lib);
 390         const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
 391         const VALUE fn = rb_str_new(0, total_len);
 392         char *const path = RSTRING_PTR(fn);
 393
 394         memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
 395         memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
 396         rb_str_set_len(fn, total_len);
 397         OBJ_FREEZE(fn);
 398         rb_require_internal_silent(fn);
 399     }
 400
 401     if (entry->transcoder)
 402         return entry->transcoder;
 403
 404     return NULL;
 405 }
 406
 407 static const char*
 408 get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
 409 {
 410     if (encoding_equal(encname, "UTF-8")) {
 411         *len_ret = 3;
 412         *repl_encname_ptr = "UTF-8";
 413         return "\xEF\xBF\xBD";
 414     }
 415     else {
 416         *len_ret = 1;
 417         *repl_encname_ptr = "US-ASCII";
 418         return "?";
 419     }
 420 }
 421
 422 /*
 423  *  Transcoding engine logic
 424  */
 425
 426 static const unsigned char *
 427 transcode_char_start(rb_transcoding *tc,
 428                          const unsigned char *in_start,
 429                          const unsigned char *inchar_start,
 430                          const unsigned char *in_p,
 431                          size_t *char_len_ptr)
 432 {
 433     const unsigned char *ptr;
 434     if (inchar_start - in_start < tc->recognized_len) {
 435         MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
 436                inchar_start, unsigned char, in_p - inchar_start);
 437         ptr = TRANSCODING_READBUF(tc);
 438     }
 439     else {
 440         ptr = inchar_start - tc->recognized_len;
 441     }
 442     *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
 443     return ptr;
 444 }
 445
  /*
   * Core conversion loop, written as a resumable state machine.
   *
   * Reads bytes from *in_pos up to in_stop and writes converted bytes from
   * *out_pos up to out_stop, following the transcoder's lookup tables one
   * input byte at a time.  Every exit goes through SUSPEND(), which stores a
   * label number in tc->resume_position, publishes the advanced positions
   * through *in_pos / *out_pos and returns a result code.  The next call
   * jumps straight back behind that SUSPEND via the switch on
   * tc->resume_position.  The locals in_p, inchar_start, out_p and
   * readagain_len are re-initialised on every entry, so anything that has to
   * survive a suspension lives in *tc (see the next_table / next_info / ...
   * aliases below).
   *
   * opt bits tested here:
   *   ECONV_PARTIAL_INPUT - running out of input suspends with
   *                         econv_source_buffer_empty instead of finishing
   *                         (or, in the middle of a character, instead of
   *                         reporting econv_incomplete_input).
   *   ECONV_AFTER_OUTPUT  - suspend with econv_after_output at the
   *                         SUSPEND_AFTER_OUTPUT checkpoints once output has
   *                         been written during this call.
   *
   * Result codes returned: econv_destination_buffer_full,
   * econv_source_buffer_empty, econv_after_output,
   * econv_invalid_byte_sequence, econv_incomplete_input,
   * econv_undefined_conversion and econv_finished.
   *
   * NOTE(review): only SUSPEND and the five tc-field aliases are #undef'd at
   * the end; SUSPEND_OBUF, SUSPEND_AFTER_OUTPUT, BYTE_ADDR, WORD_ADDR and the
   * BL_* macros stay defined for the rest of the translation unit.
   */
  446 static rb_econv_result_t
  447 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
  448                       const unsigned char *in_stop, unsigned char *out_stop,
  449                       rb_transcoding *tc,
  450                       const int opt)
  451 {
  452     const rb_transcoder *tr = tc->transcoder;
  453     int unitlen = tr->input_unit_length;
  454     ssize_t readagain_len = 0;
  455
  456     const unsigned char *inchar_start;
  457     const unsigned char *in_p;
  458
  459     unsigned char *out_p;
  460
  461     in_p = inchar_start = *in_pos;
  462
  463     out_p = *out_pos;
  464
          /*
           * SUSPEND(ret, num): save the state and return `ret`.  The bytes of
           * the current character read so far (inchar_start..in_p) are
           * appended to the read buffer and added to recognized_len; when
           * readagain_len is set, that many trailing bytes are moved from the
           * "recognized" count to tc->readagain_len.  resume_label<num> sits
           * right behind the return, so a later call continues with the
           * statement that follows the SUSPEND.  Each `num` needs a matching
           * case in the switch below.
           */
  465 #define SUSPEND(ret, num) \
  466     do { \
  467         tc->resume_position = (num); \
  468         if (0 < in_p - inchar_start) \
  469             MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
  470                    inchar_start, unsigned char, in_p - inchar_start); \
  471         *in_pos = in_p; \
  472         *out_pos = out_p; \
  473         tc->recognized_len += in_p - inchar_start; \
  474         if (readagain_len) { \
  475             tc->recognized_len -= readagain_len; \
  476             tc->readagain_len = readagain_len; \
  477         } \
  478         return (ret); \
  479         resume_label ## num:; \
  480     } while (0)
          /* Keep suspending with econv_destination_buffer_full until at least one output byte fits. */
  481 #define SUSPEND_OBUF(num) \
  482     do { \
  483         while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
  484     } while (0)
  485
          /* Checkpoint for ECONV_AFTER_OUTPUT: *out_pos still holds the position at entry, so a difference means output was written in this call. */
  486 #define SUSPEND_AFTER_OUTPUT(num) \
  487     if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
  488         SUSPEND(econv_after_output, num); \
  489     }
  490
          /* Aliases for the fields of *tc that act as persistent locals. */
  491 #define next_table (tc->next_table)
  492 #define next_info (tc->next_info)
  493 #define next_byte (tc->next_byte)
  494 #define writebuf_len (tc->writebuf_len)
  495 #define writebuf_off (tc->writebuf_off)
  496
          /* Re-entry dispatch: 0 starts from the top, any other value jumps behind the SUSPEND that returned last time. */
  497     switch (tc->resume_position) {
  498       case 0: break;
  499       case 1: goto resume_label1;
  500       case 2: goto resume_label2;
  501       case 3: goto resume_label3;
  502       case 4: goto resume_label4;
  503       case 5: goto resume_label5;
  504       case 6: goto resume_label6;
  505       case 7: goto resume_label7;
  506       case 8: goto resume_label8;
  507       case 9: goto resume_label9;
  508       case 10: goto resume_label10;
  509       case 11: goto resume_label11;
  510       case 12: goto resume_label12;
  511       case 13: goto resume_label13;
  512       case 14: goto resume_label14;
  513       case 15: goto resume_label15;
  514       case 16: goto resume_label16;
  515       case 17: goto resume_label17;
  516       case 18: goto resume_label18;
  517       case 19: goto resume_label19;
  518       case 20: goto resume_label20;
  519       case 21: goto resume_label21;
  520       case 22: goto resume_label22;
  521       case 23: goto resume_label23;
  522       case 24: goto resume_label24;
  523       case 25: goto resume_label25;
  524       case 26: goto resume_label26;
  525       case 27: goto resume_label27;
  526       case 28: goto resume_label28;
  527       case 29: goto resume_label29;
  528       case 30: goto resume_label30;
  529       case 31: goto resume_label31;
  530       case 32: goto resume_label32;
  531       case 33: goto resume_label33;
  532       case 34: goto resume_label34;
  533     }
  534
  535     while (1) {
              /* Start of a new input character: reset the per-character state and begin at the root table. */
  536         inchar_start = in_p;
  537         tc->recognized_len = 0;
  538         next_table = tr->conv_tree_start;
  539
  540         SUSPEND_AFTER_OUTPUT(24);
  541
              /* Input used up between characters: leave the loop for the cleanup code unless more input may follow. */
  542         if (in_stop <= in_p) {
  543             if (!(opt & ECONV_PARTIAL_INPUT))
  544                 break;
  545             SUSPEND(econv_source_buffer_empty, 7);
  546             continue;
  547         }
  548
              /* Helpers for reading the byte-lookup node that next_table refers to. */
  549 #define BYTE_ADDR(index) (tr->byte_array + (index))
  550 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
  551 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
  552 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
  553 #define BL_MIN_BYTE     (BL_BASE[0])
  554 #define BL_MAX_BYTE     (BL_BASE[1])
  555 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
  556 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
  557
  558         next_byte = (unsigned char)*in_p++;
              /* Look next_byte up in the current node; bytes outside [BL_MIN_BYTE, BL_MAX_BYTE] are invalid. */
  559       follow_byte:
  560         if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
  561             next_info = INVALID;
  562         else {
  563             next_info = (VALUE)BL_ACTION(next_byte);
  564         }
              /* Dispatch on the low five bits of the action word. */
  565       follow_info:
  566         switch (next_info & 0x1F) {
                /* Copy the bytes of this character to the output unchanged, staged through the write buffer. */
  567           case NOMAP:
  568             {
  569                 const unsigned char *p = inchar_start;
  570                 writebuf_off = 0;
  571                 while (p < in_p) {
  572                     TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
  573                 }
  574                 writebuf_len = writebuf_off;
  575                 writebuf_off = 0;
  576                 while (writebuf_off < writebuf_len) {
  577                     SUSPEND_OBUF(3);
  578                     *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  579                 }
  580             }
  581             continue;
                /* The action is the next lookup node: fetch one more input byte and descend. */
  582           case 0x00: case 0x04: case 0x08: case 0x0C:
  583           case 0x10: case 0x14: case 0x18: case 0x1C:
  584             SUSPEND_AFTER_OUTPUT(25);
  585             while (in_p >= in_stop) {
  586                 if (!(opt & ECONV_PARTIAL_INPUT))
  587                     goto incomplete;
  588                 SUSPEND(econv_source_buffer_empty, 5);
  589             }
  590             next_byte = (unsigned char)*in_p++;
  591             next_table = (unsigned int)next_info;
  592             goto follow_byte;
  593           case ZERObt: /* drop input */
  594             continue;
                /* Fixed-length outputs taken from the action word; each byte waits for output room on its own. */
  595           case ONEbt:
  596             SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
  597             continue;
  598           case TWObt:
  599             SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
  600             SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
  601             continue;
  602           case THREEbt:
  603             SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
  604             SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
  605             SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
  606             continue;
  607           case FOURbt:
  608             SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
  609             SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
  610             SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
  611             SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
  612             continue;
  613           case GB4bt:
  614             SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
  615             SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
  616             SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
  617             SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
  618             continue;
                /* Output a length-prefixed byte string from byte_array; the progress counter lives in tc->output_index. */
  619           case STR1:
  620             tc->output_index = 0;
  621             while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
  622                 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
  623                 tc->output_index++;
  624             }
  625             continue;
                /* Transcoder callbacks that compute a new action word, which is then dispatched again. */
  626           case FUNii:
  627             next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
  628             goto follow_info;
  629           case FUNsi:
  630             {
  631                 const unsigned char *char_start;
  632                 size_t char_len;
  633                 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  634                 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
  635                 goto follow_info;
  636             }
                /*
                 * Transcoder callbacks that produce output bytes.  They write
                 * straight into the output when at least max_output bytes are
                 * free there; otherwise into the write buffer, which is then
                 * drained byte by byte.  The closing `break` leaves the
                 * switch and reaches the `continue` behind it.
                 */
  637           case FUNio:
  638             SUSPEND_OBUF(13);
  639             if (tr->max_output <= out_stop - out_p)
  640                 out_p += tr->func_io(TRANSCODING_STATE(tc),
  641                     next_info, out_p, out_stop - out_p);
  642             else {
  643                 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
  644                     next_info,
  645                     TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  646                 writebuf_off = 0;
  647                 while (writebuf_off < writebuf_len) {
  648                     SUSPEND_OBUF(20);
  649                     *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  650                 }
  651             }
  652             break;
  653           case FUNso:
  654             {
  655                 const unsigned char *char_start;
  656                 size_t char_len;
  657                 SUSPEND_OBUF(14);
  658                 if (tr->max_output <= out_stop - out_p) {
  659                     char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  660                     out_p += tr->func_so(TRANSCODING_STATE(tc),
  661                         char_start, (size_t)char_len,
  662                         out_p, out_stop - out_p);
  663                 }
  664                 else {
  665                     char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  666                     writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
  667                         char_start, (size_t)char_len,
  668                         TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  669                     writebuf_off = 0;
  670                     while (writebuf_off < writebuf_len) {
  671                         SUSPEND_OBUF(22);
  672                         *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  673                     }
  674                 }
  675                 break;
  676             }
  677       case FUNsio:
  678             {
  679                 const unsigned char *char_start;
  680                 size_t char_len;
  681                 SUSPEND_OBUF(33);
  682                 if (tr->max_output <= out_stop - out_p) {
  683                     char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  684                     out_p += tr->func_sio(TRANSCODING_STATE(tc),
  685                         char_start, (size_t)char_len, next_info,
  686                         out_p, out_stop - out_p);
  687                 }
  688                 else {
  689                     char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
  690                     writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
  691                         char_start, (size_t)char_len, next_info,
  692                         TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  693                     writebuf_off = 0;
  694                     while (writebuf_off < writebuf_len) {
  695                         SUSPEND_OBUF(34);
  696                         *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  697                     }
  698                 }
  699                 break;
  700             }
                /*
                 * Invalid byte.  If at most one input unit (unitlen bytes) of
                 * this character has been seen, the reported sequence is
                 * extended to a whole unit, or to the end of the input when
                 * less is available; with ECONV_PARTIAL_INPUT the code first
                 * waits until a whole unit is present.  Otherwise the bytes
                 * are split at the last unit boundary before the offending
                 * byte, and the bytes behind that boundary are handed back
                 * for re-reading through readagain_len.
                 */
  701           case INVALID:
  702             if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
  703                 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
  704                     SUSPEND_AFTER_OUTPUT(26);
  705                 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
  706                     in_p = in_stop;
  707                     SUSPEND(econv_source_buffer_empty, 8);
  708                 }
  709                 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
  710                     in_p = in_stop;
  711                 }
  712                 else {
  713                     in_p = inchar_start + (unitlen - tc->recognized_len);
  714                 }
  715             }
  716             else {
  717                 ssize_t invalid_len; /* including the last byte which causes invalid */
  718                 ssize_t discard_len;
  719                 invalid_len = tc->recognized_len + (in_p - inchar_start);
  720                 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
  721                 readagain_len = invalid_len - discard_len;
  722             }
  723             goto invalid;
  724           case UNDEF:
  725             goto undef;
  726           default:
  727             rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
  728         }
  729         continue;
  730
              /* Error exits: report the condition; a later call resumes with the next character. */
  731       invalid:
  732         SUSPEND(econv_invalid_byte_sequence, 1);
  733         continue;
  734
  735       incomplete:
  736         SUSPEND(econv_incomplete_input, 27);
  737         continue;
  738
  739       undef:
  740         SUSPEND(econv_undefined_conversion, 2);
  741         continue;
  742     }
  743
          /* Reached only through the `break` at end of input without ECONV_PARTIAL_INPUT: the transcoder's finish_func, if any, may emit trailing output. */
  744     /* cleanup */
  745     if (tr->finish_func) {
  746         SUSPEND_OBUF(4);
  747         if (tr->max_output <= out_stop - out_p) {
  748             out_p += tr->finish_func(TRANSCODING_STATE(tc),
  749                 out_p, out_stop - out_p);
  750         }
  751         else {
  752             writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
  753                 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
  754             writebuf_off = 0;
  755             while (writebuf_off < writebuf_len) {
  756                 SUSPEND_OBUF(23);
  757                 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
  758             }
  759         }
  760     }
          /* Every further call resumes inside this loop and reports econv_finished again. */
  761     while (1)
  762         SUSPEND(econv_finished, 6);
  763 #undef SUSPEND
  764 #undef next_table
  765 #undef next_info
  766 #undef next_byte
  767 #undef writebuf_len
  768 #undef writebuf_off
  769 }
 770
 771 static rb_econv_result_t
 772 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
 773                       const unsigned char *in_stop, unsigned char *out_stop,
 774                       rb_transcoding *tc,
 775                       const int opt)
 776 {
 777     if (tc->readagain_len) {
 778         unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
 779         const unsigned char *readagain_pos = readagain_buf;
 780         const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
 781         rb_econv_result_t res;
 782
 783         MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
 784                unsigned char, tc->readagain_len);
 785         tc->readagain_len = 0;
 786         res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
 787         if (res != econv_source_buffer_empty) {
 788             MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
 789                    readagain_pos, unsigned char, readagain_stop - readagain_pos);
 790             tc->readagain_len += readagain_stop - readagain_pos;
 791             return res;
 792         }
 793     }
 794     return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
 795 }
 796
 797 static rb_transcoding *
 798 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
 799 {
 800     rb_transcoding *tc;
 801
 802     tc = ALLOC(rb_transcoding);
 803     tc->transcoder = tr;
 804     tc->flags = flags;
 805     if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
 806         tc->state.ptr = xmalloc(tr->state_size);
 807     if (tr->state_init_func) {
 808         (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
 809     }
 810     tc->resume_position = 0;
 811     tc->recognized_len = 0;
 812     tc->readagain_len = 0;
 813     tc->writebuf_len = 0;
 814     tc->writebuf_off = 0;
 815     if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
 816         tc->readbuf.ptr = xmalloc(tr->max_input);
 817     }
 818     if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
 819         tc->writebuf.ptr = xmalloc(tr->max_output);
 820     }
 821     return tc;
 822 }
 823
 824 static rb_econv_result_t
 825 rb_transcoding_convert(rb_transcoding *tc,
 826   const unsigned char **input_ptr, const unsigned char *input_stop,
 827   unsigned char **output_ptr, unsigned char *output_stop,
 828   int flags)
 829 {
 830     return transcode_restartable(
 831                 input_ptr, output_ptr,
 832                 input_stop, output_stop,
 833                 tc, flags);
 834 }
 835
 836 static void
 837 rb_transcoding_close(rb_transcoding *tc)
 838 {
 839     const rb_transcoder *tr = tc->transcoder;
 840     if (tr->state_fini_func) {
 841         (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
 842     }
 843     if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
 844         xfree(tc->state.ptr);
 845     if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
 846         xfree(tc->readbuf.ptr);
 847     if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
 848         xfree(tc->writebuf.ptr);
 849     xfree(tc);
 850 }
 851
 852 static size_t
 853 rb_transcoding_memsize(rb_transcoding *tc)
 854 {
 855     size_t size = sizeof(rb_transcoding);
 856     const rb_transcoder *tr = tc->transcoder;
 857
 858     if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
 859         size += tr->state_size;
 860     }
 861     if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
 862         size += tr->max_input;
 863     }
 864     if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
 865         size += tr->max_output;
 866     }
 867     return size;
 868 }
 869
 870 static rb_econv_t *
 871 rb_econv_alloc(int n_hint)
 872 {
 873     rb_econv_t *ec;
 874
 875     if (n_hint <= 0)
 876         n_hint = 1;
 877
 878     ec = ALLOC(rb_econv_t);
 879     ec->flags = 0;
 880     ec->source_encoding_name = NULL;
 881     ec->destination_encoding_name = NULL;
 882     ec->started = 0;
 883     ec->replacement_str = NULL;
 884     ec->replacement_len = 0;
 885     ec->replacement_enc = NULL;
 886     ec->replacement_allocated = 0;
 887     ec->in_buf_start = NULL;
 888     ec->in_data_start = NULL;
 889     ec->in_data_end = NULL;
 890     ec->in_buf_end = NULL;
 891     ec->num_allocated = n_hint;
 892     ec->num_trans = 0;
 893     ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
 894     ec->num_finished = 0;
 895     ec->last_tc = NULL;
 896     ec->last_error.result = econv_source_buffer_empty;
 897     ec->last_error.error_tc = NULL;
 898     ec->last_error.source_encoding = NULL;
 899     ec->last_error.destination_encoding = NULL;
 900     ec->last_error.error_bytes_start = NULL;
 901     ec->last_error.error_bytes_len = 0;
 902     ec->last_error.readagain_len = 0;
 903     ec->source_encoding = NULL;
 904     ec->destination_encoding = NULL;
 905     return ec;
 906 }
 907
 908 static int
 909 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
 910 {
 911     int n, j;
 912     int bufsize = 4096;
 913     unsigned char *p;
 914
 915     if (ec->num_trans == ec->num_allocated) {
 916         n = ec->num_allocated * 2;
 917         REALLOC_N(ec->elems, rb_econv_elem_t, n);
 918         ec->num_allocated = n;
 919     }
 920
 921     p = xmalloc(bufsize);
 922
 923     MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
 924
 925     ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
 926     ec->elems[i].out_buf_start = p;
 927     ec->elems[i].out_buf_end = p + bufsize;
 928     ec->elems[i].out_data_start = p;
 929     ec->elems[i].out_data_end = p;
 930     ec->elems[i].last_result = econv_source_buffer_empty;
 931
 932     ec->num_trans++;
 933
 934     if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
 935         for (j = ec->num_trans-1; i <= j; j--) {
 936             rb_transcoding *tc = ec->elems[j].tc;
 937             const rb_transcoder *tr2 = tc->transcoder;
 938             if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
 939                 ec->last_tc = tc;
 940                 break;
 941             }
 942         }
 943
 944     return 0;
 945 }
 946
 947 static rb_econv_t *
 948 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
 949 {
 950     rb_econv_t *ec;
 951     int i, ret;
 952
 953     for (i = 0; i < n; i++) {
 954         const rb_transcoder *tr;
 955         tr = load_transcoder_entry(entries[i]);
 956         if (!tr)
 957             return NULL;
 958     }
 959
 960     ec = rb_econv_alloc(n);
 961
 962     for (i = 0; i < n; i++) {
 963         const rb_transcoder *tr = load_transcoder_entry(entries[i]);
 964         ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
 965         if (ret == -1) {
 966             rb_econv_close(ec);
 967             return NULL;
 968         }
 969     }
 970
 971     return ec;
 972 }
 973
/*
 * Argument block passed to trans_open_i() via transcode_search_path().
 *
 * entries:        array of transcoder entries, allocated by trans_open_i()
 *                 on its first call and indexed by depth.
 * num_additional: number of extra slots to reserve beyond depth+1 when the
 *                 array is allocated.
 */
struct trans_open_t {
    transcoder_entry_t **entries;
    int num_additional;
};
 978
 979 static void
 980 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
 981 {
 982     struct trans_open_t *toarg = arg;
 983
 984     if (!toarg->entries) {
 985         toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
 986     }
 987     toarg->entries[depth] = get_transcoder_entry(sname, dname);
 988 }
 989
 990 static rb_econv_t *
 991 rb_econv_open0(const char *sname, const char *dname, int ecflags)
 992 {
 993     transcoder_entry_t **entries = NULL;
 994     int num_trans;
 995     rb_econv_t *ec;
 996
 997     /* Just check if sname and dname are defined */
 998     /* (This check is needed?) */
 999     if (*sname) rb_enc_find_index(sname);
1000     if (*dname) rb_enc_find_index(dname);
1001
1002     if (*sname == '\0' && *dname == '\0') {
1003         num_trans = 0;
1004         entries = NULL;
1005         sname = dname = "";
1006     }
1007     else {
1008         struct trans_open_t toarg;
1009         toarg.entries = NULL;
1010         toarg.num_additional = 0;
1011         num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1012         entries = toarg.entries;
1013         if (num_trans < 0) {
1014             xfree(entries);
1015             return NULL;
1016         }
1017     }
1018
1019     ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1020     xfree(entries);
1021     if (!ec)
1022         return NULL;
1023
1024     ec->flags = ecflags;
1025     ec->source_encoding_name = sname;
1026     ec->destination_encoding_name = dname;
1027
1028     return ec;
1029 }
1030
/* Capacity of the decorator-name array that rb_econv_open() hands to decorator_names(). */
#define MAX_ECFLAGS_DECORATORS 32
1032
1033 static int
1034 decorator_names(int ecflags, const char **decorators_ret)
1035 {
1036     int num_decorators;
1037
1038     switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1039       case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
1040       case ECONV_CRLF_NEWLINE_DECORATOR:
1041       case ECONV_CR_NEWLINE_DECORATOR:
1042       case 0:
1043         break;
1044       default:
1045         return -1;
1046     }
1047
1048     if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1049         (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
1050         return -1;
1051
1052     num_decorators = 0;
1053
1054     if (ecflags & ECONV_XML_TEXT_DECORATOR)
1055         decorators_ret[num_decorators++] = "xml_text_escape";
1056     if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1057         decorators_ret[num_decorators++] = "xml_attr_content_escape";
1058     if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1059         decorators_ret[num_decorators++] = "xml_attr_quote";
1060
1061     if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1062         decorators_ret[num_decorators++] = "crlf_newline";
1063     if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1064         decorators_ret[num_decorators++] = "cr_newline";
1065     if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
1066         decorators_ret[num_decorators++] = "universal_newline";
1067
1068     return num_decorators;
1069 }
1070
1071 rb_econv_t *
1072 rb_econv_open(const char *sname, const char *dname, int ecflags)
1073 {
1074     rb_econv_t *ec;
1075     int num_decorators;
1076     const char *decorators[MAX_ECFLAGS_DECORATORS];
1077     int i;
1078
1079     num_decorators = decorator_names(ecflags, decorators);
1080     if (num_decorators == -1)
1081         return NULL;
1082
1083     ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1084     if (!ec)
1085         return NULL;
1086
1087     for (i = 0; i < num_decorators; i++)
1088         if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1089             rb_econv_close(ec);
1090             return NULL;
1091         }
1092
1093     ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1094
1095     return ec;
1096 }
1097
/*
 * Run the transcoders ec->elems[start .. num_trans-1] over and over until
 * a complete pass moves no input or output pointer, or until one of them
 * returns a result that has to be reported.
 *
 * Element 0 reads from the caller's input; every other element reads from
 * the previous element's output buffer.  The last element writes to the
 * caller's output; every other element writes to its own output buffer.
 *
 * Returns the index of the element that returned
 * econv_invalid_byte_sequence, econv_incomplete_input,
 * econv_undefined_conversion or econv_after_output, or -1 when the sweep
 * ended because no progress was made.
 */
static int
trans_sweep(rb_econv_t *ec,
    const unsigned char **input_ptr, const unsigned char *input_stop,
    unsigned char **output_ptr, unsigned char *output_stop,
    int flags,
    int start)
{
    int try;
    int i, f;

    const unsigned char **ipp, *is, *iold;
    unsigned char **opp, *os, *oold;
    rb_econv_result_t res;

    try = 1;
    while (try) {
        try = 0;
        for (i = start; i < ec->num_trans; i++) {
            rb_econv_elem_t *te = &ec->elems[i];

            /* select the input side for this element */
            if (i == 0) {
                ipp = input_ptr;
                is = input_stop;
            }
            else {
                rb_econv_elem_t *prev_te = &ec->elems[i-1];
                ipp = (const unsigned char **)&prev_te->out_data_start;
                is = prev_te->out_data_end;
            }

            /* select the output side for this element */
            if (i == ec->num_trans-1) {
                opp = output_ptr;
                os = output_stop;
            }
            else {
                /* move pending data to the front of the buffer to maximize free space */
                if (te->out_buf_start != te->out_data_start) {
                    ssize_t len = te->out_data_end - te->out_data_start;
                    ssize_t off = te->out_data_start - te->out_buf_start;
                    MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
                    te->out_data_start = te->out_buf_start;
                    te->out_data_end -= off;
                }
                opp = &te->out_data_end;
                os = te->out_buf_end;
            }

            f = flags;
            /* unless exactly the i elements before this one have finished,
             * more input may still arrive */
            if (ec->num_finished != i)
                f |= ECONV_PARTIAL_INPUT;
            /* ECONV_AFTER_OUTPUT is passed to element 0 once (f was copied
             * above); later passes skip element 0 and no longer carry the flag */
            if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
                start = 1;
                flags &= ~ECONV_AFTER_OUTPUT;
            }
            if (i != 0)
                f &= ~ECONV_AFTER_OUTPUT;
            iold = *ipp;
            oold = *opp;
            te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
            /* any pointer movement counts as progress and schedules another pass */
            if (iold != *ipp || oold != *opp)
                try = 1;

            switch (res) {
              case econv_invalid_byte_sequence:
              case econv_incomplete_input:
              case econv_undefined_conversion:
              case econv_after_output:
                return i;

              case econv_destination_buffer_full:
              case econv_source_buffer_empty:
                break;

              case econv_finished:
                ec->num_finished = i+1;
                break;
            }
        }
    }
    return -1;
}
1178
/*
 * Drive the transcoder elements of `ec` and report a single result.
 *
 * A NULL input_ptr or output_ptr is replaced by an empty dummy buffer.
 * The sweep starts just after the last element whose previous result was
 * an error, econv_after_output or econv_finished, or at element 0 when
 * there is none.  trans_sweep() is then repeated until it returns -1 or
 * the index of the last element.
 *
 * The value returned is the last_result of the rearmost element whose
 * result is not econv_source_buffer_empty; *result_position_ptr (when not
 * NULL) receives that element's index, or -1 when every element reports
 * econv_source_buffer_empty.
 *
 * NOTE: ec->elems[0] is read unconditionally; the caller in this file,
 * rb_econv_convert0(), handles num_trans == 0 before calling here.
 */
static rb_econv_result_t
rb_trans_conv(rb_econv_t *ec,
    const unsigned char **input_ptr, const unsigned char *input_stop,
    unsigned char **output_ptr, unsigned char *output_stop,
    int flags,
    int *result_position_ptr)
{
    int i;
    int needreport_index;
    int sweep_start;

    unsigned char empty_buf;
    unsigned char *empty_ptr = &empty_buf;

    if (!input_ptr) {
        input_ptr = (const unsigned char **)&empty_ptr;
        input_stop = empty_ptr;
    }

    if (!output_ptr) {
        output_ptr = &empty_ptr;
        output_stop = empty_ptr;
    }

    /* a pending after_output on the first element is cleared before the scan */
    if (ec->elems[0].last_result == econv_after_output)
        ec->elems[0].last_result = econv_source_buffer_empty;

    /* scan from the last element back to the first for one the sweep should start after */
    for (i = ec->num_trans-1; 0 <= i; i--) {
        switch (ec->elems[i].last_result) {
          case econv_invalid_byte_sequence:
          case econv_incomplete_input:
          case econv_undefined_conversion:
          case econv_after_output:
          case econv_finished:
            sweep_start = i+1;
            goto found_needreport;

          case econv_destination_buffer_full:
          case econv_source_buffer_empty:
            break;

          default:
            rb_bug("unexpected transcode last result");
        }
    }

    /* /^[sd]+$/ is confirmed.  but actually /^s*d*$/. */
    /* (s = econv_source_buffer_empty, d = econv_destination_buffer_full,
     *  one letter per element) */

    if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
        (flags & ECONV_AFTER_OUTPUT)) {
        rb_econv_result_t res;

        /* first convert with no new input and without ECONV_AFTER_OUTPUT;
         * a source_buffer_empty result is reported as after_output */
        res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
                (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
                result_position_ptr);

        if (res == econv_source_buffer_empty)
            return econv_after_output;
        return res;
    }

    sweep_start = 0;

  found_needreport:

    /* keep sweeping from just after the reporting element until nothing
     * needs reporting or the last element is the one reporting */
    do {
        needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
        sweep_start = needreport_index + 1;
    } while (needreport_index != -1 && needreport_index != ec->num_trans-1);

    for (i = ec->num_trans-1; 0 <= i; i--) {
        if (ec->elems[i].last_result != econv_source_buffer_empty) {
            rb_econv_result_t res = ec->elems[i].last_result;
            /* errors and after_output are reported once, then reset */
            if (res == econv_invalid_byte_sequence ||
                res == econv_incomplete_input ||
                res == econv_undefined_conversion ||
                res == econv_after_output) {
                ec->elems[i].last_result = econv_source_buffer_empty;
            }
            if (result_position_ptr)
                *result_position_ptr = i;
            return res;
        }
    }
    if (result_position_ptr)
        *result_position_ptr = -1;
    return econv_source_buffer_empty;
}
1267
/*
 * Perform one conversion call on `ec` without any error-recovery policy.
 *
 * ec->last_error is cleared on entry and filled in on exit: the result is
 * always recorded, and for econv_invalid_byte_sequence,
 * econv_incomplete_input and econv_undefined_conversion the failing
 * transcoding, its encodings and the error byte positions are recorded too.
 *
 * With no transcoders the function copies bytes straight through: data
 * pending in ec->in_buf first, then the caller's input.
 * With transcoders it first flushes data pending in the last element's
 * output buffer, then converts data pending in ec->in_buf, and finally
 * converts the caller's input.
 */
static rb_econv_result_t
rb_econv_convert0(rb_econv_t *ec,
    const unsigned char **input_ptr, const unsigned char *input_stop,
    unsigned char **output_ptr, unsigned char *output_stop,
    int flags)
{
    rb_econv_result_t res;
    int result_position;
    int has_output = 0;

    memset(&ec->last_error, 0, sizeof(ec->last_error));

    if (ec->num_trans == 0) {
        size_t len;
        /* flush bytes waiting in ec->in_buf before touching the caller's input */
        if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
            if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
                len = output_stop - *output_ptr;
                memcpy(*output_ptr, ec->in_data_start, len);
                *output_ptr = output_stop;
                ec->in_data_start += len;
                res = econv_destination_buffer_full;
                goto gotresult;
            }
            len = ec->in_data_end - ec->in_data_start;
            memcpy(*output_ptr, ec->in_data_start, len);
            *output_ptr += len;
            ec->in_data_start = ec->in_data_end = ec->in_buf_start;
            if (flags & ECONV_AFTER_OUTPUT) {
                res = econv_after_output;
                goto gotresult;
            }
        }
        /* copy as much of the input as fits in the output */
        if (output_stop - *output_ptr < input_stop - *input_ptr) {
            len = output_stop - *output_ptr;
        }
        else {
            len = input_stop - *input_ptr;
        }
        /* with ECONV_AFTER_OUTPUT only a single byte is copied per call */
        if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
            *(*output_ptr)++ = *(*input_ptr)++;
            res = econv_after_output;
            goto gotresult;
        }
        memcpy(*output_ptr, *input_ptr, len);
        *output_ptr += len;
        *input_ptr += len;
        if (*input_ptr != input_stop)
            res = econv_destination_buffer_full;
        else if (flags & ECONV_PARTIAL_INPUT)
            res = econv_source_buffer_empty;
        else
            res = econv_finished;
        goto gotresult;
    }

    /* flush data left in the last element's output buffer */
    if (ec->elems[ec->num_trans-1].out_data_start) {
        unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
        unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
        if (data_start != data_end) {
            size_t len;
            if (output_stop - *output_ptr < data_end - data_start) {
                len = output_stop - *output_ptr;
                memcpy(*output_ptr, data_start, len);
                *output_ptr = output_stop;
                ec->elems[ec->num_trans-1].out_data_start += len;
                res = econv_destination_buffer_full;
                goto gotresult;
            }
            len = data_end - data_start;
            memcpy(*output_ptr, data_start, len);
            *output_ptr += len;
            ec->elems[ec->num_trans-1].out_data_start =
                ec->elems[ec->num_trans-1].out_data_end =
                ec->elems[ec->num_trans-1].out_buf_start;
            has_output = 1;
        }
    }

    /* convert data pending in ec->in_buf as partial input before the caller's input */
    if (ec->in_buf_start &&
        ec->in_data_start != ec->in_data_end) {
        res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
                (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
        if (res != econv_source_buffer_empty)
            goto gotresult;
    }

    if (has_output &&
        (flags & ECONV_AFTER_OUTPUT) &&
        *input_ptr != input_stop) {
        /* output was already produced above: convert with an empty input
         * range and report source_buffer_empty as after_output */
        input_stop = *input_ptr;
        res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
        if (res == econv_source_buffer_empty)
            res = econv_after_output;
    }
    else if ((flags & ECONV_AFTER_OUTPUT) ||
        ec->num_trans == 1) {
        res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
    }
    else {
        /* several transcoders and the caller did not ask for
         * ECONV_AFTER_OUTPUT: request it internally and repeat for as long
         * as the result is after_output */
        flags |= ECONV_AFTER_OUTPUT;
        do {
            res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
        } while (res == econv_after_output);
    }

  gotresult:
    ec->last_error.result = res;
    /* result_position is only read for error results; every path that
     * produces one goes through rb_trans_conv(), which sets it */
    if (res == econv_invalid_byte_sequence ||
        res == econv_incomplete_input ||
        res == econv_undefined_conversion) {
        rb_transcoding *error_tc = ec->elems[result_position].tc;
        ec->last_error.error_tc = error_tc;
        ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
        ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
        ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
        ec->last_error.error_bytes_len = error_tc->recognized_len;
        ec->last_error.readagain_len = error_tc->readagain_len;
    }

    return res;
}
1389
1390 static int output_replacement_character(rb_econv_t *ec);
1391
1392 static int
1393 output_hex_charref(rb_econv_t *ec)
1394 {
1395     int ret;
1396     unsigned char utfbuf[1024];
1397     const unsigned char *utf;
1398     size_t utf_len;
1399     int utf_allocated = 0;
1400     char charef_buf[16];
1401     const unsigned char *p;
1402
1403     if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1404         utf = ec->last_error.error_bytes_start;
1405         utf_len = ec->last_error.error_bytes_len;
1406     }
1407     else {
1408         utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1409                 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1410                 utfbuf, sizeof(utfbuf),
1411                 &utf_len);
1412         if (!utf)
1413             return -1;
1414         if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1415             utf_allocated = 1;
1416     }
1417
1418     if (utf_len % 4 != 0)
1419         goto fail;
1420
1421     p = utf;
1422     while (4 <= utf_len) {
1423         unsigned int u = 0;
1424         u += p[0] << 24;
1425         u += p[1] << 16;
1426         u += p[2] << 8;
1427         u += p[3];
1428         snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1429
1430         ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1431         if (ret == -1)
1432             goto fail;
1433
1434         p += 4;
1435         utf_len -= 4;
1436     }
1437
1438     if (utf_allocated)
1439         xfree((void *)utf);
1440     return 0;
1441
1442   fail:
1443     if (utf_allocated)
1444         xfree((void *)utf);
1445     return -1;
1446 }
1447
1448 rb_econv_result_t
1449 rb_econv_convert(rb_econv_t *ec,
1450     const unsigned char **input_ptr, const unsigned char *input_stop,
1451     unsigned char **output_ptr, unsigned char *output_stop,
1452     int flags)
1453 {
1454     rb_econv_result_t ret;
1455
1456     unsigned char empty_buf;
1457     unsigned char *empty_ptr = &empty_buf;
1458
1459     ec->started = 1;
1460
1461     if (!input_ptr) {
1462         input_ptr = (const unsigned char **)&empty_ptr;
1463         input_stop = empty_ptr;
1464     }
1465
1466     if (!output_ptr) {
1467         output_ptr = &empty_ptr;
1468         output_stop = empty_ptr;
1469     }
1470
1471   resume:
1472     ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1473
1474     if (ret == econv_invalid_byte_sequence ||
1475         ret == econv_incomplete_input) {
1476         /* deal with invalid byte sequence */
1477         /* todo: add more alternative behaviors */
1478         switch (ec->flags & ECONV_INVALID_MASK) {
1479           case ECONV_INVALID_REPLACE:
1480             if (output_replacement_character(ec) == 0)
1481                 goto resume;
1482         }
1483     }
1484
1485     if (ret == econv_undefined_conversion) {
1486         /* valid character in source encoding
1487          * but no related character(s) in destination encoding */
1488         /* todo: add more alternative behaviors */
1489         switch (ec->flags & ECONV_UNDEF_MASK) {
1490           case ECONV_UNDEF_REPLACE:
1491             if (output_replacement_character(ec) == 0)
1492                 goto resume;
1493             break;
1494
1495           case ECONV_UNDEF_HEX_CHARREF:
1496             if (output_hex_charref(ec) == 0)
1497                 goto resume;
1498             break;
1499         }
1500     }
1501
1502     return ret;
1503 }
1504
1505 const char *
1506 rb_econv_encoding_to_insert_output(rb_econv_t *ec)
1507 {
1508     rb_transcoding *tc = ec->last_tc;
1509     const rb_transcoder *tr;
1510
1511     if (tc == NULL)
1512         return "";
1513
1514     tr = tc->transcoder;
1515
1516     if (tr->asciicompat_type == asciicompat_encoder)
1517         return tr->src_encoding;
1518     return tr->dst_encoding;
1519 }
1520
1521 static unsigned char *
1522 allocate_converted_string(const char *sname, const char *dname,
1523         const unsigned char *str, size_t len,
1524         unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1525         size_t *dst_len_ptr)
1526 {
1527     unsigned char *dst_str;
1528     size_t dst_len;
1529     size_t dst_bufsize;
1530
1531     rb_econv_t *ec;
1532     rb_econv_result_t res;
1533
1534     const unsigned char *sp;
1535     unsigned char *dp;
1536
1537     if (caller_dst_buf)
1538         dst_bufsize = caller_dst_bufsize;
1539     else if (len == 0)
1540         dst_bufsize = 1;
1541     else
1542         dst_bufsize = len;
1543
1544     ec = rb_econv_open(sname, dname, 0);
1545     if (ec == NULL)
1546         return NULL;
1547     if (caller_dst_buf)
1548         dst_str = caller_dst_buf;
1549     else
1550         dst_str = xmalloc(dst_bufsize);
1551     dst_len = 0;
1552     sp = str;
1553     dp = dst_str+dst_len;
1554     res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1555     dst_len = dp - dst_str;
1556     while (res == econv_destination_buffer_full) {
1557         if (SIZE_MAX/2 < dst_bufsize) {
1558             goto fail;
1559         }
1560         dst_bufsize *= 2;
1561         if (dst_str == caller_dst_buf) {
1562             unsigned char *tmp;
1563             tmp = xmalloc(dst_bufsize);
1564             memcpy(tmp, dst_str, dst_bufsize/2);
1565             dst_str = tmp;
1566         }
1567         else {
1568             dst_str = xrealloc(dst_str, dst_bufsize);
1569         }
1570         dp = dst_str+dst_len;
1571         res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1572         dst_len = dp - dst_str;
1573     }
1574     if (res != econv_finished) {
1575         goto fail;
1576     }
1577     rb_econv_close(ec);
1578     *dst_len_ptr = dst_len;
1579     return dst_str;
1580
1581   fail:
1582     if (dst_str != caller_dst_buf)
1583         xfree(dst_str);
1584     rb_econv_close(ec);
1585     return NULL;
1586 }
1587
1588 /* result: 0:success -1:failure */
1589 int
1590 rb_econv_insert_output(rb_econv_t *ec,
1591     const unsigned char *str, size_t len, const char *str_encoding)
1592 {
1593     const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1594     unsigned char insert_buf[4096];
1595     const unsigned char *insert_str = NULL;
1596     size_t insert_len;
1597
1598     int last_trans_index;
1599     rb_transcoding *tc;
1600
1601     unsigned char **buf_start_p;
1602     unsigned char **data_start_p;
1603     unsigned char **data_end_p;
1604     unsigned char **buf_end_p;
1605
1606     size_t need;
1607
1608     ec->started = 1;
1609
1610     if (len == 0)
1611         return 0;
1612
1613     if (encoding_equal(insert_encoding, str_encoding)) {
1614         insert_str = str;
1615         insert_len = len;
1616     }
1617     else {
1618         insert_str = allocate_converted_string(str_encoding, insert_encoding,
1619                 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1620         if (insert_str == NULL)
1621             return -1;
1622     }
1623
1624     need = insert_len;
1625
1626     last_trans_index = ec->num_trans-1;
1627     if (ec->num_trans == 0) {
1628         tc = NULL;
1629         buf_start_p = &ec->in_buf_start;
1630         data_start_p = &ec->in_data_start;
1631         data_end_p = &ec->in_data_end;
1632         buf_end_p = &ec->in_buf_end;
1633     }
1634     else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1635         tc = ec->elems[last_trans_index].tc;
1636         need += tc->readagain_len;
1637         if (need < insert_len)
1638             goto fail;
1639         if (last_trans_index == 0) {
1640             buf_start_p = &ec->in_buf_start;
1641             data_start_p = &ec->in_data_start;
1642             data_end_p = &ec->in_data_end;
1643             buf_end_p = &ec->in_buf_end;
1644         }
1645         else {
1646             rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1647             buf_start_p = &ee->out_buf_start;
1648             data_start_p = &ee->out_data_start;
1649             data_end_p = &ee->out_data_end;
1650             buf_end_p = &ee->out_buf_end;
1651         }
1652     }
1653     else {
1654         rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1655         buf_start_p = &ee->out_buf_start;
1656         data_start_p = &ee->out_data_start;
1657         data_end_p = &ee->out_data_end;
1658         buf_end_p = &ee->out_buf_end;
1659         tc = ec->elems[last_trans_index].tc;
1660     }
1661
1662     if (*buf_start_p == NULL) {
1663         unsigned char *buf = xmalloc(need);
1664         *buf_start_p = buf;
1665         *data_start_p = buf;
1666         *data_end_p = buf;
1667         *buf_end_p = buf+need;
1668     }
1669     else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1670         MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1671         *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1672         *data_start_p = *buf_start_p;
1673         if ((size_t)(*buf_end_p - *data_end_p) < need) {
1674             unsigned char *buf;
1675             size_t s = (*data_end_p - *buf_start_p) + need;
1676             if (s < need)
1677                 goto fail;
1678             buf = xrealloc(*buf_start_p, s);
1679             *data_start_p = buf;
1680             *data_end_p = buf + (*data_end_p - *buf_start_p);
1681             *buf_start_p = buf;
1682             *buf_end_p = buf + s;
1683         }
1684     }
1685
1686     memcpy(*data_end_p, insert_str, insert_len);
1687     *data_end_p += insert_len;
1688     if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1689         memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1690         *data_end_p += tc->readagain_len;
1691         tc->readagain_len = 0;
1692     }
1693
1694     if (insert_str != str && insert_str != insert_buf)
1695         xfree((void*)insert_str);
1696     return 0;
1697
1698   fail:
1699     if (insert_str != str && insert_str != insert_buf)
1700         xfree((void*)insert_str);
1701     return -1;
1702 }
1703
1704 void
1705 rb_econv_close(rb_econv_t *ec)
1706 {
1707     int i;
1708
1709     if (ec->replacement_allocated) {
1710         xfree((void *)ec->replacement_str);
1711     }
1712     for (i = 0; i < ec->num_trans; i++) {
1713         rb_transcoding_close(ec->elems[i].tc);
1714         if (ec->elems[i].out_buf_start)
1715             xfree(ec->elems[i].out_buf_start);
1716     }
1717     xfree(ec->in_buf_start);
1718     xfree(ec->elems);
1719     xfree(ec);
1720 }
1721
1722 size_t
1723 rb_econv_memsize(rb_econv_t *ec)
1724 {
1725     size_t size = sizeof(rb_econv_t);
1726     int i;
1727
1728     if (ec->replacement_allocated) {
1729         size += ec->replacement_len;
1730     }
1731     for (i = 0; i < ec->num_trans; i++) {
1732         size += rb_transcoding_memsize(ec->elems[i].tc);
1733
1734         if (ec->elems[i].out_buf_start) {
1735             size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1736         }
1737     }
1738     size += ec->in_buf_end - ec->in_buf_start;
1739     size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1740
1741     return size;
1742 }
1743
1744 int
1745 rb_econv_putbackable(rb_econv_t *ec)
1746 {
1747     if (ec->num_trans == 0)
1748         return 0;
1749 #if SIZEOF_SIZE_T > SIZEOF_INT
1750     if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1751 #endif
1752     return (int)ec->elems[0].tc->readagain_len;
1753 }
1754
1755 void
1756 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1757 {
1758     rb_transcoding *tc;
1759     if (ec->num_trans == 0 || n == 0)
1760         return;
1761     tc = ec->elems[0].tc;
1762     memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1763     tc->readagain_len -= n;
1764 }
1765
/*
 * Argument block for asciicompat_encoding_i().
 *
 * ascii_compat_name:   result slot; set by asciicompat_encoding_i() to the
 *                      destination encoding of a matching decoder, NULL
 *                      until then.
 * ascii_incompat_name: the encoding name being looked up, filled in by
 *                      rb_econv_asciicompat_encoding().
 */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;
    const char *ascii_incompat_name;
};
1770
1771 static int
1772 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1773 {
1774     struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1775     transcoder_entry_t *entry = (transcoder_entry_t *)val;
1776     const rb_transcoder *tr;
1777
1778     if (DECORATOR_P(entry->sname, entry->dname))
1779         return ST_CONTINUE;
1780     tr = load_transcoder_entry(entry);
1781     if (tr && tr->asciicompat_type == asciicompat_decoder) {
1782         data->ascii_compat_name = tr->dst_encoding;
1783         return ST_STOP;
1784     }
1785     return ST_CONTINUE;
1786 }
1787
1788 const char *
1789 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1790 {
1791     st_data_t v;
1792     st_table *table2;
1793     struct asciicompat_encoding_t data;
1794
1795     if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1796         return NULL;
1797     table2 = (st_table *)v;
1798
1799     /*
1800      * Assumption:
1801      * There is at most one transcoder for
1802      * converting from ASCII incompatible encoding.
1803      *
1804      * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1805      */
1806     if (table2->num_entries != 1)
1807         return NULL;
1808
1809     data.ascii_incompat_name = ascii_incompat_name;
1810     data.ascii_compat_name = NULL;
1811     st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1812     return data.ascii_compat_name;
1813 }
1814
/*
 * Convert `len` bytes at `ss` with `ec` and append the result to `dst`.
 *
 * When `dst` is nil a new string buffer is created and, if the converter
 * has a destination encoding, associated with it.  The conversion is
 * repeated, growing `dst` each round, for as long as the converter
 * reports econv_destination_buffer_full.
 *
 * Raises ArgumentError ("too long string") when the capacity that would
 * be needed exceeds LONG_MAX.  rb_econv_check_error() is called after
 * every round (presumably raising on conversion errors; its definition is
 * outside this section).
 *
 * Returns `dst` (the newly created string when nil was passed).
 */
VALUE
rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
{
    unsigned const char *sp, *se;
    unsigned char *ds, *dp, *de;
    rb_econv_result_t res;
    int max_output;

    if (NIL_P(dst)) {
        dst = rb_str_buf_new(len);
        if (ec->destination_encoding)
            rb_enc_associate(dst, ec->destination_encoding);
    }

    /* extra room reserved on top of len: the last transcoder's max_output, or 1 */
    if (ec->last_tc)
        max_output = ec->last_tc->transcoder->max_output;
    else
        max_output = 1;

    do {
        long dlen = RSTRING_LEN(dst);
        /* make sure at least len + max_output bytes are free past the current length */
        if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
            unsigned long new_capa = (unsigned long)dlen + len + max_output;
            if (LONG_MAX < new_capa)
                rb_raise(rb_eArgError, "too long string");
            rb_str_resize(dst, new_capa);
            rb_str_set_len(dst, dlen);
        }
        /* pointers are recomputed every round because the resize may move the string */
        sp = (const unsigned char *)ss;
        se = sp + len;
        ds = (unsigned char *)RSTRING_PTR(dst);
        de = ds + rb_str_capacity(dst);
        dp = ds += dlen;
        res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
        /* advance past the input consumed in this round */
        len -= (const char *)sp - ss;
        ss  = (const char *)sp;
        rb_str_set_len(dst, dlen + (dp - ds));
        rb_econv_check_error(ec);
    } while (res == econv_destination_buffer_full);

    return dst;
}
1857
1858 VALUE
1859 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1860 {
1861     src = rb_str_new_frozen(src);
1862     dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1863     RB_GC_GUARD(src);
1864     return dst;
1865 }
1866
1867 VALUE
1868 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1869 {
1870     return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1871 }
1872
1873 VALUE
1874 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1875 {
1876     return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1877 }
1878
1879 VALUE
1880 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
1881 {
1882     return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1883 }
1884
1885 static int
1886 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1887 {
1888     transcoder_entry_t *entry;
1889     const rb_transcoder *tr;
1890
1891     if (ec->started != 0)
1892         return -1;
1893
1894     entry = get_transcoder_entry(sname, dname);
1895     if (!entry)
1896         return -1;
1897
1898     tr = load_transcoder_entry(entry);
1899     if (!tr) return -1;
1900
1901     return rb_econv_add_transcoder_at(ec, tr, n);
1902 }
1903
1904 static int
1905 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1906 {
1907     return rb_econv_add_converter(ec, "", decorator_name, n);
1908 }
1909
1910 int
1911 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1912 {
1913     const rb_transcoder *tr;
1914
1915     if (ec->num_trans == 0)
1916         return rb_econv_decorate_at(ec, decorator_name, 0);
1917
1918     tr = ec->elems[0].tc->transcoder;
1919
1920     if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1921         tr->asciicompat_type == asciicompat_decoder)
1922         return rb_econv_decorate_at(ec, decorator_name, 1);
1923
1924     return rb_econv_decorate_at(ec, decorator_name, 0);
1925 }
1926
1927 int
1928 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1929 {
1930     const rb_transcoder *tr;
1931
1932     if (ec->num_trans == 0)
1933         return rb_econv_decorate_at(ec, decorator_name, 0);
1934
1935     tr = ec->elems[ec->num_trans-1].tc->transcoder;
1936
1937     if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1938         tr->asciicompat_type == asciicompat_encoder)
1939         return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1940
1941     return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1942 }
1943
1944 void
1945 rb_econv_binmode(rb_econv_t *ec)
1946 {
1947     const char *dname = 0;
1948
1949     switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1950       case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
1951         dname = "universal_newline";
1952         break;
1953       case ECONV_CRLF_NEWLINE_DECORATOR:
1954         dname = "crlf_newline";
1955         break;
1956       case ECONV_CR_NEWLINE_DECORATOR:
1957         dname = "cr_newline";
1958         break;
1959     }
1960
1961     if (dname) {
1962         const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
1963         int num_trans = ec->num_trans;
1964         int i, j = 0;
1965
1966         for (i=0; i < num_trans; i++) {
1967             if (transcoder == ec->elems[i].tc->transcoder) {
1968                 rb_transcoding_close(ec->elems[i].tc);
1969                 xfree(ec->elems[i].out_buf_start);
1970                 ec->num_trans--;
1971             }
1972             else
1973                 ec->elems[j++] = ec->elems[i];
1974         }
1975     }
1976
1977     ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
1978 }
1979
1980 static VALUE
1981 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
1982 {
1983     int has_description = 0;
1984
1985     if (NIL_P(mesg))
1986         mesg = rb_str_new(NULL, 0);
1987
1988     if (*sname != '\0' || *dname != '\0') {
1989         if (*sname == '\0')
1990             rb_str_cat2(mesg, dname);
1991         else if (*dname == '\0')
1992             rb_str_cat2(mesg, sname);
1993         else
1994             rb_str_catf(mesg, "%s to %s", sname, dname);
1995         has_description = 1;
1996     }
1997
1998     if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
1999                    ECONV_XML_TEXT_DECORATOR|
2000                    ECONV_XML_ATTR_CONTENT_DECORATOR|
2001                    ECONV_XML_ATTR_QUOTE_DECORATOR)) {
2002         const char *pre = "";
2003         if (has_description)
2004             rb_str_cat2(mesg, " with ");
2005         if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)  {
2006             rb_str_cat2(mesg, pre); pre = ",";
2007             rb_str_cat2(mesg, "universal_newline");
2008         }
2009         if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2010             rb_str_cat2(mesg, pre); pre = ",";
2011             rb_str_cat2(mesg, "crlf_newline");
2012         }
2013         if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2014             rb_str_cat2(mesg, pre); pre = ",";
2015             rb_str_cat2(mesg, "cr_newline");
2016         }
2017         if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2018             rb_str_cat2(mesg, pre); pre = ",";
2019             rb_str_cat2(mesg, "xml_text");
2020         }
2021         if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2022             rb_str_cat2(mesg, pre); pre = ",";
2023             rb_str_cat2(mesg, "xml_attr_content");
2024         }
2025         if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2026             rb_str_cat2(mesg, pre); pre = ",";
2027             rb_str_cat2(mesg, "xml_attr_quote");
2028         }
2029         has_description = 1;
2030     }
2031     if (!has_description) {
2032         rb_str_cat2(mesg, "no-conversion");
2033     }
2034
2035     return mesg;
2036 }
2037
2038 VALUE
2039 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2040 {
2041     VALUE mesg, exc;
2042     mesg = rb_str_new_cstr("code converter not found (");
2043     econv_description(sname, dname, ecflags, mesg);
2044     rb_str_cat2(mesg, ")");
2045     exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2046     return exc;
2047 }
2048
/*
 * Builds an exception object describing ec->last_error.
 *
 * - econv_invalid_byte_sequence / econv_incomplete_input produce an
 *   InvalidByteSequenceError carrying the error bytes, the read-again bytes
 *   (nil when not computed) and an incomplete-input flag as instance
 *   variables.
 * - econv_undefined_conversion produces an UndefinedConversionError carrying
 *   the offending character as an instance variable.
 * - any other result yields Qnil.
 *
 * Both exception kinds additionally get the source/destination encoding
 * names and, when the names resolve to known encodings, the encoding
 * objects themselves.  The exception is returned, not raised.
 */
static VALUE
make_econv_exception(rb_econv_t *ec)
{
    VALUE mesg, exc;
    if (ec->last_error.result == econv_invalid_byte_sequence ||
        ec->last_error.result == econv_incomplete_input) {
        const char *err = (const char *)ec->last_error.error_bytes_start;
        size_t error_len = ec->last_error.error_bytes_len;
        VALUE bytes = rb_str_new(err, error_len);
        VALUE dumped = rb_str_dump(bytes);
        size_t readagain_len = ec->last_error.readagain_len;
        VALUE bytes2 = Qnil;
        VALUE dumped2;
        if (ec->last_error.result == econv_incomplete_input) {
            mesg = rb_sprintf("incomplete %s on %s",
                    StringValueCStr(dumped),
                    ec->last_error.source_encoding);
        }
        else if (readagain_len) {
            /* the read-again bytes directly follow the error bytes in memory */
            bytes2 = rb_str_new(err+error_len, readagain_len);
            dumped2 = rb_str_dump(bytes2);
            mesg = rb_sprintf("%s followed by %s on %s",
                    StringValueCStr(dumped),
                    StringValueCStr(dumped2),
                    ec->last_error.source_encoding);
        }
        else {
            mesg = rb_sprintf("%s on %s",
                    StringValueCStr(dumped),
                    ec->last_error.source_encoding);
        }

        exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
        rb_ivar_set(exc, id_error_bytes, bytes);
        rb_ivar_set(exc, id_readagain_bytes, bytes2);
        rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
        goto set_encs;
    }
    if (ec->last_error.result == econv_undefined_conversion) {
        VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
                                 ec->last_error.error_bytes_len);
        VALUE dumped = Qnil;
        int idx;
        /* for UTF-8 sources, prefer the "U+XXXX" notation when the error
         * bytes form exactly one character */
        if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
            rb_encoding *utf8 = rb_utf8_encoding();
            const char *start, *end;
            int n;
            start = (const char *)ec->last_error.error_bytes_start;
            end = start + ec->last_error.error_bytes_len;
            n = rb_enc_precise_mbclen(start, end, utf8);
            if (MBCLEN_CHARFOUND_P(n) &&
                (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
                unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
                dumped = rb_sprintf("U+%04X", cc);
            }
        }
        /* otherwise fall back to the dumped byte string */
        if (NIL_P(dumped))
            dumped = rb_str_dump(bytes);
        /* short message when the failing step spans the whole conversion */
        if (strcmp(ec->last_error.source_encoding,
                   ec->source_encoding_name) == 0 &&
            strcmp(ec->last_error.destination_encoding,
                   ec->destination_encoding_name) == 0) {
            mesg = rb_sprintf("%s from %s to %s",
                    StringValueCStr(dumped),
                    ec->last_error.source_encoding,
                    ec->last_error.destination_encoding);
        }
        else {
            int i;
            /* otherwise spell out the chain of non-decorator steps */
            mesg = rb_sprintf("%s to %s in conversion from %s",
                    StringValueCStr(dumped),
                    ec->last_error.destination_encoding,
                    ec->source_encoding_name);
            for (i = 0; i < ec->num_trans; i++) {
                const rb_transcoder *tr = ec->elems[i].tc->transcoder;
                if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
                    rb_str_catf(mesg, " to %s",
                                ec->elems[i].tc->transcoder->dst_encoding);
            }
        }
        exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
        /* tag the error character with its source encoding when it is known */
        idx = rb_enc_find_index(ec->last_error.source_encoding);
        if (0 <= idx)
            rb_enc_associate_index(bytes, idx);
        rb_ivar_set(exc, id_error_char, bytes);
        goto set_encs;
    }
    return Qnil;

  set_encs:
    /* shared tail: record encoding names, plus encoding objects when found */
    rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
    rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
    int idx = rb_enc_find_index(ec->last_error.source_encoding);
    if (0 <= idx)
        rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
    idx = rb_enc_find_index(ec->last_error.destination_encoding);
    if (0 <= idx)
        rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
    return exc;
}
2149
2150 static void
2151 more_output_buffer(
2152         VALUE destination,
2153         unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2154         int max_output,
2155         unsigned char **out_start_ptr,
2156         unsigned char **out_pos,
2157         unsigned char **out_stop_ptr)
2158 {
2159     size_t len = (*out_pos - *out_start_ptr);
2160     size_t new_len = (len + max_output) * 2;
2161     *out_start_ptr = resize_destination(destination, len, new_len);
2162     *out_pos = *out_start_ptr + len;
2163     *out_stop_ptr = *out_start_ptr + new_len;
2164 }
2165
2166 static int
2167 make_replacement(rb_econv_t *ec)
2168 {
2169     rb_transcoding *tc;
2170     const rb_transcoder *tr;
2171     const unsigned char *replacement;
2172     const char *repl_enc;
2173     const char *ins_enc;
2174     size_t len;
2175
2176     if (ec->replacement_str)
2177         return 0;
2178
2179     ins_enc = rb_econv_encoding_to_insert_output(ec);
2180
2181     tc = ec->last_tc;
2182     if (*ins_enc) {
2183         tr = tc->transcoder;
2184         rb_enc_find(tr->dst_encoding);
2185         replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2186     }
2187     else {
2188         replacement = (unsigned char *)"?";
2189         len = 1;
2190         repl_enc = "";
2191     }
2192
2193     ec->replacement_str = replacement;
2194     ec->replacement_len = len;
2195     ec->replacement_enc = repl_enc;
2196     ec->replacement_allocated = 0;
2197     return 0;
2198 }
2199
2200 int
2201 rb_econv_set_replacement(rb_econv_t *ec,
2202     const unsigned char *str, size_t len, const char *encname)
2203 {
2204     unsigned char *str2;
2205     size_t len2;
2206     const char *encname2;
2207
2208     encname2 = rb_econv_encoding_to_insert_output(ec);
2209
2210     if (!*encname2 || encoding_equal(encname, encname2)) {
2211         str2 = xmalloc(len);
2212         MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2213         len2 = len;
2214         encname2 = encname;
2215     }
2216     else {
2217         str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2218         if (!str2)
2219             return -1;
2220     }
2221
2222     if (ec->replacement_allocated) {
2223         xfree((void *)ec->replacement_str);
2224     }
2225     ec->replacement_allocated = 1;
2226     ec->replacement_str = str2;
2227     ec->replacement_len = len2;
2228     ec->replacement_enc = encname2;
2229     return 0;
2230 }
2231
2232 static int
2233 output_replacement_character(rb_econv_t *ec)
2234 {
2235     int ret;
2236
2237     if (make_replacement(ec) == -1)
2238         return -1;
2239
2240     ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2241     if (ret == -1)
2242         return -1;
2243
2244     return 0;
2245 }
2246
2247 #if 1
/* Fallback lookup used by transcode_loop() when the :fallback option is a Hash. */
#define hash_fallback rb_hash_aref
2249
2250 static VALUE
2251 proc_fallback(VALUE fallback, VALUE c)
2252 {
2253     return rb_proc_call(fallback, rb_ary_new4(1, &c));
2254 }
2255
2256 static VALUE
2257 method_fallback(VALUE fallback, VALUE c)
2258 {
2259     return rb_method_call(1, &c, fallback);
2260 }
2261
2262 static VALUE
2263 aref_fallback(VALUE fallback, VALUE c)
2264 {
2265     return rb_funcallv_public(fallback, idAREF, 1, &c);
2266 }
2267
2268 static void
2269 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2270                const unsigned char *in_stop, unsigned char *out_stop,
2271                VALUE destination,
2272                unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2273                const char *src_encoding,
2274                const char *dst_encoding,
2275                int ecflags,
2276                VALUE ecopts)
2277 {
2278     rb_econv_t *ec;
2279     rb_transcoding *last_tc;
2280     rb_econv_result_t ret;
2281     unsigned char *out_start = *out_pos;
2282     int max_output;
2283     VALUE exc;
2284     VALUE fallback = Qnil;
2285     VALUE (*fallback_func)(VALUE, VALUE) = 0;
2286
2287     ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2288     if (!ec)
2289         rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2290
2291     if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2292         fallback = rb_hash_aref(ecopts, sym_fallback);
2293         if (RB_TYPE_P(fallback, T_HASH)) {
2294             fallback_func = hash_fallback;
2295         }
2296         else if (rb_obj_is_proc(fallback)) {
2297             fallback_func = proc_fallback;
2298         }
2299         else if (rb_obj_is_method(fallback)) {
2300             fallback_func = method_fallback;
2301         }
2302         else {
2303             fallback_func = aref_fallback;
2304         }
2305     }
2306     last_tc = ec->last_tc;
2307     max_output = last_tc ? last_tc->transcoder->max_output : 1;
2308
2309   resume:
2310     ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2311
2312     if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2313         VALUE rep = rb_enc_str_new(
2314                 (const char *)ec->last_error.error_bytes_start,
2315                 ec->last_error.error_bytes_len,
2316                 rb_enc_find(ec->last_error.source_encoding));
2317         rep = (*fallback_func)(fallback, rep);
2318         if (rep != Qundef && !NIL_P(rep)) {
2319             StringValue(rep);
2320             ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2321                     RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2322             if ((int)ret == -1) {
2323                 rb_raise(rb_eArgError, "too big fallback string");
2324             }
2325             goto resume;
2326         }
2327     }
2328
2329     if (ret == econv_invalid_byte_sequence ||
2330         ret == econv_incomplete_input ||
2331         ret == econv_undefined_conversion) {
2332         exc = make_econv_exception(ec);
2333         rb_econv_close(ec);
2334         rb_exc_raise(exc);
2335     }
2336
2337     if (ret == econv_destination_buffer_full) {
2338         more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2339         goto resume;
2340     }
2341
2342     rb_econv_close(ec);
2343     return;
2344 }
2345 #else
/*
 * sample transcode_loop implementation in byte-by-byte stream style
 *
 * This variant is compiled out (it sits in the #else arm of "#if 1").  It
 * feeds the converter one input byte at a time with ECONV_PARTIAL_INPUT and
 * keeps draining it until econv_finished is reported.  Unlike the active
 * implementation it does not consult the :fallback option.
 */
static void
transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
               const unsigned char *in_stop, unsigned char *out_stop,
               VALUE destination,
               unsigned char *(*resize_destination)(VALUE, size_t, size_t),
               const char *src_encoding,
               const char *dst_encoding,
               int ecflags,
               VALUE ecopts)
{
    rb_econv_t *ec;
    rb_transcoding *last_tc;
    rb_econv_result_t ret;
    unsigned char *out_start = *out_pos;
    const unsigned char *ptr;
    int max_output;
    VALUE exc;

    ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
    if (!ec)
        rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));

    last_tc = ec->last_tc;
    max_output = last_tc ? last_tc->transcoder->max_output : 1;

    /* start as if the converter had just asked for more input */
    ret = econv_source_buffer_empty;
    ptr = *in_pos;
    while (ret != econv_finished) {
        unsigned char input_byte;
        const unsigned char *p = &input_byte;

        if (ret == econv_source_buffer_empty) {
            if (ptr < in_stop) {
                /* hand over exactly one more byte */
                input_byte = *ptr;
                ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
            }
            else {
                /* input exhausted: final call without ECONV_PARTIAL_INPUT */
                ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
            }
        }
        else {
            /* no new input; let the converter continue with what it holds */
            ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
        }
        /* advance by however much of input_byte was consumed (0 or 1) */
        if (&input_byte != p)
            ptr += p - &input_byte;
        switch (ret) {
          case econv_invalid_byte_sequence:
          case econv_incomplete_input:
          case econv_undefined_conversion:
            exc = make_econv_exception(ec);
            rb_econv_close(ec);
            rb_exc_raise(exc);
            break;

          case econv_destination_buffer_full:
            more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
            break;

          case econv_source_buffer_empty:
            break;

          case econv_finished:
            break;
        }
    }
    rb_econv_close(ec);
    *in_pos = in_stop;
    return;
}
2416 #endif
2417
2418
2419 /*
2420  *  String-specific code
2421  */
2422
2423 static unsigned char *
2424 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2425 {
2426     rb_str_resize(destination, new_len);
2427     return (unsigned char *)RSTRING_PTR(destination);
2428 }
2429
/*
 * Translates the conversion options in the hash opt into ECONV_* flag bits,
 * merges them into ecflags and returns the result.
 *
 * Recognized keys: :invalid, :undef, :replace, :xml, :newline (when
 * ENABLE_ECONV_NEWLINE_OPTION is defined), :universal_newline,
 * :crlf_newline and :cr_newline.  Raises ArgumentError for unsupported
 * values of :invalid, :undef, :xml or :newline.
 */
static int
econv_opts(VALUE opt, int ecflags)
{
    VALUE v;
    /* bit 2 (value 2): :newline was given; bit 1 (value 1): one of the
     * individual *_newline keys was given with a non-nil value */
    int newlineflag = 0;

    v = rb_hash_aref(opt, sym_invalid);
    if (NIL_P(v)) {
    }
    else if (v==sym_replace) {
        ecflags |= ECONV_INVALID_REPLACE;
    }
    else {
        rb_raise(rb_eArgError, "unknown value for invalid character option");
    }

    v = rb_hash_aref(opt, sym_undef);
    if (NIL_P(v)) {
    }
    else if (v==sym_replace) {
        ecflags |= ECONV_UNDEF_REPLACE;
    }
    else {
        rb_raise(rb_eArgError, "unknown value for undefined character option");
    }

    /* a :replace string turns on undef replacement unless invalid
     * replacement is already enabled */
    v = rb_hash_aref(opt, sym_replace);
    if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
        ecflags |= ECONV_UNDEF_REPLACE;
    }

    v = rb_hash_aref(opt, sym_xml);
    if (!NIL_P(v)) {
        if (v==sym_text) {
            ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
        }
        else if (v==sym_attr) {
            ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
        }
        else if (SYMBOL_P(v)) {
            rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
        }
        else {
            rb_raise(rb_eArgError, "unexpected value for xml option");
        }
    }

#ifdef ENABLE_ECONV_NEWLINE_OPTION
    v = rb_hash_aref(opt, sym_newline);
    if (!NIL_P(v)) {
        newlineflag = 2;
        /* :newline replaces any newline decorator already in ecflags */
        ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
        if (v == sym_universal) {
            ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
        }
        else if (v == sym_crlf) {
            ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
        }
        else if (v == sym_cr) {
            ecflags |= ECONV_CR_NEWLINE_DECORATOR;
        }
        else if (v == sym_lf) {
            /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
        }
        else if (SYMBOL_P(v)) {
            rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
                     rb_sym2str(v));
        }
        else {
            rb_raise(rb_eArgError, "unexpected value for newline option");
        }
    }
#endif
    {
        /* decorators requested through the individual boolean keys */
        int setflags = 0;

        v = rb_hash_aref(opt, sym_universal_newline);
        if (RTEST(v))
            setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
        newlineflag |= !NIL_P(v);

        v = rb_hash_aref(opt, sym_crlf_newline);
        if (RTEST(v))
            setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
        newlineflag |= !NIL_P(v);

        v = rb_hash_aref(opt, sym_cr_newline);
        if (RTEST(v))
            setflags |= ECONV_CR_NEWLINE_DECORATOR;
        newlineflag |= !NIL_P(v);

        switch (newlineflag) {
          case 1:
            /* only the individual keys were given: they decide */
            ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
            ecflags |= setflags;
            break;

          case 3:
            /* both styles were given: :newline wins, setflags is ignored */
            rb_warning(":newline option precedes other newline options");
            break;
        }
    }

    return ecflags;
}
2535
2536 int
2537 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2538 {
2539     VALUE newhash = Qnil;
2540     VALUE v;
2541
2542     if (NIL_P(opthash)) {
2543         *opts = Qnil;
2544         return ecflags;
2545     }
2546     ecflags = econv_opts(opthash, ecflags);
2547
2548     v = rb_hash_aref(opthash, sym_replace);
2549     if (!NIL_P(v)) {
2550         StringValue(v);
2551         if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
2552             VALUE dumped = rb_str_dump(v);
2553             rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2554                      StringValueCStr(dumped),
2555                      rb_enc_name(rb_enc_get(v)));
2556         }
2557         v = rb_str_new_frozen(v);
2558         newhash = rb_hash_new();
2559         rb_hash_aset(newhash, sym_replace, v);
2560     }
2561
2562     v = rb_hash_aref(opthash, sym_fallback);
2563     if (!NIL_P(v)) {
2564         VALUE h = rb_check_hash_type(v);
2565         if (NIL_P(h)
2566             ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2567             : (v = h, 1)) {
2568             if (NIL_P(newhash))
2569                 newhash = rb_hash_new();
2570             rb_hash_aset(newhash, sym_fallback, v);
2571         }
2572     }
2573
2574     if (!NIL_P(newhash))
2575         rb_hash_freeze(newhash);
2576     *opts = newhash;
2577
2578     return ecflags;
2579 }
2580
2581 int
2582 rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
2583 {
2584     return rb_econv_prepare_options(opthash, opts, 0);
2585 }
2586
2587 rb_econv_t *
2588 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2589 {
2590     rb_econv_t *ec;
2591     VALUE replacement;
2592
2593     if (NIL_P(opthash)) {
2594         replacement = Qnil;
2595     }
2596     else {
2597         if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2598             rb_bug("rb_econv_open_opts called with invalid opthash");
2599         replacement = rb_hash_aref(opthash, sym_replace);
2600     }
2601
2602     ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2603     if (!ec)
2604         return ec;
2605
2606     if (!NIL_P(replacement)) {
2607         int ret;
2608         rb_encoding *enc = rb_enc_get(replacement);
2609
2610         ret = rb_econv_set_replacement(ec,
2611                 (const unsigned char *)RSTRING_PTR(replacement),
2612                 RSTRING_LEN(replacement),
2613                 rb_enc_name(enc));
2614         if (ret == -1) {
2615             rb_econv_close(ec);
2616             return NULL;
2617         }
2618     }
2619     return ec;
2620 }
2621
2622 static int
2623 enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2624 {
2625     rb_encoding *enc;
2626     const char *n;
2627     int encidx;
2628     VALUE encval;
2629
2630     if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2631         !(enc = rb_enc_from_index(encidx))) {
2632         enc = NULL;
2633         encidx = 0;
2634         n = StringValueCStr(*arg);
2635     }
2636     else {
2637         n = rb_enc_name(enc);
2638     }
2639
2640     *name_p = n;
2641     *enc_p = enc;
2642
2643     return encidx;
2644 }
2645
2646 static int
2647 str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2648         const char **sname_p, rb_encoding **senc_p,
2649         const char **dname_p, rb_encoding **denc_p)
2650 {
2651     rb_encoding *senc, *denc;
2652     const char *sname, *dname;
2653     int sencidx, dencidx;
2654
2655     dencidx = enc_arg(arg1, &dname, &denc);
2656
2657     if (NIL_P(*arg2)) {
2658         sencidx = rb_enc_get_index(str);
2659         senc = rb_enc_from_index(sencidx);
2660         sname = rb_enc_name(senc);
2661     }
2662     else {
2663         sencidx = enc_arg(arg2, &sname, &senc);
2664     }
2665
2666     *sname_p = sname;
2667     *senc_p = senc;
2668     *dname_p = dname;
2669     *denc_p = denc;
2670     return dencidx;
2671 }
2672
/*
 * Core of string transcoding: converts *self according to the 0..2 encoding
 * arguments in argv plus the already-digested ecflags/ecopts.
 *
 * With no arguments the destination is the default internal encoding (or,
 * when that is nil and ecflags is non-zero, the string's own encoding) and
 * invalid/undefined replacement is switched on.
 *
 * Returns the destination encoding index; on several paths *self is left
 * untouched, on others it is replaced by a new string.  Returns -1 on some
 * of the untouched paths -- presumably telling the caller that nothing needs
 * to be done; confirm against the callers.
 *
 * Raises ArgumentError when the input was not fully consumed.
 */
static int
str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
{
    VALUE dest;
    VALUE str = *self;
    VALUE arg1, arg2;
    long blen, slen;
    unsigned char *buf, *bp, *sp;
    const unsigned char *fromp;
    rb_encoding *senc, *denc;
    const char *sname, *dname;
    int dencidx;
    int explicitly_invalid_replace = TRUE;

    rb_check_arity(argc, 0, 2);

    if (argc == 0) {
        arg1 = rb_enc_default_internal();
        if (NIL_P(arg1)) {
            if (!ecflags) return -1;
            arg1 = rb_obj_encoding(str);
        }
        /* remember whether the caller asked for invalid-handling itself,
         * before it gets forced on below */
        if (!(ecflags & ECONV_INVALID_MASK)) {
            explicitly_invalid_replace = FALSE;
        }
        ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
    }
    else {
        arg1 = argv[0];
    }
    arg2 = argc<=1 ? Qnil : argv[1];
    dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);

    if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
                    ECONV_XML_TEXT_DECORATOR|
                    ECONV_XML_ATTR_CONTENT_DECORATOR|
                    ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
        /* no decorators requested: look for shortcuts that avoid converting */
        if (senc && senc == denc) {
            if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
                /* same encoding with explicit invalid handling: scrub instead */
                VALUE rep = Qnil;
                if (!NIL_P(ecopts)) {
                    rep = rb_hash_aref(ecopts, sym_replace);
                }
                dest = rb_enc_str_scrub(senc, str, rep);
                if (NIL_P(dest)) dest = str;
                *self = dest;
                return dencidx;
            }
            return NIL_P(arg2) ? -1 : dencidx;
        }
        if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
            /* 7-bit content between two ASCII-compatible encodings is
             * returned without conversion */
            if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
                return dencidx;
            }
        }
        if (encoding_equal(sname, dname)) {
            return NIL_P(arg2) ? -1 : dencidx;
        }
    }
    else {
        /* decorators requested */
        if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
            /* both ends ASCII-incompatible: go through UTF-8 first */
            rb_encoding *utf8 = rb_utf8_encoding();
            str = rb_str_conv_enc(str, senc, utf8);
            senc = utf8;
            sname = "UTF-8";
        }
        if (encoding_equal(sname, dname)) {
            /* same encoding on both ends: only the decorators remain */
            sname = "";
            dname = "";
        }
    }

    fromp = sp = (unsigned char *)RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    blen = slen + 30; /* len + margin */
    dest = rb_str_tmp_new(blen);
    bp = (unsigned char *)RSTRING_PTR(dest);

    transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
    if (fromp != sp+slen) {
        rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
    }
    /* dest may have been reallocated by the loop; re-read its buffer */
    buf = (unsigned char *)RSTRING_PTR(dest);
    *bp = '\0';
    rb_str_set_len(dest, bp - buf);

    /* set encoding */
    if (!denc) {
        /* destination name did not resolve to an encoding: define a dummy one */
        dencidx = rb_define_dummy_encoding(dname);
        RB_GC_GUARD(arg1);
        RB_GC_GUARD(arg2);
    }
    *self = dest;

    return dencidx;
}
2769
2770 static int
2771 str_transcode(int argc, VALUE *argv, VALUE *self)
2772 {
2773     VALUE opt;
2774     int ecflags = 0;
2775     VALUE ecopts = Qnil;
2776
2777     argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2778     if (!NIL_P(opt)) {
2779         ecflags = rb_econv_prepare_opts(opt, &ecopts);
2780     }
2781     return str_transcode0(argc, argv, self, ecflags, ecopts);
2782 }
2783
2784 static inline VALUE
2785 str_encode_associate(VALUE str, int encidx)
2786 {
2787     int cr = 0;
2788
2789     rb_enc_associate_index(str, encidx);
2790
2791     /* transcoded string never be broken. */
2792     if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2793         rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2794     }
2795     else {
2796         cr = ENC_CODERANGE_VALID;
2797     }
2798     ENC_CODERANGE_SET(str, cr);
2799     return str;
2800 }
2801
2802 /*
2803  *  call-seq:
2804  *     str.encode!(encoding, **options)   -> str
2805  *     str.encode!(dst_encoding, src_encoding, **options)   -> str
2806  *
2807  *  The first form transcodes the contents of <i>str</i> from
2808  *  str.encoding to +encoding+.
2809  *  The second form transcodes the contents of <i>str</i> from
2810  *  src_encoding to dst_encoding.
2811  *  The +options+ keyword arguments give details for conversion. See String#encode
2812  *  for details.
2813  *  Returns the string even if no changes were made.
2814  */
2815
2816 static VALUE
2817 str_encode_bang(int argc, VALUE *argv, VALUE str)
2818 {
2819     VALUE newstr;
2820     int encidx;
2821
2822     rb_check_frozen(str);
2823
2824     newstr = str;
2825     encidx = str_transcode(argc, argv, &newstr);
2826
2827     if (encidx < 0) return str;
2828     if (newstr == str) {
2829         rb_enc_associate_index(str, encidx);
2830         return str;
2831     }
2832     rb_str_shared_replace(str, newstr);
2833     return str_encode_associate(str, encidx);
2834 }
2835
2836 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2837
2838 /*
2839  *  call-seq:
2840  *     str.encode(encoding, **options)   -> str
2841  *     str.encode(dst_encoding, src_encoding, **options)   -> str
2842  *     str.encode(**options)   -> str
2843  *
2844  *  The first form returns a copy of +str+ transcoded
2845  *  to encoding +encoding+.
2846  *  The second form returns a copy of +str+ transcoded
2847  *  from src_encoding to dst_encoding.
2848  *  The last form returns a copy of +str+ transcoded to
2849  *  <tt>Encoding.default_internal</tt>.
2850  *
2851  *  By default, the first and second form raise
2852  *  Encoding::UndefinedConversionError for characters that are
2853  *  undefined in the destination encoding, and
2854  *  Encoding::InvalidByteSequenceError for invalid byte sequences
2855  *  in the source encoding. The last form by default does not raise
2856  *  exceptions but uses replacement strings.
2857  *
2858  *  The +options+ keyword arguments give details for conversion.
2859  *  The arguments are:
2860  *
2861  *  :invalid ::
2862  *    If the value is +:replace+, #encode replaces invalid byte sequences in
2863  *    +str+ with the replacement character.  The default is to raise the
2864  *    Encoding::InvalidByteSequenceError exception
2865  *  :undef ::
2866  *    If the value is +:replace+, #encode replaces characters which are
2867  *    undefined in the destination encoding with the replacement character.
2868  *    The default is to raise the Encoding::UndefinedConversionError.
2869  *  :replace ::
2870  *    Sets the replacement string to the given value. The default replacement
2871  *    string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2872  *  :fallback ::
2873  *    Sets the replacement string by the given object for undefined
2874  *    character.  The object should be a Hash, a Proc, a Method, or an
2875  *    object which has [] method.
2876  *    Its key is an undefined character encoded in the source encoding
 *    of current transcoder. Its value can be in any encoding as long as it
 *    can be converted into the destination encoding of the transcoder.
2879  *  :xml ::
2880  *    The value must be +:text+ or +:attr+.
2881  *    If the value is +:text+ #encode replaces undefined characters with their
2882  *    (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2883  *    are converted to "&amp;", "&lt;", and "&gt;", respectively.
2884  *    If the value is +:attr+, #encode also quotes the replacement result
2885  *    (using '"'), and replaces '"' with "&quot;".
2886  *  :cr_newline ::
2887  *    Replaces LF ("\n") with CR ("\r") if value is true.
2888  *  :crlf_newline ::
2889  *    Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2890  *  :universal_newline ::
2891  *    Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2892  */
2893
2894 static VALUE
2895 str_encode(int argc, VALUE *argv, VALUE str)
2896 {
2897     VALUE newstr = str;
2898     int encidx = str_transcode(argc, argv, &newstr);
2899     return encoded_dup(newstr, str, encidx);
2900 }
2901
2902 VALUE
2903 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2904 {
2905     int argc = 1;
2906     VALUE *argv = &to;
2907     VALUE newstr = str;
2908     int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2909     return encoded_dup(newstr, str, encidx);
2910 }
2911
2912 static VALUE
2913 encoded_dup(VALUE newstr, VALUE str, int encidx)
2914 {
2915     if (encidx < 0) return rb_str_dup(str);
2916     if (newstr == str) {
2917         newstr = rb_str_dup(str);
2918         rb_enc_associate_index(newstr, encidx);
2919         return newstr;
2920     }
2921     else {
2922         RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2923     }
2924     return str_encode_associate(newstr, encidx);
2925 }
2926
2927 /*
2928  * Document-class: Encoding::Converter
2929  *
2930  * Encoding conversion class.
2931  */
2932 static void
2933 econv_free(void *ptr)
2934 {
2935     rb_econv_t *ec = ptr;
2936     rb_econv_close(ec);
2937 }
2938
2939 static size_t
2940 econv_memsize(const void *ptr)
2941 {
2942     return sizeof(rb_econv_t);
2943 }
2944
/*
 * Typed-data descriptor for converter objects.  It is the type passed to
 * TypedData_Wrap_Struct()/TypedData_Get_Struct() throughout this file; the
 * wrapped pointer is an rb_econv_t, or NULL for an object that has been
 * allocated but not yet initialized.
 */
static const rb_data_type_t econv_data_type = {
    "econv", /* type name */
    /* callbacks: no mark function (0), then the free and size functions */
    {0, econv_free, econv_memsize,},
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY
};
2950
2951 static VALUE
2952 econv_s_allocate(VALUE klass)
2953 {
2954     return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2955 }
2956
2957 static rb_encoding *
2958 make_dummy_encoding(const char *name)
2959 {
2960     rb_encoding *enc;
2961     int idx;
2962     idx = rb_define_dummy_encoding(name);
2963     enc = rb_enc_from_index(idx);
2964     return enc;
2965 }
2966
2967 static rb_encoding *
2968 make_encoding(const char *name)
2969 {
2970     rb_encoding *enc;
2971     enc = rb_enc_find(name);
2972     if (!enc)
2973         enc = make_dummy_encoding(name);
2974     return enc;
2975 }
2976
2977 static VALUE
2978 make_encobj(const char *name)
2979 {
2980     return rb_enc_from_encoding(make_encoding(name));
2981 }
2982
2983 /*
2984  * call-seq:
2985  *   Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2986  *   Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2987  *
2988  * Returns the corresponding ASCII compatible encoding.
2989  *
2990  * Returns nil if the argument is an ASCII compatible encoding.
2991  *
2992  * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
 * can represent exactly the same characters as the given ASCII incompatible encoding.
 * So, no undefined conversion error occurs when converting between the two encodings.
2995  *
2996  *   Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2997  *   Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2998  *   Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
2999  *
3000  */
3001 static VALUE
3002 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3003 {
3004     const char *arg_name, *result_name;
3005     rb_encoding *arg_enc, *result_enc;
3006
3007     enc_arg(&arg, &arg_name, &arg_enc);
3008
3009     result_name = rb_econv_asciicompat_encoding(arg_name);
3010
3011     if (result_name == NULL)
3012         return Qnil;
3013
3014     result_enc = make_encoding(result_name);
3015
3016     return rb_enc_from_encoding(result_enc);
3017 }
3018
3019 static void
3020 econv_args(int argc, VALUE *argv,
3021     VALUE *snamev_p, VALUE *dnamev_p,
3022     const char **sname_p, const char **dname_p,
3023     rb_encoding **senc_p, rb_encoding **denc_p,
3024     int *ecflags_p,
3025     VALUE *ecopts_p)
3026 {
3027     VALUE opt, flags_v, ecopts;
3028     int sidx, didx;
3029     const char *sname, *dname;
3030     rb_encoding *senc, *denc;
3031     int ecflags;
3032
3033     argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3034
3035     if (!NIL_P(flags_v)) {
3036         if (!NIL_P(opt)) {
3037             rb_error_arity(argc + 1, 2, 3);
3038         }
3039         ecflags = NUM2INT(rb_to_int(flags_v));
3040         ecopts = Qnil;
3041     }
3042     else if (!NIL_P(opt)) {
3043         ecflags = rb_econv_prepare_opts(opt, &ecopts);
3044     }
3045     else {
3046         ecflags = 0;
3047         ecopts = Qnil;
3048     }
3049
3050     senc = NULL;
3051     sidx = rb_to_encoding_index(*snamev_p);
3052     if (0 <= sidx) {
3053         senc = rb_enc_from_index(sidx);
3054     }
3055     else {
3056         StringValue(*snamev_p);
3057     }
3058
3059     denc = NULL;
3060     didx = rb_to_encoding_index(*dnamev_p);
3061     if (0 <= didx) {
3062         denc = rb_enc_from_index(didx);
3063     }
3064     else {
3065         StringValue(*dnamev_p);
3066     }
3067
3068     sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3069     dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3070
3071     *sname_p = sname;
3072     *dname_p = dname;
3073     *senc_p = senc;
3074     *denc_p = denc;
3075     *ecflags_p = ecflags;
3076     *ecopts_p = ecopts;
3077 }
3078
3079 static int
3080 decorate_convpath(VALUE convpath, int ecflags)
3081 {
3082     int num_decorators;
3083     const char *decorators[MAX_ECFLAGS_DECORATORS];
3084     int i;
3085     int n, len;
3086
3087     num_decorators = decorator_names(ecflags, decorators);
3088     if (num_decorators == -1)
3089         return -1;
3090
3091     len = n = RARRAY_LENINT(convpath);
3092     if (n != 0) {
3093         VALUE pair = RARRAY_AREF(convpath, n-1);
3094         if (RB_TYPE_P(pair, T_ARRAY)) {
3095             const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3096             const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3097             transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3098             const rb_transcoder *tr = load_transcoder_entry(entry);
3099             if (!tr)
3100                 return -1;
3101             if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3102                     tr->asciicompat_type == asciicompat_encoder) {
3103                 n--;
3104                 rb_ary_store(convpath, len + num_decorators - 1, pair);
3105             }
3106         }
3107         else {
3108             rb_ary_store(convpath, len + num_decorators - 1, pair);
3109         }
3110     }
3111
3112     for (i = 0; i < num_decorators; i++)
3113         rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3114
3115     return 0;
3116 }
3117
3118 static void
3119 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3120 {
3121     VALUE *ary_p = arg;
3122     VALUE v;
3123
3124     if (NIL_P(*ary_p)) {
3125         *ary_p = rb_ary_new();
3126     }
3127
3128     if (DECORATOR_P(sname, dname)) {
3129         v = rb_str_new_cstr(dname);
3130     }
3131     else {
3132         v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3133     }
3134     rb_ary_store(*ary_p, depth, v);
3135 }
3136
3137 /*
3138  * call-seq:
3139  *   Encoding::Converter.search_convpath(source_encoding, destination_encoding)         -> ary
3140  *   Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt)    -> ary
3141  *
3142  *  Returns a conversion path.
3143  *
3144  *   p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3145  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3146  *   #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3147  *
3148  *   p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3149  *   or
3150  *   p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3151  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3152  *   #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3153  *   #    "universal_newline"]
3154  *
3155  *   p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3156  *   or
3157  *   p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3158  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3159  *   #    "universal_newline",
3160  *   #    [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3161  */
3162 static VALUE
3163 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3164 {
3165     VALUE snamev, dnamev;
3166     const char *sname, *dname;
3167     rb_encoding *senc, *denc;
3168     int ecflags;
3169     VALUE ecopts;
3170     VALUE convpath;
3171
3172     econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3173
3174     convpath = Qnil;
3175     transcode_search_path(sname, dname, search_convpath_i, &convpath);
3176
3177     if (NIL_P(convpath)) {
3178         VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3179         RB_GC_GUARD(snamev);
3180         RB_GC_GUARD(dnamev);
3181         rb_exc_raise(exc);
3182     }
3183
3184     if (decorate_convpath(convpath, ecflags) == -1) {
3185         VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3186         RB_GC_GUARD(snamev);
3187         RB_GC_GUARD(dnamev);
3188         rb_exc_raise(exc);
3189     }
3190
3191     return convpath;
3192 }
3193
3194 /*
3195  * Check the existence of a conversion path.
3196  * Returns the number of converters in the conversion path.
3197  * result: >=0:success -1:failure
3198  */
3199 int
3200 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3201 {
3202     VALUE convpath = Qnil;
3203     transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3204                           &convpath);
3205     return RTEST(convpath);
3206 }
3207
/*
 * State shared between rb_econv_init_by_convpath() and its
 * transcode_search_path() callback, rb_econv_init_by_convpath_i().
 */
struct rb_econv_init_by_convpath_t {
    rb_econv_t *ec; /* converter that receives the new conversion steps */
    int index;      /* position passed to rb_econv_add_converter(); the
                     * callback never changes it */
    int ret;        /* result of the latest rb_econv_add_converter() call;
                     * once it is -1 the callback does nothing more */
};
3213
3214 static void
3215 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3216 {
3217     struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
3218     int ret;
3219
3220     if (a->ret == -1)
3221         return;
3222
3223     ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3224
3225     a->ret = ret;
3226     return;
3227 }
3228
3229 static rb_econv_t *
3230 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3231     const char **sname_p, const char **dname_p,
3232     rb_encoding **senc_p, rb_encoding**denc_p)
3233 {
3234     rb_econv_t *ec;
3235     long i;
3236     int ret, first=1;
3237     VALUE elt;
3238     rb_encoding *senc = 0, *denc = 0;
3239     const char *sname, *dname;
3240
3241     ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3242     DATA_PTR(self) = ec;
3243
3244     for (i = 0; i < RARRAY_LEN(convpath); i++) {
3245         VALUE snamev, dnamev;
3246         VALUE pair;
3247         elt = rb_ary_entry(convpath, i);
3248         if (!NIL_P(pair = rb_check_array_type(elt))) {
3249             if (RARRAY_LEN(pair) != 2)
3250                 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3251             snamev = rb_ary_entry(pair, 0);
3252             enc_arg(&snamev, &sname, &senc);
3253             dnamev = rb_ary_entry(pair, 1);
3254             enc_arg(&dnamev, &dname, &denc);
3255         }
3256         else {
3257             sname = "";
3258             dname = StringValueCStr(elt);
3259         }
3260         if (DECORATOR_P(sname, dname)) {
3261             ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3262             if (ret == -1) {
3263                 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3264                 RB_GC_GUARD(snamev);
3265                 RB_GC_GUARD(dnamev);
3266                 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3267             }
3268         }
3269         else {
3270             int j = ec->num_trans;
3271             struct rb_econv_init_by_convpath_t arg;
3272             arg.ec = ec;
3273             arg.index = ec->num_trans;
3274             arg.ret = 0;
3275             ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3276             if (ret == -1 || arg.ret == -1) {
3277                 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3278                 RB_GC_GUARD(snamev);
3279                 RB_GC_GUARD(dnamev);
3280                 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3281             }
3282             if (first) {
3283                 first = 0;
3284                 *senc_p = senc;
3285                 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3286             }
3287             *denc_p = denc;
3288             *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3289         }
3290     }
3291
3292     if (first) {
3293         *senc_p = NULL;
3294         *denc_p = NULL;
3295         *sname_p = "";
3296         *dname_p = "";
3297     }
3298
3299     ec->source_encoding_name = *sname_p;
3300     ec->destination_encoding_name = *dname_p;
3301
3302     return ec;
3303 }
3304
3305 /*
3306  * call-seq:
3307  *   Encoding::Converter.new(source_encoding, destination_encoding)
3308  *   Encoding::Converter.new(source_encoding, destination_encoding, opt)
3309  *   Encoding::Converter.new(convpath)
3310  *
3311  * possible options elements:
3312  *   hash form:
3313  *     :invalid => nil            # raise error on invalid byte sequence (default)
3314  *     :invalid => :replace       # replace invalid byte sequence
3315  *     :undef => nil              # raise error on undefined conversion (default)
3316  *     :undef => :replace         # replace undefined conversion
3317  *     :replace => string         # replacement string ("?" or "\uFFFD" if not specified)
3318  *     :newline => :universal     # decorator for converting CRLF and CR to LF
3319  *     :newline => :crlf          # decorator for converting LF to CRLF
3320  *     :newline => :cr            # decorator for converting LF to CR
3321  *     :universal_newline => true # decorator for converting CRLF and CR to LF
3322  *     :crlf_newline => true      # decorator for converting LF to CRLF
3323  *     :cr_newline => true        # decorator for converting LF to CR
3324  *     :xml => :text              # escape as XML CharData.
3325  *     :xml => :attr              # escape as XML AttValue
3326  *   integer form:
3327  *     Encoding::Converter::INVALID_REPLACE
3328  *     Encoding::Converter::UNDEF_REPLACE
3329  *     Encoding::Converter::UNDEF_HEX_CHARREF
3330  *     Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3331  *     Encoding::Converter::CRLF_NEWLINE_DECORATOR
3332  *     Encoding::Converter::CR_NEWLINE_DECORATOR
3333  *     Encoding::Converter::XML_TEXT_DECORATOR
3334  *     Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3335  *     Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3336  *
3337  * Encoding::Converter.new creates an instance of Encoding::Converter.
3338  *
3339  * Source_encoding and destination_encoding should be a string or
3340  * Encoding object.
3341  *
3342  * opt should be nil, a hash or an integer.
3343  *
3344  * convpath should be an array.
3345  * convpath may contain
3346  * - two-element arrays which contain encodings or encoding names, or
3347  * - strings representing decorator names.
3348  *
3349  * Encoding::Converter.new optionally takes an option.
3350  * The option should be a hash or an integer.
3351  * The option hash can contain :invalid => nil, etc.
3352  * The option integer should be logical-or of constants such as
3353  * Encoding::Converter::INVALID_REPLACE, etc.
3354  *
3355  * [:invalid => nil]
3356  *   Raise error on invalid byte sequence.  This is a default behavior.
3357  * [:invalid => :replace]
3358  *   Replace invalid byte sequence by replacement string.
3359  * [:undef => nil]
3360  *   Raise an error if a character in source_encoding is not defined in destination_encoding.
3361  *   This is a default behavior.
3362  * [:undef => :replace]
3363  *   Replace undefined character in destination_encoding with replacement string.
3364  * [:replace => string]
3365  *   Specify the replacement string.
3366  *   If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3367  * [:universal_newline => true]
3368  *   Convert CRLF and CR to LF.
3369  * [:crlf_newline => true]
3370  *   Convert LF to CRLF.
3371  * [:cr_newline => true]
3372  *   Convert LF to CR.
3373  * [:xml => :text]
3374  *   Escape as XML CharData.
3375  *   This form can be used as an HTML 4.0 #PCDATA.
3376  *   - '&' -> '&amp;'
3377  *   - '<' -> '&lt;'
3378  *   - '>' -> '&gt;'
3379  *   - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3380  * [:xml => :attr]
3381  *   Escape as XML AttValue.
3382  *   The converted result is quoted as "...".
3383  *   This form can be used as an HTML 4.0 attribute value.
3384  *   - '&' -> '&amp;'
3385  *   - '<' -> '&lt;'
3386  *   - '>' -> '&gt;'
3387  *   - '"' -> '&quot;'
3388  *   - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3389  *
3390  * Examples:
3391  *   # UTF-16BE to UTF-8
3392  *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3393  *
3394  *   # Usually, decorators such as newline conversion are inserted last.
3395  *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3396  *   p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3397  *                 #    "universal_newline"]
3398  *
3399  *   # But, if the last encoding is ASCII incompatible,
3400  *   # decorators are inserted before the last conversion.
3401  *   ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3402  *   p ec.convpath #=> ["crlf_newline",
3403  *                 #    [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3404  *
3405  *   # Conversion path can be specified directly.
3406  *   ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3407  *   p ec.convpath #=> ["universal_newline",
3408  *                 #    [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3409  *                 #    [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3410  */
3411 static VALUE
3412 econv_init(int argc, VALUE *argv, VALUE self)
3413 {
3414     VALUE ecopts;
3415     VALUE snamev, dnamev;
3416     const char *sname, *dname;
3417     rb_encoding *senc, *denc;
3418     rb_econv_t *ec;
3419     int ecflags;
3420     VALUE convpath;
3421
3422     if (rb_check_typeddata(self, &econv_data_type)) {
3423         rb_raise(rb_eTypeError, "already initialized");
3424     }
3425
3426     if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3427         ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3428         ecflags = 0;
3429         ecopts = Qnil;
3430     }
3431     else {
3432         econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3433         ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3434     }
3435
3436     if (!ec) {
3437         VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3438         RB_GC_GUARD(snamev);
3439         RB_GC_GUARD(dnamev);
3440         rb_exc_raise(exc);
3441     }
3442
3443     if (!DECORATOR_P(sname, dname)) {
3444         if (!senc)
3445             senc = make_dummy_encoding(sname);
3446         if (!denc)
3447             denc = make_dummy_encoding(dname);
3448         RB_GC_GUARD(snamev);
3449         RB_GC_GUARD(dnamev);
3450     }
3451
3452     ec->source_encoding = senc;
3453     ec->destination_encoding = denc;
3454
3455     DATA_PTR(self) = ec;
3456
3457     return self;
3458 }
3459
3460 /*
3461  * call-seq:
3462  *   ec.inspect         -> string
3463  *
3464  * Returns a printable version of <i>ec</i>
3465  *
3466  *   ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3467  *   puts ec.inspect    #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3468  *
3469  */
3470 static VALUE
3471 econv_inspect(VALUE self)
3472 {
3473     const char *cname = rb_obj_classname(self);
3474     rb_econv_t *ec;
3475
3476     TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3477     if (!ec)
3478         return rb_sprintf("#<%s: uninitialized>", cname);
3479     else {
3480         const char *sname = ec->source_encoding_name;
3481         const char *dname = ec->destination_encoding_name;
3482         VALUE str;
3483         str = rb_sprintf("#<%s: ", cname);
3484         econv_description(sname, dname, ec->flags, str);
3485         rb_str_cat2(str, ">");
3486         return str;
3487     }
3488 }
3489
3490 static rb_econv_t *
3491 check_econv(VALUE self)
3492 {
3493     rb_econv_t *ec;
3494
3495     TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3496     if (!ec) {
3497         rb_raise(rb_eTypeError, "uninitialized encoding converter");
3498     }
3499     return ec;
3500 }
3501
3502 /*
3503  * call-seq:
3504  *   ec.source_encoding -> encoding
3505  *
3506  * Returns the source encoding as an Encoding object.
3507  */
3508 static VALUE
3509 econv_source_encoding(VALUE self)
3510 {
3511     rb_econv_t *ec = check_econv(self);
3512     if (!ec->source_encoding)
3513         return Qnil;
3514     return rb_enc_from_encoding(ec->source_encoding);
3515 }
3516
3517 /*
3518  * call-seq:
3519  *   ec.destination_encoding -> encoding
3520  *
3521  * Returns the destination encoding as an Encoding object.
3522  */
3523 static VALUE
3524 econv_destination_encoding(VALUE self)
3525 {
3526     rb_econv_t *ec = check_econv(self);
3527     if (!ec->destination_encoding)
3528         return Qnil;
3529     return rb_enc_from_encoding(ec->destination_encoding);
3530 }
3531
3532 /*
3533  * call-seq:
3534  *   ec.convpath        -> ary
3535  *
3536  * Returns the conversion path of ec.
3537  *
3538  * The result is an array of conversions.
3539  *
3540  *   ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3541  *   p ec.convpath
3542  *   #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3543  *   #    [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3544  *   #    "crlf_newline"]
3545  *
3546  * Each element of the array is a pair of encodings or a string.
3547  * A pair means an encoding conversion.
3548  * A string means a decorator.
3549  *
3550  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3551  * a converter from ISO-8859-1 to UTF-8.
3552  * "crlf_newline" means newline converter from LF to CRLF.
3553  */
3554 static VALUE
3555 econv_convpath(VALUE self)
3556 {
3557     rb_econv_t *ec = check_econv(self);
3558     VALUE result;
3559     int i;
3560
3561     result = rb_ary_new();
3562     for (i = 0; i < ec->num_trans; i++) {
3563         const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3564         VALUE v;
3565         if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3566             v = rb_str_new_cstr(tr->dst_encoding);
3567         else
3568             v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3569         rb_ary_push(result, v);
3570     }
3571     return result;
3572 }
3573
3574 /*
3575  * call-seq:
3576  *   ec == other        -> true or false
3577  */
3578 static VALUE
3579 econv_equal(VALUE self, VALUE other)
3580 {
3581     rb_econv_t *ec1 = check_econv(self);
3582     rb_econv_t *ec2;
3583     int i;
3584
3585     if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3586         return Qnil;
3587     }
3588     ec2 = DATA_PTR(other);
3589     if (!ec2) return Qfalse;
3590     if (ec1->source_encoding_name != ec2->source_encoding_name &&
3591         strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3592         return Qfalse;
3593     if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3594         strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3595         return Qfalse;
3596     if (ec1->flags != ec2->flags) return Qfalse;
3597     if (ec1->replacement_enc != ec2->replacement_enc &&
3598         strcmp(ec1->replacement_enc, ec2->replacement_enc))
3599         return Qfalse;
3600     if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3601     if (ec1->replacement_str != ec2->replacement_str &&
3602         memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3603         return Qfalse;
3604
3605     if (ec1->num_trans != ec2->num_trans) return Qfalse;
3606     for (i = 0; i < ec1->num_trans; i++) {
3607         if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3608             return Qfalse;
3609     }
3610     return Qtrue;
3611 }
3612
3613 static VALUE
3614 econv_result_to_symbol(rb_econv_result_t res)
3615 {
3616     switch (res) {
3617       case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3618       case econv_incomplete_input: return sym_incomplete_input;
3619       case econv_undefined_conversion: return sym_undefined_conversion;
3620       case econv_destination_buffer_full: return sym_destination_buffer_full;
3621       case econv_source_buffer_empty: return sym_source_buffer_empty;
3622       case econv_finished: return sym_finished;
3623       case econv_after_output: return sym_after_output;
3624       default: return INT2NUM(res); /* should not be reached */
3625     }
3626 }
3627
3628 /*
3629  * call-seq:
3630  *   ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3631  *   ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3632  *   ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3633  *   ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3634  *
3635  * possible opt elements:
3636  *   hash form:
3637  *     :partial_input => true           # source buffer may be part of larger source
3638  *     :after_output => true            # stop conversion after output before input
3639  *   integer form:
3640  *     Encoding::Converter::PARTIAL_INPUT
3641  *     Encoding::Converter::AFTER_OUTPUT
3642  *
3643  * possible results:
3644  *    :invalid_byte_sequence
3645  *    :incomplete_input
3646  *    :undefined_conversion
3647  *    :after_output
3648  *    :destination_buffer_full
3649  *    :source_buffer_empty
3650  *    :finished
3651  *
3652  * primitive_convert converts source_buffer into destination_buffer.
3653  *
3654  * source_buffer should be a string or nil.
3655  * nil means an empty string.
3656  *
3657  * destination_buffer should be a string.
3658  *
3659  * destination_byteoffset should be an integer or nil.
3660  * nil means the end of destination_buffer.
3661  * If it is omitted, nil is assumed.
3662  *
3663  * destination_bytesize should be an integer or nil.
3664  * nil means unlimited.
3665  * If it is omitted, nil is assumed.
3666  *
3667  * opt should be nil, a hash or an integer.
3668  * nil means no flags.
3669  * If it is omitted, nil is assumed.
3670  *
 * primitive_convert converts the content of source_buffer from the beginning
 * and stores the result into destination_buffer.
 *
 * destination_byteoffset and destination_bytesize specify the region in which
 * the converted result is stored.
3676  * destination_byteoffset specifies the start position in destination_buffer in bytes.
3677  * If destination_byteoffset is nil,
3678  * destination_buffer.bytesize is used for appending the result.
3679  * destination_bytesize specifies maximum number of bytes.
3680  * If destination_bytesize is nil,
3681  * destination size is unlimited.
3682  * After conversion, destination_buffer is resized to
3683  * destination_byteoffset + actually produced number of bytes.
3684  * Also destination_buffer's encoding is set to destination_encoding.
3685  *
3686  * primitive_convert drops the converted part of source_buffer.
3687  * the dropped part is converted in destination_buffer or
3688  * buffered in Encoding::Converter object.
3689  *
 * primitive_convert stops conversion when one of the following conditions is met.
 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
 *   +primitive_errinfo+ and +last_error+ methods return the detail of the error.
 * - unexpected end of source buffer (:incomplete_input)
 *   this occurs only when :partial_input is not specified.
 *   +primitive_errinfo+ and +last_error+ methods return the detail of the error.
 * - character not representable in output encoding (:undefined_conversion)
 *   +primitive_errinfo+ and +last_error+ methods return the detail of the error.
 * - after some output is generated, before input is done (:after_output)
 *   this occurs only when :after_output is specified.
 * - destination buffer is full (:destination_buffer_full)
 *   this occurs only when destination_bytesize is non-nil.
 * - source buffer is empty (:source_buffer_empty)
 *   this occurs only when :partial_input is specified.
3704  * - conversion is finished (:finished)
3705  *
3706  * example:
3707  *   ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3708  *   ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3709  *   p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3710  *
3711  *   ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3712  *   ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3713  *   p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3714  *   ret = ec.primitive_convert(src, dst="", nil, 1)
3715  *   p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3716  *   ret = ec.primitive_convert(src, dst="", nil, 1)
3717  *   p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3718  *   ret = ec.primitive_convert(src, dst="", nil, 1)
3719  *   p [ret, src, dst] #=> [:finished, "", "i"]
3720  *
3721  */
3722 static VALUE
3723 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3724 {
3725     VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3726     rb_econv_t *ec = check_econv(self);
3727     rb_econv_result_t res;
3728     const unsigned char *ip, *is;
3729     unsigned char *op, *os;
3730     long output_byteoffset, output_bytesize;
3731     unsigned long output_byteend;
3732     int flags;
3733
3734     argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3735
3736     if (NIL_P(output_byteoffset_v))
3737         output_byteoffset = 0; /* dummy */
3738     else
3739         output_byteoffset = NUM2LONG(output_byteoffset_v);
3740
3741     if (NIL_P(output_bytesize_v))
3742         output_bytesize = 0; /* dummy */
3743     else
3744         output_bytesize = NUM2LONG(output_bytesize_v);
3745
3746     if (!NIL_P(flags_v)) {
3747         if (!NIL_P(opt)) {
3748             rb_error_arity(argc + 1, 2, 5);
3749         }
3750         flags = NUM2INT(rb_to_int(flags_v));
3751     }
3752     else if (!NIL_P(opt)) {
3753         VALUE v;
3754         flags = 0;
3755         v = rb_hash_aref(opt, sym_partial_input);
3756         if (RTEST(v))
3757             flags |= ECONV_PARTIAL_INPUT;
3758         v = rb_hash_aref(opt, sym_after_output);
3759         if (RTEST(v))
3760             flags |= ECONV_AFTER_OUTPUT;
3761     }
3762     else {
3763         flags = 0;
3764     }
3765
3766     StringValue(output);
3767     if (!NIL_P(input))
3768         StringValue(input);
3769     rb_str_modify(output);
3770
3771     if (NIL_P(output_bytesize_v)) {
3772 #if USE_RVARGC
3773         output_bytesize = rb_str_capacity(output);
3774 #else
3775         output_bytesize = RSTRING_EMBED_LEN_MAX;
3776 #endif
3777         if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3778             output_bytesize = RSTRING_LEN(input);
3779     }
3780
3781   retry:
3782
3783     if (NIL_P(output_byteoffset_v))
3784         output_byteoffset = RSTRING_LEN(output);
3785
3786     if (output_byteoffset < 0)
3787         rb_raise(rb_eArgError, "negative output_byteoffset");
3788
3789     if (RSTRING_LEN(output) < output_byteoffset)
3790         rb_raise(rb_eArgError, "output_byteoffset too big");
3791
3792     if (output_bytesize < 0)
3793         rb_raise(rb_eArgError, "negative output_bytesize");
3794
3795     output_byteend = (unsigned long)output_byteoffset +
3796                      (unsigned long)output_bytesize;
3797
3798     if (output_byteend < (unsigned long)output_byteoffset ||
3799         LONG_MAX < output_byteend)
3800         rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3801
3802     if (rb_str_capacity(output) < output_byteend)
3803         rb_str_resize(output, output_byteend);
3804
3805     if (NIL_P(input)) {
3806         ip = is = NULL;
3807     }
3808     else {
3809         ip = (const unsigned char *)RSTRING_PTR(input);
3810         is = ip + RSTRING_LEN(input);
3811     }
3812
3813     op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3814     os = op + output_bytesize;
3815
3816     res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3817     rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3818     if (!NIL_P(input)) {
3819         rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3820     }
3821
3822     if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3823         if (LONG_MAX / 2 < output_bytesize)
3824             rb_raise(rb_eArgError, "too long conversion result");
3825         output_bytesize *= 2;
3826         output_byteoffset_v = Qnil;
3827         goto retry;
3828     }
3829
3830     if (ec->destination_encoding) {
3831         rb_enc_associate(output, ec->destination_encoding);
3832     }
3833
3834     return econv_result_to_symbol(res);
3835 }
3836
3837 /*
3838  * call-seq:
3839  *   ec.convert(source_string) -> destination_string
3840  *
3841  * Convert source_string and return destination_string.
3842  *
3843  * source_string is assumed as a part of source.
3844  * i.e.  :partial_input=>true is specified internally.
3845  * finish method should be used last.
3846  *
3847  *   ec = Encoding::Converter.new("utf-8", "euc-jp")
3848  *   puts ec.convert("\u3042").dump     #=> "\xA4\xA2"
3849  *   puts ec.finish.dump                #=> ""
3850  *
3851  *   ec = Encoding::Converter.new("euc-jp", "utf-8")
3852  *   puts ec.convert("\xA4").dump       #=> ""
3853  *   puts ec.convert("\xA2").dump       #=> "\xE3\x81\x82"
3854  *   puts ec.finish.dump                #=> ""
3855  *
3856  *   ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3857  *   puts ec.convert("\xE3").dump       #=> "".force_encoding("ISO-2022-JP")
3858  *   puts ec.convert("\x81").dump       #=> "".force_encoding("ISO-2022-JP")
3859  *   puts ec.convert("\x82").dump       #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3860  *   puts ec.finish.dump                #=> "\e(B".force_encoding("ISO-2022-JP")
3861  *
 * If a conversion error occurs,
3863  * Encoding::UndefinedConversionError or
3864  * Encoding::InvalidByteSequenceError is raised.
3865  * Encoding::Converter#convert doesn't supply methods to recover or restart
3866  * from these exceptions.
3867  * When you want to handle these conversion errors,
3868  * use Encoding::Converter#primitive_convert.
3869  *
3870  */
3871 static VALUE
3872 econv_convert(VALUE self, VALUE source_string)
3873 {
3874     VALUE ret, dst;
3875     VALUE av[5];
3876     int ac;
3877     rb_econv_t *ec = check_econv(self);
3878
3879     StringValue(source_string);
3880
3881     dst = rb_str_new(NULL, 0);
3882
3883     av[0] = rb_str_dup(source_string);
3884     av[1] = dst;
3885     av[2] = Qnil;
3886     av[3] = Qnil;
3887     av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
3888     ac = 5;
3889
3890     ret = econv_primitive_convert(ac, av, self);
3891
3892     if (ret == sym_invalid_byte_sequence ||
3893         ret == sym_undefined_conversion ||
3894         ret == sym_incomplete_input) {
3895         VALUE exc = make_econv_exception(ec);
3896         rb_exc_raise(exc);
3897     }
3898
3899     if (ret == sym_finished) {
3900         rb_raise(rb_eArgError, "converter already finished");
3901     }
3902
3903     if (ret != sym_source_buffer_empty) {
3904         rb_bug("unexpected result of econv_primitive_convert");
3905     }
3906
3907     return dst;
3908 }
3909
3910 /*
3911  * call-seq:
3912  *   ec.finish -> string
3913  *
3914  * Finishes the converter.
3915  * It returns the last part of the converted string.
3916  *
3917  *   ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3918  *   p ec.convert("\u3042")     #=> "\e$B$\""
3919  *   p ec.finish                #=> "\e(B"
3920  */
3921 static VALUE
3922 econv_finish(VALUE self)
3923 {
3924     VALUE ret, dst;
3925     VALUE av[5];
3926     int ac;
3927     rb_econv_t *ec = check_econv(self);
3928
3929     dst = rb_str_new(NULL, 0);
3930
3931     av[0] = Qnil;
3932     av[1] = dst;
3933     av[2] = Qnil;
3934     av[3] = Qnil;
3935     av[4] = INT2FIX(0);
3936     ac = 5;
3937
3938     ret = econv_primitive_convert(ac, av, self);
3939
3940     if (ret == sym_invalid_byte_sequence ||
3941         ret == sym_undefined_conversion ||
3942         ret == sym_incomplete_input) {
3943         VALUE exc = make_econv_exception(ec);
3944         rb_exc_raise(exc);
3945     }
3946
3947     if (ret != sym_finished) {
3948         rb_bug("unexpected result of econv_primitive_convert");
3949     }
3950
3951     return dst;
3952 }
3953
3954 /*
3955  * call-seq:
3956  *   ec.primitive_errinfo -> array
3957  *
3958  * primitive_errinfo returns important information regarding the last error
3959  * as a 5-element array:
3960  *
3961  *   [result, enc1, enc2, error_bytes, readagain_bytes]
3962  *
3963  * result is the last result of primitive_convert.
3964  *
3965  * Other elements are only meaningful when result is
3966  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3967  *
3968  * enc1 and enc2 indicate a conversion step as a pair of strings.
3969  * For example, a converter from EUC-JP to ISO-8859-1 converts
3970  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3971  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3972  *
3973  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3974  * error_bytes is discarded portion.
3975  * readagain_bytes is buffered portion which is read again on next conversion.
3976  *
3977  * Example:
3978  *
3979  *   # \xff is invalid as EUC-JP.
3980  *   ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3981  *   ec.primitive_convert(src="\xff", dst="", nil, 10)
3982  *   p ec.primitive_errinfo
3983  *   #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
3984  *
3985  *   # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
 *   # Since this error occurs in UTF-8 to ISO-8859-1 conversion,
3987  *   # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3988  *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3989  *   ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3990  *   p ec.primitive_errinfo
3991  *   #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3992  *
3993  *   # partial character is invalid
3994  *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3995  *   ec.primitive_convert(src="\xa4", dst="", nil, 10)
3996  *   p ec.primitive_errinfo
3997  *   #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3998  *
3999  *   # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4000  *   # partial characters.
4001  *   ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4002  *   ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4003  *   p ec.primitive_errinfo
4004  *   #=> [:source_buffer_empty, nil, nil, nil, nil]
4005  *
4006  *   # \xd8\x00\x00@ is invalid as UTF-16BE because
4007  *   # no low surrogate after high surrogate (\xd8\x00).
 *   # It is detected by the 3rd byte (\x00) which is part of the next character.
4009  *   # So the high surrogate (\xd8\x00) is discarded and
4010  *   # the 3rd byte is read again later.
4011  *   # Since the byte is buffered in ec, it is dropped from src.
4012  *   ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4013  *   ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4014  *   p ec.primitive_errinfo
4015  *   #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4016  *   p src
4017  *   #=> "@"
4018  *
4019  *   # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4020  *   # The problem is detected by 4th byte.
4021  *   ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4022  *   ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4023  *   p ec.primitive_errinfo
4024  *   #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4025  *   p src
4026  *   #=> ""
4027  *
4028  */
4029 static VALUE
4030 econv_primitive_errinfo(VALUE self)
4031 {
4032     rb_econv_t *ec = check_econv(self);
4033
4034     VALUE ary;
4035
4036     ary = rb_ary_new2(5);
4037
4038     rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4039     rb_ary_store(ary, 4, Qnil);
4040
4041     if (ec->last_error.source_encoding)
4042         rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4043
4044     if (ec->last_error.destination_encoding)
4045         rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4046
4047     if (ec->last_error.error_bytes_start) {
4048         rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4049         rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4050     }
4051
4052     return ary;
4053 }
4054
4055 /*
4056  * call-seq:
4057  *   ec.insert_output(string) -> nil
4058  *
4059  * Inserts string into the encoding converter.
4060  * The string will be converted to the destination encoding and
4061  * output on later conversions.
4062  *
4063  * If the destination encoding is stateful,
4064  * string is converted according to the state and the state is updated.
4065  *
4066  * This method should be used only when a conversion error occurs.
4067  *
4068  *  ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4069  *  src = "HIRAGANA LETTER A is \u{3042}."
4070  *  dst = ""
4071  *  p ec.primitive_convert(src, dst)    #=> :undefined_conversion
4072  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["HIRAGANA LETTER A is ", "."]
4073  *  ec.insert_output("<err>")
4074  *  p ec.primitive_convert(src, dst)    #=> :finished
4075  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["HIRAGANA LETTER A is <err>.", ""]
4076  *
4077  *  ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4078  *  src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4079  *  dst = ""
4080  *  p ec.primitive_convert(src, dst)    #=> :undefined_conversion
4081  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4082  *  ec.insert_output "?"                # state change required to output "?".
4083  *  p ec.primitive_convert(src, dst)    #=> :finished
4084  *  puts "[#{dst.dump}, #{src.dump}]"   #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4085  *
4086  */
4087 static VALUE
4088 econv_insert_output(VALUE self, VALUE string)
4089 {
4090     const char *insert_enc;
4091
4092     int ret;
4093
4094     rb_econv_t *ec = check_econv(self);
4095
4096     StringValue(string);
4097     insert_enc = rb_econv_encoding_to_insert_output(ec);
4098     string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4099
4100     ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4101     if (ret == -1) {
4102         rb_raise(rb_eArgError, "too big string");
4103     }
4104
4105     return Qnil;
4106 }
4107
4108 /*
4109  * call-seq:
4110  *   ec.putback                    -> string
4111  *   ec.putback(max_numbytes)      -> string
4112  *
4113  * Put back the bytes which will be converted.
4114  *
 * Such bytes are left over after an invalid_byte_sequence error.
 * When an invalid_byte_sequence error occurs, some bytes are discarded and
4117  * some bytes are buffered to be converted later.
4118  * The latter bytes can be put back.
4119  * It can be observed by
4120  * Encoding::InvalidByteSequenceError#readagain_bytes and
4121  * Encoding::Converter#primitive_errinfo.
4122  *
4123  *   ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4124  *   src = "\x00\xd8\x61\x00"
4125  *   dst = ""
4126  *   p ec.primitive_convert(src, dst)   #=> :invalid_byte_sequence
4127  *   p ec.primitive_errinfo     #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4128  *   p ec.putback               #=> "a\x00"
4129  *   p ec.putback               #=> ""          # no more bytes to put back
4130  *
4131  */
4132 static VALUE
4133 econv_putback(int argc, VALUE *argv, VALUE self)
4134 {
4135     rb_econv_t *ec = check_econv(self);
4136     int n;
4137     int putbackable;
4138     VALUE str, max;
4139
4140     if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4141         n = rb_econv_putbackable(ec);
4142     }
4143     else {
4144         n = NUM2INT(max);
4145         putbackable = rb_econv_putbackable(ec);
4146         if (putbackable < n)
4147             n = putbackable;
4148     }
4149
4150     str = rb_str_new(NULL, n);
4151     rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4152
4153     if (ec->source_encoding) {
4154         rb_enc_associate(str, ec->source_encoding);
4155     }
4156
4157     return str;
4158 }
4159
4160 /*
4161  * call-seq:
4162  *   ec.last_error -> exception or nil
4163  *
4164  * Returns an exception object for the last conversion.
4165  * Returns nil if the last conversion did not produce an error.
4166  *
4167  * "error" means that
4168  * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4169  * Encoding::Converter#convert and
4170  * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4171  * Encoding::Converter#primitive_convert.
4172  *
4173  *  ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4174  *  p ec.primitive_convert(src="\xf1abcd", dst="")       #=> :invalid_byte_sequence
4175  *  p ec.last_error      #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4176  *  p ec.primitive_convert(src, dst, nil, 1)             #=> :destination_buffer_full
4177  *  p ec.last_error      #=> nil
4178  *
4179  */
4180 static VALUE
4181 econv_last_error(VALUE self)
4182 {
4183     rb_econv_t *ec = check_econv(self);
4184     VALUE exc;
4185
4186     exc = make_econv_exception(ec);
4187     if (NIL_P(exc))
4188         return Qnil;
4189     return exc;
4190 }
4191
4192 /*
4193  * call-seq:
4194  *   ec.replacement -> string
4195  *
4196  * Returns the replacement string.
4197  *
4198  *  ec = Encoding::Converter.new("euc-jp", "us-ascii")
4199  *  p ec.replacement    #=> "?"
4200  *
4201  *  ec = Encoding::Converter.new("euc-jp", "utf-8")
4202  *  p ec.replacement    #=> "\uFFFD"
4203  */
4204 static VALUE
4205 econv_get_replacement(VALUE self)
4206 {
4207     rb_econv_t *ec = check_econv(self);
4208     int ret;
4209     rb_encoding *enc;
4210
4211     ret = make_replacement(ec);
4212     if (ret == -1) {
4213         rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4214     }
4215
4216     enc = rb_enc_find(ec->replacement_enc);
4217     return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4218 }
4219
4220 /*
4221  * call-seq:
4222  *   ec.replacement = string
4223  *
4224  * Sets the replacement string.
4225  *
4226  *  ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4227  *  ec.replacement = "<undef>"
4228  *  p ec.convert("a \u3042 b")      #=> "a <undef> b"
4229  */
4230 static VALUE
4231 econv_set_replacement(VALUE self, VALUE arg)
4232 {
4233     rb_econv_t *ec = check_econv(self);
4234     VALUE string = arg;
4235     int ret;
4236     rb_encoding *enc;
4237
4238     StringValue(string);
4239     enc = rb_enc_get(string);
4240
4241     ret = rb_econv_set_replacement(ec,
4242             (const unsigned char *)RSTRING_PTR(string),
4243             RSTRING_LEN(string),
4244             rb_enc_name(enc));
4245
4246     if (ret == -1) {
4247         /* xxx: rb_eInvalidByteSequenceError? */
4248         rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4249     }
4250
4251     return arg;
4252 }
4253
4254 VALUE
4255 rb_econv_make_exception(rb_econv_t *ec)
4256 {
4257     return make_econv_exception(ec);
4258 }
4259
4260 void
4261 rb_econv_check_error(rb_econv_t *ec)
4262 {
4263     VALUE exc;
4264
4265     exc = make_econv_exception(ec);
4266     if (NIL_P(exc))
4267         return;
4268     rb_exc_raise(exc);
4269 }
4270
4271 /*
4272  * call-seq:
4273  *   ecerr.source_encoding_name         -> string
4274  *
4275  * Returns the source encoding name as a string.
4276  */
4277 static VALUE
4278 ecerr_source_encoding_name(VALUE self)
4279 {
4280     return rb_attr_get(self, id_source_encoding_name);
4281 }
4282
4283 /*
4284  * call-seq:
4285  *   ecerr.source_encoding              -> encoding
4286  *
4287  * Returns the source encoding as an encoding object.
4288  *
4289  * Note that the result may not be equal to the source encoding of
4290  * the encoding converter if the conversion has multiple steps.
4291  *
4292  *  ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4293  *  begin
4294  *    ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4295  *  rescue Encoding::UndefinedConversionError
4296  *    p $!.source_encoding              #=> #<Encoding:UTF-8>
4297  *    p $!.destination_encoding         #=> #<Encoding:EUC-JP>
4298  *    p $!.source_encoding_name         #=> "UTF-8"
4299  *    p $!.destination_encoding_name    #=> "EUC-JP"
4300  *  end
4301  *
4302  */
4303 static VALUE
4304 ecerr_source_encoding(VALUE self)
4305 {
4306     return rb_attr_get(self, id_source_encoding);
4307 }
4308
4309 /*
4310  * call-seq:
4311  *   ecerr.destination_encoding_name         -> string
4312  *
4313  * Returns the destination encoding name as a string.
4314  */
4315 static VALUE
4316 ecerr_destination_encoding_name(VALUE self)
4317 {
4318     return rb_attr_get(self, id_destination_encoding_name);
4319 }
4320
4321 /*
4322  * call-seq:
 *   ecerr.destination_encoding         -> encoding
4324  *
4325  * Returns the destination encoding as an encoding object.
4326  */
4327 static VALUE
4328 ecerr_destination_encoding(VALUE self)
4329 {
4330     return rb_attr_get(self, id_destination_encoding);
4331 }
4332
4333 /*
4334  * call-seq:
4335  *   ecerr.error_char         -> string
4336  *
 * Returns the one-character string which caused Encoding::UndefinedConversionError.
4338  *
4339  *  ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4340  *  begin
4341  *    ec.convert("\xa0")
4342  *  rescue Encoding::UndefinedConversionError
4343  *    puts $!.error_char.dump   #=> "\xC2\xA0"
4344  *    p $!.error_char.encoding  #=> #<Encoding:UTF-8>
4345  *  end
4346  *
4347  */
4348 static VALUE
4349 ecerr_error_char(VALUE self)
4350 {
4351     return rb_attr_get(self, id_error_char);
4352 }
4353
4354 /*
4355  * call-seq:
4356  *   ecerr.error_bytes         -> string
4357  *
4358  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4359  *
4360  *  ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4361  *  begin
4362  *    ec.convert("abc\xA1\xFFdef")
4363  *  rescue Encoding::InvalidByteSequenceError
4364  *    p $!      #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4365  *    puts $!.error_bytes.dump          #=> "\xA1"
4366  *    puts $!.readagain_bytes.dump      #=> "\xFF"
4367  *  end
4368  */
4369 static VALUE
4370 ecerr_error_bytes(VALUE self)
4371 {
4372     return rb_attr_get(self, id_error_bytes);
4373 }
4374
4375 /*
4376  * call-seq:
4377  *   ecerr.readagain_bytes         -> string
4378  *
4379  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4380  */
4381 static VALUE
4382 ecerr_readagain_bytes(VALUE self)
4383 {
4384     return rb_attr_get(self, id_readagain_bytes);
4385 }
4386
4387 /*
4388  * call-seq:
4389  *   ecerr.incomplete_input?         -> true or false
4390  *
4391  * Returns true if the invalid byte sequence error is caused by
4392  * premature end of string.
4393  *
4394  *  ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4395  *
4396  *  begin
4397  *    ec.convert("abc\xA1z")
4398  *  rescue Encoding::InvalidByteSequenceError
4399  *    p $!      #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4400  *    p $!.incomplete_input?    #=> false
4401  *  end
4402  *
4403  *  begin
4404  *    ec.convert("abc\xA1")
4405  *    ec.finish
4406  *  rescue Encoding::InvalidByteSequenceError
4407  *    p $!      #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4408  *    p $!.incomplete_input?    #=> true
4409  *  end
4410  */
4411 static VALUE
4412 ecerr_incomplete_input(VALUE self)
4413 {
4414     return rb_attr_get(self, id_incomplete_input);
4415 }
4416
4417 /*
4418  *  Document-class: Encoding::UndefinedConversionError
4419  *
4420  *  Raised by Encoding and String methods when a transcoding operation
4421  *  fails.
4422  */
4423
4424 /*
4425  *  Document-class: Encoding::InvalidByteSequenceError
4426  *
4427  *  Raised by Encoding and String methods when the string being
 *  transcoded contains a byte invalid for either the source or
4429  *  target encoding.
4430  */
4431
4432 /*
4433  *  Document-class: Encoding::ConverterNotFoundError
4434  *
4435  *  Raised by transcoding methods when a named encoding does not
4436  *  correspond with a known converter.
4437  */
4438
4439 void
4440 Init_transcode(void)
4441 {
4442     transcoder_table = st_init_strcasetable();
4443
4444     id_destination_encoding = rb_intern_const("destination_encoding");
4445     id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4446     id_error_bytes = rb_intern_const("error_bytes");
4447     id_error_char = rb_intern_const("error_char");
4448     id_incomplete_input = rb_intern_const("incomplete_input");
4449     id_readagain_bytes = rb_intern_const("readagain_bytes");
4450     id_source_encoding = rb_intern_const("source_encoding");
4451     id_source_encoding_name = rb_intern_const("source_encoding_name");
4452
4453     sym_invalid = ID2SYM(rb_intern_const("invalid"));
4454     sym_undef = ID2SYM(rb_intern_const("undef"));
4455     sym_replace = ID2SYM(rb_intern_const("replace"));
4456     sym_fallback = ID2SYM(rb_intern_const("fallback"));
4457     sym_xml = ID2SYM(rb_intern_const("xml"));
4458     sym_text = ID2SYM(rb_intern_const("text"));
4459     sym_attr = ID2SYM(rb_intern_const("attr"));
4460
4461     sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4462     sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4463     sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4464     sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4465     sym_finished = ID2SYM(rb_intern_const("finished"));
4466     sym_after_output = ID2SYM(rb_intern_const("after_output"));
4467     sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4468     sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4469     sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4470     sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4471     sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4472
4473 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4474     sym_newline = ID2SYM(rb_intern_const("newline"));
4475     sym_universal = ID2SYM(rb_intern_const("universal"));
4476     sym_crlf = ID2SYM(rb_intern_const("crlf"));
4477     sym_cr = ID2SYM(rb_intern_const("cr"));
4478     sym_lf = ID2SYM(rb_intern_const("lf"));
4479 #endif
4480
4481     InitVM(transcode);
4482 }
4483
/*
 * Registers the Ruby-visible transcoding API.
 *
 * In order, this function:
 *  - defines three exception classes under rb_cEncoding (the "Encoding"
 *    class), each inheriting from rb_eEncodingError, and stores them in
 *    the file-scope statics rb_eUndefinedConversionError,
 *    rb_eInvalidByteSequenceError and rb_eConverterNotFoundError;
 *  - defines the "encode" and "encode!" methods on rb_cString;
 *  - defines the "Converter" class under rb_cEncoding (stored in the
 *    global rb_cEncodingConverter) together with its allocator, two
 *    singleton methods, its instance methods and its ECONV_* constants;
 *  - attaches reader methods to UndefinedConversionError and
 *    InvalidByteSequenceError (none are added to ConverterNotFoundError
 *    here);
 *  - calls Init_newline().
 *
 * The last argument of every rb_define_method /
 * rb_define_singleton_method call is the method arity; per the Ruby C
 * API, -1 registers a handler taking a variable number of arguments in
 * (argc, argv, self) form.
 *
 * The preceding initializer ends with InitVM(transcode), which presumably
 * expands to a call to this function -- confirm against the InitVM macro.
 *
 * The order of the statements is significant: each class must be defined
 * before methods or constants are attached to it.
 */
4484 void
4485 InitVM_transcode(void)
4486 {
         /* Conversion error classes, all under Encoding and derived from
          * EncodingError. */
4487     rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4488     rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4489     rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4490
         /* String entry points; both accept a variable argument list. */
4491     rb_define_method(rb_cString, "encode", str_encode, -1);
4492     rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4493
         /* Encoding::Converter: class, allocator, singleton methods and
          * instance methods. */
4494     rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4495     rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4496     rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4497     rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4498     rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4499     rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4500     rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4501     rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4502     rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4503     rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4504     rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4505     rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4506     rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4507     rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4508     rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4509     rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4510     rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4511     rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4512     rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4513
         /* Encoding::Converter constants.  Each one exposes the C-level
          * ECONV_* value of the same name as a Fixnum (INT2FIX).  The
          * "Document-const:" comments are the descriptions attached to the
          * individual constants. */
4514     /* Document-const: INVALID_MASK
4515      *
4516      * Mask for invalid byte sequences
4517      */
4518     rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4519
4520     /* Document-const: INVALID_REPLACE
4521      *
4522      * Replace invalid byte sequences
4523      */
4524     rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4525
4526     /* Document-const: UNDEF_MASK
4527      *
4528      * Mask for a valid character in the source encoding but no related
4529      * character(s) in destination encoding.
4530      */
4531     rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4532
4533     /* Document-const: UNDEF_REPLACE
4534      *
4535      * Replace byte sequences that are undefined in the destination encoding.
4536      */
4537     rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4538
4539     /* Document-const: UNDEF_HEX_CHARREF
4540      *
4541      * Replace byte sequences that are undefined in the destination encoding
4542      * with an XML hexadecimal character reference.  This is valid for XML
4543      * conversion.
4544      */
4545     rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4546
4547     /* Document-const: PARTIAL_INPUT
4548      *
4549      * Indicates the source may be part of a larger string.  See
4550      * primitive_convert for an example.
4551      */
4552     rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4553
4554     /* Document-const: AFTER_OUTPUT
4555      *
4556      * Stop converting after some output is complete but before all of the
4557      * input was consumed.  See primitive_convert for an example.
4558      */
4559     rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4560
4561     /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4562      *
4563      * Decorator for converting CRLF and CR to LF
4564      */
4565     rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4566
4567     /* Document-const: CRLF_NEWLINE_DECORATOR
4568      *
4569      * Decorator for converting LF to CRLF
4570      */
4571     rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4572
4573     /* Document-const: CR_NEWLINE_DECORATOR
4574      *
4575      * Decorator for converting LF to CR
4576      */
4577     rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4578
4579     /* Document-const: XML_TEXT_DECORATOR
4580      *
4581      * Escape as XML CharData
4582      */
4583     rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4584
4585     /* Document-const: XML_ATTR_CONTENT_DECORATOR
4586      *
4587      * Escape as XML AttValue
4588      */
4589     rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4590
         /* NOTE(review): the description below is word-for-word the same as
          * the one for XML_ATTR_CONTENT_DECORATOR.  Presumably this
          * decorator deals with the quotes around the attribute value
          * rather than with its content -- confirm and differentiate the
          * two descriptions. */
4591     /* Document-const: XML_ATTR_QUOTE_DECORATOR
4592      *
4593      * Escape as XML AttValue
4594      */
4595     rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4596
         /* Readers on UndefinedConversionError.  The first four handlers
          * are the same functions registered on InvalidByteSequenceError
          * below. */
4597     rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4598     rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4599     rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4600     rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4601     rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4602
         /* Readers on InvalidByteSequenceError: the four shared encoding
          * readers plus error_bytes, readagain_bytes and
          * incomplete_input?. */
4603     rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4604     rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4605     rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4606     rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4607     rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4608     rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4609     rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4610
         /* Runs last, after all of the classes above exist.  Init_newline
          * is defined outside this view; presumably it sets up the newline
          * converters -- confirm. */
4611     Init_newline();
4612 }