* 2022-01-18 [ci skip]
[ruby-80x24.org.git] / transcode.c
blobd7011443f87831239ca7ec0b88cd865a58f1a754
1 /**********************************************************************
3 transcode.c -
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
8 Copyright (C) 2007 Martin Duerst
10 **********************************************************************/
12 #include "ruby/internal/config.h"
14 #include <ctype.h>
16 #include "internal.h"
17 #include "internal/array.h"
18 #include "internal/inits.h"
19 #include "internal/object.h"
20 #include "internal/string.h"
21 #include "internal/transcode.h"
22 #include "ruby/encoding.h"
24 #include "transcode_data.h"
25 #include "id.h"
/* Feature switch: while defined, the newline option symbols below are declared. */
#define ENABLE_ECONV_NEWLINE_OPTION 1

/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */

/*
 * File-local VALUE handles.  Presumably exception classes (rb_e prefix);
 * they are assigned outside the part of the file shown here -- TODO confirm.
 */
static VALUE rb_eUndefinedConversionError;
static VALUE rb_eInvalidByteSequenceError;
static VALUE rb_eConverterNotFoundError;

/* Not static: this handle has external linkage. */
VALUE rb_cEncodingConverter;

/* Interned IDs, initialized elsewhere in the file. */
static ID id_destination_encoding;
static ID id_destination_encoding_name;
static ID id_error_bytes;
static ID id_error_char;
static ID id_incomplete_input;
static ID id_readagain_bytes;
static ID id_source_encoding;
static ID id_source_encoding_name;

/* Symbol VALUEs, initialized elsewhere in the file. */
static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
static VALUE sym_xml, sym_text, sym_attr;
static VALUE sym_universal_newline;
static VALUE sym_crlf_newline;
static VALUE sym_cr_newline;
#ifdef ENABLE_ECONV_NEWLINE_OPTION
static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
#endif
static VALUE sym_partial_input;

/* One symbol per rb_econv_result_t outcome name (presumably; verify at the init site). */
static VALUE sym_invalid_byte_sequence;
static VALUE sym_undefined_conversion;
static VALUE sym_destination_buffer_full;
static VALUE sym_source_buffer_empty;
static VALUE sym_finished;
static VALUE sym_after_output;
static VALUE sym_incomplete_input;

/* Forward declaration; the definition is not in this part of the file. */
static unsigned char *
allocate_converted_string(const char *sname, const char *dname,
        const unsigned char *str, size_t len,
        unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
        size_t *dst_len_ptr);
/* dynamic structure, one per conversion (similar to iconv_t) */
/* may carry conversion state (e.g. for iso-2022-jp) */
typedef struct rb_transcoding {
    const rb_transcoder *transcoder;    /* static description of the conversion being run */

    int flags;                          /* stored at open time */

    /*
     * Saved position of the resumable engine (transcode_restartable0):
     * resume_position selects the label to jump back to, and the next_*
     * and output_index fields hold the lookup state that label needs.
     */
    int resume_position;
    unsigned int next_table;
    VALUE next_info;
    unsigned char next_byte;
    unsigned int output_index;

    ssize_t recognized_len; /* already interpreted */
    ssize_t readagain_len; /* not yet interpreted */
    /* Which member is live is decided by TRANSCODING_READBUF(). */
    union {
        unsigned char ary[8]; /* max_input <= sizeof(ary) */
        unsigned char *ptr; /* length: max_input */
    } readbuf; /* recognized_len + readagain_len used */

    /* Pending output bytes: writebuf[writebuf_off .. writebuf_len) is still to be emitted. */
    ssize_t writebuf_off;
    ssize_t writebuf_len;
    /* Which member is live is decided by TRANSCODING_WRITEBUF(). */
    union {
        unsigned char ary[8]; /* max_output <= sizeof(ary) */
        unsigned char *ptr; /* length: max_output */
    } writebuf;

    union rb_transcoding_state_t { /* opaque data for stateful encoding */
        void *ptr;
        char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
        double dummy_for_alignment;
    } state;
} rb_transcoding;
/*
 * Accessors for the embedded-or-heap buffers of rb_transcoding.
 * Each one selects the inline array when the transcoder's declared size
 * fits in it and the heap pointer otherwise; the allocation and free code
 * uses the same size comparison.
 */

/* Read buffer: inline when max_input fits in readbuf.ary. */
#define TRANSCODING_READBUF(tc) \
    ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
     (tc)->readbuf.ary : \
     (tc)->readbuf.ptr)
/* Write buffer: inline when max_output fits in writebuf.ary. */
#define TRANSCODING_WRITEBUF(tc) \
    ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
     (tc)->writebuf.ary : \
     (tc)->writebuf.ptr)
/* Capacity of whichever write buffer TRANSCODING_WRITEBUF() selects. */
#define TRANSCODING_WRITEBUF_SIZE(tc) \
    ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
     sizeof((tc)->writebuf.ary) : \
     (size_t)(tc)->transcoder->max_output)
/* Largest state_size that is stored inline in the state union. */
#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* Pointer to the transcoder-private state, inline or heap. */
#define TRANSCODING_STATE(tc) \
    ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
     (tc)->state.ary : \
     (tc)->state.ptr)
/*
 * One stage of a converter pipeline: a transcoding instance plus the
 * buffer that receives its output.
 */
typedef struct {
    struct rb_transcoding *tc;
    unsigned char *out_buf_start;       /* start of the allocated buffer */
    unsigned char *out_data_start;      /* first byte of pending output */
    unsigned char *out_data_end;        /* one past the last byte of pending output */
    unsigned char *out_buf_end;         /* one past the end of the allocated buffer */
    rb_econv_result_t last_result;      /* result of this stage's latest convert call */
} rb_econv_elem_t;
/*
 * A converter: an ordered array of transcoding stages (elems) plus
 * replacement-string settings and a record of the last error.
 */
struct rb_econv_t {
    int flags;
    int started; /* bool */

    const char *source_encoding_name;
    const char *destination_encoding_name;

    const unsigned char *replacement_str;
    size_t replacement_len;
    const char *replacement_enc;

    unsigned char *in_buf_start;
    unsigned char *in_data_start;
    unsigned char *in_data_end;
    unsigned char *in_buf_end;
    rb_econv_elem_t *elems;             /* num_allocated slots, num_trans in use */
    int replacement_allocated; /* bool */
    int num_allocated;
    int num_trans;
    int num_finished;                   /* set to i+1 when stage i reports econv_finished */
    struct rb_transcoding *last_tc;     /* updated when a non-decorator stage is added */

    /* last error */
    struct {
        rb_econv_result_t result;
        struct rb_transcoding *error_tc;
        const char *source_encoding;
        const char *destination_encoding;
        const unsigned char *error_bytes_start;
        size_t error_bytes_len;
        size_t readagain_len;
    } last_error;

    /* The following fields are only for Encoding::Converter.
     * rb_econv_open set them NULL. */
    rb_encoding *source_encoding;
    rb_encoding *destination_encoding;
};
/*
 *  Dispatch data and logic
 */
/* A transcoder with an empty source name is a decorator; dname is not examined. */
#define DECORATOR_P(sname, dname) (*(sname) == '\0')

/* Registry record for one (source, destination) encoding pair. */
typedef struct {
    const char *sname;
    const char *dname;
    const char *lib; /* null means no need to load a library */
    const rb_transcoder *transcoder;    /* NULL until rb_register_transcoder() fills it in */
} transcoder_entry_t;

/*
 * Two-level registry: source name -> (st_table: destination name ->
 * transcoder_entry_t *).  Created outside the part of the file shown here.
 */
static st_table *transcoder_table;
183 static transcoder_entry_t *
184 make_transcoder_entry(const char *sname, const char *dname)
186 st_data_t val;
187 st_table *table2;
189 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
190 val = (st_data_t)st_init_strcasetable();
191 st_add_direct(transcoder_table, (st_data_t)sname, val);
193 table2 = (st_table *)val;
194 if (!st_lookup(table2, (st_data_t)dname, &val)) {
195 transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
196 entry->sname = sname;
197 entry->dname = dname;
198 entry->lib = NULL;
199 entry->transcoder = NULL;
200 val = (st_data_t)entry;
201 st_add_direct(table2, (st_data_t)dname, val);
203 return (transcoder_entry_t *)val;
206 static transcoder_entry_t *
207 get_transcoder_entry(const char *sname, const char *dname)
209 st_data_t val;
210 st_table *table2;
212 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
213 return NULL;
215 table2 = (st_table *)val;
216 if (!st_lookup(table2, (st_data_t)dname, &val)) {
217 return NULL;
219 return (transcoder_entry_t *)val;
222 void
223 rb_register_transcoder(const rb_transcoder *tr)
225 const char *const sname = tr->src_encoding;
226 const char *const dname = tr->dst_encoding;
228 transcoder_entry_t *entry;
230 entry = make_transcoder_entry(sname, dname);
231 if (entry->transcoder) {
232 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
233 sname, dname);
236 entry->transcoder = tr;
239 static void
240 declare_transcoder(const char *sname, const char *dname, const char *lib)
242 transcoder_entry_t *entry;
244 entry = make_transcoder_entry(sname, dname);
245 entry->lib = lib;
248 static const char transcoder_lib_prefix[] = "enc/trans/";
250 void
251 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
253 if (!lib) {
254 rb_raise(rb_eArgError, "invalid library name - (null)");
256 declare_transcoder(enc1, enc2, lib);
/* Encoding names compare equal when STRCASECMP() reports no difference. */
#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)

/* Singly linked FIFO node holding one encoding name still to be expanded. */
typedef struct search_path_queue_tag {
    struct search_path_queue_tag *next;
    const char *enc;
} search_path_queue_t;

/* Working state of the breadth-first search in transcode_search_path(). */
typedef struct {
    st_table *visited;                      /* encoding name -> name it was reached from */
    search_path_queue_t *queue;             /* head of the FIFO */
    search_path_queue_t **queue_last_ptr;   /* where the next node gets linked in */
    const char *base_enc;                   /* encoding currently being expanded */
} search_path_bfs_t;
273 static int
274 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
276 const char *dname = (const char *)key;
277 search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
278 search_path_queue_t *q;
280 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
281 return ST_CONTINUE;
284 q = ALLOC(search_path_queue_t);
285 q->enc = dname;
286 q->next = NULL;
287 *bfs->queue_last_ptr = q;
288 bfs->queue_last_ptr = &q->next;
290 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
291 return ST_CONTINUE;
294 static int
295 transcode_search_path(const char *sname, const char *dname,
296 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
297 void *arg)
299 search_path_bfs_t bfs;
300 search_path_queue_t *q;
301 st_data_t val;
302 st_table *table2;
303 int found;
304 int pathlen = -1;
306 if (encoding_equal(sname, dname))
307 return -1;
309 q = ALLOC(search_path_queue_t);
310 q->enc = sname;
311 q->next = NULL;
312 bfs.queue_last_ptr = &q->next;
313 bfs.queue = q;
315 bfs.visited = st_init_strcasetable();
316 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
318 while (bfs.queue) {
319 q = bfs.queue;
320 bfs.queue = q->next;
321 if (!bfs.queue)
322 bfs.queue_last_ptr = &bfs.queue;
324 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
325 xfree(q);
326 continue;
328 table2 = (st_table *)val;
330 if (st_lookup(table2, (st_data_t)dname, &val)) {
331 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
332 xfree(q);
333 found = 1;
334 goto cleanup;
337 bfs.base_enc = q->enc;
338 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
339 bfs.base_enc = NULL;
341 xfree(q);
343 found = 0;
345 cleanup:
346 while (bfs.queue) {
347 q = bfs.queue;
348 bfs.queue = q->next;
349 xfree(q);
352 if (found) {
353 const char *enc = dname;
354 int depth;
355 pathlen = 0;
356 while (1) {
357 st_lookup(bfs.visited, (st_data_t)enc, &val);
358 if (!val)
359 break;
360 pathlen++;
361 enc = (const char *)val;
363 depth = pathlen;
364 enc = dname;
365 while (1) {
366 st_lookup(bfs.visited, (st_data_t)enc, &val);
367 if (!val)
368 break;
369 callback((const char *)val, enc, --depth, arg);
370 enc = (const char *)val;
374 st_free_table(bfs.visited);
376 return pathlen; /* is -1 if not found */
379 int rb_require_internal_silent(VALUE fname);
381 static const rb_transcoder *
382 load_transcoder_entry(transcoder_entry_t *entry)
384 if (entry->transcoder)
385 return entry->transcoder;
387 if (entry->lib) {
388 const char *const lib = entry->lib;
389 const size_t len = strlen(lib);
390 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
391 const VALUE fn = rb_str_new(0, total_len);
392 char *const path = RSTRING_PTR(fn);
394 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
395 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
396 rb_str_set_len(fn, total_len);
397 OBJ_FREEZE(fn);
398 rb_require_internal_silent(fn);
401 if (entry->transcoder)
402 return entry->transcoder;
404 return NULL;
/*
 * Pick the default replacement character for the encoding `encname`.
 *
 * For UTF-8 (compared case-insensitively) this is the 3-byte sequence
 * EF BF BD, tagged as "UTF-8"; for every other encoding it is "?",
 * tagged as "US-ASCII".  The byte length goes to *len_ret and the tag to
 * *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
/*
 *  Transcoding engine logic
 */
426 static const unsigned char *
427 transcode_char_start(rb_transcoding *tc,
428 const unsigned char *in_start,
429 const unsigned char *inchar_start,
430 const unsigned char *in_p,
431 size_t *char_len_ptr)
433 const unsigned char *ptr;
434 if (inchar_start - in_start < tc->recognized_len) {
435 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
436 inchar_start, unsigned char, in_p - inchar_start);
437 ptr = TRANSCODING_READBUF(tc);
439 else {
440 ptr = inchar_start - tc->recognized_len;
442 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
443 return ptr;
/*
 * Core resumable conversion loop.
 *
 * Consumes bytes from *in_pos up to in_stop, follows the transcoder's
 * lookup tables (tr->byte_array / tr->word_array) one byte at a time and
 * writes the result to *out_pos up to out_stop.
 *
 * Every early return goes through SUSPEND(ret, num), which stores num in
 * tc->resume_position and plants resume_label<num> right after the return.
 * The next call dispatches on tc->resume_position and jumps back to that
 * label, so execution continues in the middle of whatever loop was active.
 * The names next_table, next_info, next_byte, writebuf_len and writebuf_off
 * are macros for fields of *tc, so they keep their values across a
 * suspension; the true locals are re-initialized on every call.
 *
 * in_pos, out_pos   updated to the consumed / produced positions by SUSPEND
 * opt               ECONV_PARTIAL_INPUT and ECONV_AFTER_OUTPUT are examined
 *
 * Returns the econv_* code given to the SUSPEND that ended the call.
 */
static rb_econv_result_t
transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
                      const unsigned char *in_stop, unsigned char *out_stop,
                      rb_transcoding *tc,
                      const int opt)
{
    const rb_transcoder *tr = tc->transcoder;
    int unitlen = tr->input_unit_length;
    ssize_t readagain_len = 0;

    const unsigned char *inchar_start;
    const unsigned char *in_p;

    unsigned char *out_p;

    in_p = inchar_start = *in_pos;

    out_p = *out_pos;

/*
 * Save the bytes of the current character read so far into the read
 * buffer, publish the in/out positions, account for bytes that must be
 * read again, return `ret`, and define the label at which a later call
 * resumes.
 */
#define SUSPEND(ret, num) \
    do { \
        tc->resume_position = (num); \
        if (0 < in_p - inchar_start) \
            MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
                   inchar_start, unsigned char, in_p - inchar_start); \
        *in_pos = in_p; \
        *out_pos = out_p; \
        tc->recognized_len += in_p - inchar_start; \
        if (readagain_len) { \
            tc->recognized_len -= readagain_len; \
            tc->readagain_len = readagain_len; \
        } \
        return (ret); \
        resume_label ## num:; \
    } while (0)
/* Suspend with "destination buffer full" until one output byte fits. */
#define SUSPEND_OBUF(num) \
    do { \
        while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
    } while (0)

/* With ECONV_AFTER_OUTPUT set, suspend as soon as this call has produced output. */
#define SUSPEND_AFTER_OUTPUT(num) \
    if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
        SUSPEND(econv_after_output, num); \
    }

#define next_table (tc->next_table)
#define next_info (tc->next_info)
#define next_byte (tc->next_byte)
#define writebuf_len (tc->writebuf_len)
#define writebuf_off (tc->writebuf_off)

    /* Re-enter at the point where the previous call suspended (0 = fresh start). */
    switch (tc->resume_position) {
      case 0: break;
      case 1: goto resume_label1;
      case 2: goto resume_label2;
      case 3: goto resume_label3;
      case 4: goto resume_label4;
      case 5: goto resume_label5;
      case 6: goto resume_label6;
      case 7: goto resume_label7;
      case 8: goto resume_label8;
      case 9: goto resume_label9;
      case 10: goto resume_label10;
      case 11: goto resume_label11;
      case 12: goto resume_label12;
      case 13: goto resume_label13;
      case 14: goto resume_label14;
      case 15: goto resume_label15;
      case 16: goto resume_label16;
      case 17: goto resume_label17;
      case 18: goto resume_label18;
      case 19: goto resume_label19;
      case 20: goto resume_label20;
      case 21: goto resume_label21;
      case 22: goto resume_label22;
      case 23: goto resume_label23;
      case 24: goto resume_label24;
      case 25: goto resume_label25;
      case 26: goto resume_label26;
      case 27: goto resume_label27;
      case 28: goto resume_label28;
      case 29: goto resume_label29;
      case 30: goto resume_label30;
      case 31: goto resume_label31;
      case 32: goto resume_label32;
      case 33: goto resume_label33;
      case 34: goto resume_label34;
    }

    /* One iteration per input character. */
    while (1) {
        inchar_start = in_p;
        tc->recognized_len = 0;
        next_table = tr->conv_tree_start;

        SUSPEND_AFTER_OUTPUT(24);

        if (in_stop <= in_p) {
            /* Out of input: finish unless the caller promised more. */
            if (!(opt & ECONV_PARTIAL_INPUT))
                break;
            SUSPEND(econv_source_buffer_empty, 7);
            continue;
        }

/* Lookup-table helpers; all of them index relative to the current next_table. */
#define BYTE_ADDR(index) (tr->byte_array + (index))
#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
#define BL_MIN_BYTE     (BL_BASE[0])
#define BL_MAX_BYTE     (BL_BASE[1])
#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])

        next_byte = (unsigned char)*in_p++;
      follow_byte:
        /* Bytes outside the table's [min, max] range are invalid. */
        if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
            next_info = INVALID;
        else {
            next_info = (VALUE)BL_ACTION(next_byte);
        }
      follow_info:
        /* The low five bits of next_info select the action. */
        switch (next_info & 0x1F) {
          case NOMAP:
            /* Copy the input character to the output unchanged, via the write buffer. */
            {
                const unsigned char *p = inchar_start;
                writebuf_off = 0;
                while (p < in_p) {
                    TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
                }
                writebuf_len = writebuf_off;
                writebuf_off = 0;
                while (writebuf_off < writebuf_len) {
                    SUSPEND_OBUF(3);
                    *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
                }
            }
            continue;
          case 0x00: case 0x04: case 0x08: case 0x0C:
          case 0x10: case 0x14: case 0x18: case 0x1C:
            /* next_info names another table: fetch one more byte and look it up there. */
            SUSPEND_AFTER_OUTPUT(25);
            while (in_p >= in_stop) {
                if (!(opt & ECONV_PARTIAL_INPUT))
                    goto incomplete;
                SUSPEND(econv_source_buffer_empty, 5);
            }
            next_byte = (unsigned char)*in_p++;
            next_table = (unsigned int)next_info;
            goto follow_byte;
          case ZERObt: /* drop input */
            continue;
          /* Fixed-length outputs: the bytes are extracted from next_info itself. */
          case ONEbt:
            SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
            continue;
          case TWObt:
            SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
            SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
            continue;
          case THREEbt:
            SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
            SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
            SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
            continue;
          case FOURbt:
            SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
            SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
            SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
            SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
            continue;
          case GB4bt:
            SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
            SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
            SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
            SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
            continue;
          case STR1:
            /* Output a string stored in byte_array; progress is kept in tc->output_index. */
            tc->output_index = 0;
            while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
                SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
                tc->output_index++;
            }
            continue;
          case FUNii:
            /* Transcoder function maps info -> info; act on its result. */
            next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
            goto follow_info;
          case FUNsi:
            /* Transcoder function maps the input character -> info; act on its result. */
            {
                const unsigned char *char_start;
                size_t char_len;
                char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
                goto follow_info;
            }
          /*
           * FUNio / FUNso / FUNsio: the transcoder function writes output
           * bytes.  It writes straight into the caller's buffer when at
           * least max_output bytes are free, otherwise into the write
           * buffer, which is then drained one byte at a time.
           */
          case FUNio:
            SUSPEND_OBUF(13);
            if (tr->max_output <= out_stop - out_p)
                out_p += tr->func_io(TRANSCODING_STATE(tc),
                    next_info, out_p, out_stop - out_p);
            else {
                writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
                    next_info,
                    TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
                writebuf_off = 0;
                while (writebuf_off < writebuf_len) {
                    SUSPEND_OBUF(20);
                    *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
                }
            }
            break;
          case FUNso:
            {
                const unsigned char *char_start;
                size_t char_len;
                SUSPEND_OBUF(14);
                if (tr->max_output <= out_stop - out_p) {
                    char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                    out_p += tr->func_so(TRANSCODING_STATE(tc),
                        char_start, (size_t)char_len,
                        out_p, out_stop - out_p);
                }
                else {
                    char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                    writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
                        char_start, (size_t)char_len,
                        TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
                    writebuf_off = 0;
                    while (writebuf_off < writebuf_len) {
                        SUSPEND_OBUF(22);
                        *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
                    }
                }
                break;
            }
          case FUNsio:
            {
                const unsigned char *char_start;
                size_t char_len;
                SUSPEND_OBUF(33);
                if (tr->max_output <= out_stop - out_p) {
                    char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                    out_p += tr->func_sio(TRANSCODING_STATE(tc),
                        char_start, (size_t)char_len, next_info,
                        out_p, out_stop - out_p);
                }
                else {
                    char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
                    writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
                        char_start, (size_t)char_len, next_info,
                        TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
                    writebuf_off = 0;
                    while (writebuf_off < writebuf_len) {
                        SUSPEND_OBUF(34);
                        *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
                    }
                }
                break;
            }
          case INVALID:
            if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
                /*
                 * Fewer than, or exactly, one input unit has been seen:
                 * extend the reported sequence up to a full unit, waiting
                 * for more input first when partial input is allowed.
                 */
                if (tc->recognized_len + (in_p - inchar_start) < unitlen)
                    SUSPEND_AFTER_OUTPUT(26);
                while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
                    in_p = in_stop;
                    SUSPEND(econv_source_buffer_empty, 8);
                }
                if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
                    in_p = in_stop;
                }
                else {
                    in_p = inchar_start + (unitlen - tc->recognized_len);
                }
            }
            else {
                /*
                 * More than one unit was consumed: discard whole units up
                 * to the one containing the offending byte and schedule
                 * the remainder to be read again.
                 */
                ssize_t invalid_len; /* including the last byte which causes invalid */
                ssize_t discard_len;
                invalid_len = tc->recognized_len + (in_p - inchar_start);
                discard_len = ((invalid_len - 1) / unitlen) * unitlen;
                readagain_len = invalid_len - discard_len;
            }
            goto invalid;
          case UNDEF:
            goto undef;
          default:
            rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
        }
        continue;

        /* Error exits: report, then start on the next character when called again. */
      invalid:
        SUSPEND(econv_invalid_byte_sequence, 1);
        continue;

      incomplete:
        SUSPEND(econv_incomplete_input, 27);
        continue;

      undef:
        SUSPEND(econv_undefined_conversion, 2);
        continue;
    }

    /* cleanup */
    if (tr->finish_func) {
        /* Let the transcoder emit trailing bytes, using the same direct-or-buffered scheme. */
        SUSPEND_OBUF(4);
        if (tr->max_output <= out_stop - out_p) {
            out_p += tr->finish_func(TRANSCODING_STATE(tc),
                out_p, out_stop - out_p);
        }
        else {
            writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
                TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
            writebuf_off = 0;
            while (writebuf_off < writebuf_len) {
                SUSPEND_OBUF(23);
                *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
            }
        }
    }
    /* Once finished, every further call reports econv_finished again. */
    while (1)
        SUSPEND(econv_finished, 6);
#undef SUSPEND
#undef next_table
#undef next_info
#undef next_byte
#undef writebuf_len
#undef writebuf_off
}
771 static rb_econv_result_t
772 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
773 const unsigned char *in_stop, unsigned char *out_stop,
774 rb_transcoding *tc,
775 const int opt)
777 if (tc->readagain_len) {
778 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
779 const unsigned char *readagain_pos = readagain_buf;
780 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
781 rb_econv_result_t res;
783 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
784 unsigned char, tc->readagain_len);
785 tc->readagain_len = 0;
786 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
787 if (res != econv_source_buffer_empty) {
788 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
789 readagain_pos, unsigned char, readagain_stop - readagain_pos);
790 tc->readagain_len += readagain_stop - readagain_pos;
791 return res;
794 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
797 static rb_transcoding *
798 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
800 rb_transcoding *tc;
802 tc = ALLOC(rb_transcoding);
803 tc->transcoder = tr;
804 tc->flags = flags;
805 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
806 tc->state.ptr = xmalloc(tr->state_size);
807 if (tr->state_init_func) {
808 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
810 tc->resume_position = 0;
811 tc->recognized_len = 0;
812 tc->readagain_len = 0;
813 tc->writebuf_len = 0;
814 tc->writebuf_off = 0;
815 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
816 tc->readbuf.ptr = xmalloc(tr->max_input);
818 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
819 tc->writebuf.ptr = xmalloc(tr->max_output);
821 return tc;
824 static rb_econv_result_t
825 rb_transcoding_convert(rb_transcoding *tc,
826 const unsigned char **input_ptr, const unsigned char *input_stop,
827 unsigned char **output_ptr, unsigned char *output_stop,
828 int flags)
830 return transcode_restartable(
831 input_ptr, output_ptr,
832 input_stop, output_stop,
833 tc, flags);
836 static void
837 rb_transcoding_close(rb_transcoding *tc)
839 const rb_transcoder *tr = tc->transcoder;
840 if (tr->state_fini_func) {
841 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
843 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
844 xfree(tc->state.ptr);
845 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
846 xfree(tc->readbuf.ptr);
847 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
848 xfree(tc->writebuf.ptr);
849 xfree(tc);
852 static size_t
853 rb_transcoding_memsize(rb_transcoding *tc)
855 size_t size = sizeof(rb_transcoding);
856 const rb_transcoder *tr = tc->transcoder;
858 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
859 size += tr->state_size;
861 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
862 size += tr->max_input;
864 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
865 size += tr->max_output;
867 return size;
870 static rb_econv_t *
871 rb_econv_alloc(int n_hint)
873 rb_econv_t *ec;
875 if (n_hint <= 0)
876 n_hint = 1;
878 ec = ALLOC(rb_econv_t);
879 ec->flags = 0;
880 ec->source_encoding_name = NULL;
881 ec->destination_encoding_name = NULL;
882 ec->started = 0;
883 ec->replacement_str = NULL;
884 ec->replacement_len = 0;
885 ec->replacement_enc = NULL;
886 ec->replacement_allocated = 0;
887 ec->in_buf_start = NULL;
888 ec->in_data_start = NULL;
889 ec->in_data_end = NULL;
890 ec->in_buf_end = NULL;
891 ec->num_allocated = n_hint;
892 ec->num_trans = 0;
893 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
894 ec->num_finished = 0;
895 ec->last_tc = NULL;
896 ec->last_error.result = econv_source_buffer_empty;
897 ec->last_error.error_tc = NULL;
898 ec->last_error.source_encoding = NULL;
899 ec->last_error.destination_encoding = NULL;
900 ec->last_error.error_bytes_start = NULL;
901 ec->last_error.error_bytes_len = 0;
902 ec->last_error.readagain_len = 0;
903 ec->source_encoding = NULL;
904 ec->destination_encoding = NULL;
905 return ec;
908 static int
909 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
911 int n, j;
912 int bufsize = 4096;
913 unsigned char *p;
915 if (ec->num_trans == ec->num_allocated) {
916 n = ec->num_allocated * 2;
917 REALLOC_N(ec->elems, rb_econv_elem_t, n);
918 ec->num_allocated = n;
921 p = xmalloc(bufsize);
923 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
925 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
926 ec->elems[i].out_buf_start = p;
927 ec->elems[i].out_buf_end = p + bufsize;
928 ec->elems[i].out_data_start = p;
929 ec->elems[i].out_data_end = p;
930 ec->elems[i].last_result = econv_source_buffer_empty;
932 ec->num_trans++;
934 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
935 for (j = ec->num_trans-1; i <= j; j--) {
936 rb_transcoding *tc = ec->elems[j].tc;
937 const rb_transcoder *tr2 = tc->transcoder;
938 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
939 ec->last_tc = tc;
940 break;
944 return 0;
947 static rb_econv_t *
948 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
950 rb_econv_t *ec;
951 int i, ret;
953 for (i = 0; i < n; i++) {
954 const rb_transcoder *tr;
955 tr = load_transcoder_entry(entries[i]);
956 if (!tr)
957 return NULL;
960 ec = rb_econv_alloc(n);
962 for (i = 0; i < n; i++) {
963 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
964 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
965 if (ret == -1) {
966 rb_econv_close(ec);
967 return NULL;
971 return ec;
974 struct trans_open_t {
975 transcoder_entry_t **entries;
976 int num_additional;
979 static void
980 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
982 struct trans_open_t *toarg = arg;
984 if (!toarg->entries) {
985 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
987 toarg->entries[depth] = get_transcoder_entry(sname, dname);
990 static rb_econv_t *
991 rb_econv_open0(const char *sname, const char *dname, int ecflags)
993 transcoder_entry_t **entries = NULL;
994 int num_trans;
995 rb_econv_t *ec;
997 /* Just check if sname and dname are defined */
998 /* (This check is needed?) */
999 if (*sname) rb_enc_find_index(sname);
1000 if (*dname) rb_enc_find_index(dname);
1002 if (*sname == '\0' && *dname == '\0') {
1003 num_trans = 0;
1004 entries = NULL;
1005 sname = dname = "";
1007 else {
1008 struct trans_open_t toarg;
1009 toarg.entries = NULL;
1010 toarg.num_additional = 0;
1011 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1012 entries = toarg.entries;
1013 if (num_trans < 0) {
1014 xfree(entries);
1015 return NULL;
1019 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1020 xfree(entries);
1021 if (!ec)
1022 return NULL;
1024 ec->flags = ecflags;
1025 ec->source_encoding_name = sname;
1026 ec->destination_encoding_name = dname;
1028 return ec;
1031 #define MAX_ECFLAGS_DECORATORS 32
1033 static int
1034 decorator_names(int ecflags, const char **decorators_ret)
1036 int num_decorators;
1038 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1039 case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
1040 case ECONV_CRLF_NEWLINE_DECORATOR:
1041 case ECONV_CR_NEWLINE_DECORATOR:
1042 case 0:
1043 break;
1044 default:
1045 return -1;
1048 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1049 (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
1050 return -1;
1052 num_decorators = 0;
1054 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1055 decorators_ret[num_decorators++] = "xml_text_escape";
1056 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1057 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1058 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1059 decorators_ret[num_decorators++] = "xml_attr_quote";
1061 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1062 decorators_ret[num_decorators++] = "crlf_newline";
1063 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1064 decorators_ret[num_decorators++] = "cr_newline";
1065 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
1066 decorators_ret[num_decorators++] = "universal_newline";
1068 return num_decorators;
1071 rb_econv_t *
1072 rb_econv_open(const char *sname, const char *dname, int ecflags)
1074 rb_econv_t *ec;
1075 int num_decorators;
1076 const char *decorators[MAX_ECFLAGS_DECORATORS];
1077 int i;
1079 num_decorators = decorator_names(ecflags, decorators);
1080 if (num_decorators == -1)
1081 return NULL;
1083 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1084 if (!ec)
1085 return NULL;
1087 for (i = 0; i < num_decorators; i++)
1088 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1089 rb_econv_close(ec);
1090 return NULL;
1093 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1095 return ec;
/*
 * Push data through the stages of the pipeline, from stage `start` to the
 * last one, repeating the pass for as long as any stage consumed input or
 * produced output.
 *
 * Stage 0 reads from the caller's input and the last stage writes to the
 * caller's output; every other hand-over goes through the producing
 * stage's own output buffer.
 *
 * Returns the index of the first stage whose result must be reported
 * (invalid byte sequence, incomplete input, undefined conversion or
 * after-output), or -1 when a full pass made no progress.
 */
static int
trans_sweep(rb_econv_t *ec,
    const unsigned char **input_ptr, const unsigned char *input_stop,
    unsigned char **output_ptr, unsigned char *output_stop,
    int flags,
    int start)
{
    int try;
    int i, f;

    const unsigned char **ipp, *is, *iold;
    unsigned char **opp, *os, *oold;
    rb_econv_result_t res;

    try = 1;
    while (try) {
        try = 0;
        for (i = start; i < ec->num_trans; i++) {
            rb_econv_elem_t *te = &ec->elems[i];

            /* Input: the caller's buffer for stage 0, else the previous stage's pending output. */
            if (i == 0) {
                ipp = input_ptr;
                is = input_stop;
            }
            else {
                rb_econv_elem_t *prev_te = &ec->elems[i-1];
                ipp = (const unsigned char **)&prev_te->out_data_start;
                is = prev_te->out_data_end;
            }

            /* Output: the caller's buffer for the last stage, else this stage's own buffer. */
            if (i == ec->num_trans-1) {
                opp = output_ptr;
                os = output_stop;
            }
            else {
                /* Slide pending data to the front of the buffer to maximize free space. */
                if (te->out_buf_start != te->out_data_start) {
                    ssize_t len = te->out_data_end - te->out_data_start;
                    ssize_t off = te->out_data_start - te->out_buf_start;
                    MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
                    te->out_data_start = te->out_buf_start;
                    te->out_data_end -= off;
                }
                opp = &te->out_data_end;
                os = te->out_buf_end;
            }

            f = flags;
            /* A stage whose upstream has not finished may still receive more input. */
            if (ec->num_finished != i)
                f |= ECONV_PARTIAL_INPUT;
            /*
             * After-output is honoured once, by stage 0 only: the flag is
             * then cleared and later passes start from stage 1.
             */
            if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
                start = 1;
                flags &= ~ECONV_AFTER_OUTPUT;
            }
            if (i != 0)
                f &= ~ECONV_AFTER_OUTPUT;
            iold = *ipp;
            oold = *opp;
            te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
            /* Any movement of either pointer means another pass may get further. */
            if (iold != *ipp || oold != *opp)
                try = 1;

            switch (res) {
              case econv_invalid_byte_sequence:
              case econv_incomplete_input:
              case econv_undefined_conversion:
              case econv_after_output:
                return i;

              case econv_destination_buffer_full:
              case econv_source_buffer_empty:
                break;

              case econv_finished:
                ec->num_finished = i+1;
                break;
            }
        }
    }
    return -1;
}
1179 static rb_econv_result_t
1180 rb_trans_conv(rb_econv_t *ec,
1181 const unsigned char **input_ptr, const unsigned char *input_stop,
1182 unsigned char **output_ptr, unsigned char *output_stop,
1183 int flags,
1184 int *result_position_ptr)
1186 int i;
1187 int needreport_index;
1188 int sweep_start;
1190 unsigned char empty_buf;
1191 unsigned char *empty_ptr = &empty_buf;
1193 if (!input_ptr) {
1194 input_ptr = (const unsigned char **)&empty_ptr;
1195 input_stop = empty_ptr;
1198 if (!output_ptr) {
1199 output_ptr = &empty_ptr;
1200 output_stop = empty_ptr;
1203 if (ec->elems[0].last_result == econv_after_output)
1204 ec->elems[0].last_result = econv_source_buffer_empty;
1206 for (i = ec->num_trans-1; 0 <= i; i--) {
1207 switch (ec->elems[i].last_result) {
1208 case econv_invalid_byte_sequence:
1209 case econv_incomplete_input:
1210 case econv_undefined_conversion:
1211 case econv_after_output:
1212 case econv_finished:
1213 sweep_start = i+1;
1214 goto found_needreport;
1216 case econv_destination_buffer_full:
1217 case econv_source_buffer_empty:
1218 break;
1220 default:
1221 rb_bug("unexpected transcode last result");
1225 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1227 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1228 (flags & ECONV_AFTER_OUTPUT)) {
1229 rb_econv_result_t res;
1231 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1232 (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
1233 result_position_ptr);
1235 if (res == econv_source_buffer_empty)
1236 return econv_after_output;
1237 return res;
1240 sweep_start = 0;
1242 found_needreport:
1244 do {
1245 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1246 sweep_start = needreport_index + 1;
1247 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1249 for (i = ec->num_trans-1; 0 <= i; i--) {
1250 if (ec->elems[i].last_result != econv_source_buffer_empty) {
1251 rb_econv_result_t res = ec->elems[i].last_result;
1252 if (res == econv_invalid_byte_sequence ||
1253 res == econv_incomplete_input ||
1254 res == econv_undefined_conversion ||
1255 res == econv_after_output) {
1256 ec->elems[i].last_result = econv_source_buffer_empty;
1258 if (result_position_ptr)
1259 *result_position_ptr = i;
1260 return res;
1263 if (result_position_ptr)
1264 *result_position_ptr = -1;
1265 return econv_source_buffer_empty;
1268 static rb_econv_result_t
1269 rb_econv_convert0(rb_econv_t *ec,
1270 const unsigned char **input_ptr, const unsigned char *input_stop,
1271 unsigned char **output_ptr, unsigned char *output_stop,
1272 int flags)
1274 rb_econv_result_t res;
1275 int result_position;
1276 int has_output = 0;
1278 memset(&ec->last_error, 0, sizeof(ec->last_error));
1280 if (ec->num_trans == 0) {
1281 size_t len;
1282 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1283 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1284 len = output_stop - *output_ptr;
1285 memcpy(*output_ptr, ec->in_data_start, len);
1286 *output_ptr = output_stop;
1287 ec->in_data_start += len;
1288 res = econv_destination_buffer_full;
1289 goto gotresult;
1291 len = ec->in_data_end - ec->in_data_start;
1292 memcpy(*output_ptr, ec->in_data_start, len);
1293 *output_ptr += len;
1294 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1295 if (flags & ECONV_AFTER_OUTPUT) {
1296 res = econv_after_output;
1297 goto gotresult;
1300 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1301 len = output_stop - *output_ptr;
1303 else {
1304 len = input_stop - *input_ptr;
1306 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1307 *(*output_ptr)++ = *(*input_ptr)++;
1308 res = econv_after_output;
1309 goto gotresult;
1311 memcpy(*output_ptr, *input_ptr, len);
1312 *output_ptr += len;
1313 *input_ptr += len;
1314 if (*input_ptr != input_stop)
1315 res = econv_destination_buffer_full;
1316 else if (flags & ECONV_PARTIAL_INPUT)
1317 res = econv_source_buffer_empty;
1318 else
1319 res = econv_finished;
1320 goto gotresult;
1323 if (ec->elems[ec->num_trans-1].out_data_start) {
1324 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1325 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1326 if (data_start != data_end) {
1327 size_t len;
1328 if (output_stop - *output_ptr < data_end - data_start) {
1329 len = output_stop - *output_ptr;
1330 memcpy(*output_ptr, data_start, len);
1331 *output_ptr = output_stop;
1332 ec->elems[ec->num_trans-1].out_data_start += len;
1333 res = econv_destination_buffer_full;
1334 goto gotresult;
1336 len = data_end - data_start;
1337 memcpy(*output_ptr, data_start, len);
1338 *output_ptr += len;
1339 ec->elems[ec->num_trans-1].out_data_start =
1340 ec->elems[ec->num_trans-1].out_data_end =
1341 ec->elems[ec->num_trans-1].out_buf_start;
1342 has_output = 1;
1346 if (ec->in_buf_start &&
1347 ec->in_data_start != ec->in_data_end) {
1348 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1349 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1350 if (res != econv_source_buffer_empty)
1351 goto gotresult;
1354 if (has_output &&
1355 (flags & ECONV_AFTER_OUTPUT) &&
1356 *input_ptr != input_stop) {
1357 input_stop = *input_ptr;
1358 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1359 if (res == econv_source_buffer_empty)
1360 res = econv_after_output;
1362 else if ((flags & ECONV_AFTER_OUTPUT) ||
1363 ec->num_trans == 1) {
1364 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1366 else {
1367 flags |= ECONV_AFTER_OUTPUT;
1368 do {
1369 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1370 } while (res == econv_after_output);
1373 gotresult:
1374 ec->last_error.result = res;
1375 if (res == econv_invalid_byte_sequence ||
1376 res == econv_incomplete_input ||
1377 res == econv_undefined_conversion) {
1378 rb_transcoding *error_tc = ec->elems[result_position].tc;
1379 ec->last_error.error_tc = error_tc;
1380 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1381 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1382 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1383 ec->last_error.error_bytes_len = error_tc->recognized_len;
1384 ec->last_error.readagain_len = error_tc->readagain_len;
1387 return res;
1390 static int output_replacement_character(rb_econv_t *ec);
1392 static int
1393 output_hex_charref(rb_econv_t *ec)
1395 int ret;
1396 unsigned char utfbuf[1024];
1397 const unsigned char *utf;
1398 size_t utf_len;
1399 int utf_allocated = 0;
1400 char charef_buf[16];
1401 const unsigned char *p;
1403 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1404 utf = ec->last_error.error_bytes_start;
1405 utf_len = ec->last_error.error_bytes_len;
1407 else {
1408 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1409 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1410 utfbuf, sizeof(utfbuf),
1411 &utf_len);
1412 if (!utf)
1413 return -1;
1414 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1415 utf_allocated = 1;
1418 if (utf_len % 4 != 0)
1419 goto fail;
1421 p = utf;
1422 while (4 <= utf_len) {
1423 unsigned int u = 0;
1424 u += p[0] << 24;
1425 u += p[1] << 16;
1426 u += p[2] << 8;
1427 u += p[3];
1428 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1430 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1431 if (ret == -1)
1432 goto fail;
1434 p += 4;
1435 utf_len -= 4;
1438 if (utf_allocated)
1439 xfree((void *)utf);
1440 return 0;
1442 fail:
1443 if (utf_allocated)
1444 xfree((void *)utf);
1445 return -1;
1448 rb_econv_result_t
1449 rb_econv_convert(rb_econv_t *ec,
1450 const unsigned char **input_ptr, const unsigned char *input_stop,
1451 unsigned char **output_ptr, unsigned char *output_stop,
1452 int flags)
1454 rb_econv_result_t ret;
1456 unsigned char empty_buf;
1457 unsigned char *empty_ptr = &empty_buf;
1459 ec->started = 1;
1461 if (!input_ptr) {
1462 input_ptr = (const unsigned char **)&empty_ptr;
1463 input_stop = empty_ptr;
1466 if (!output_ptr) {
1467 output_ptr = &empty_ptr;
1468 output_stop = empty_ptr;
1471 resume:
1472 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1474 if (ret == econv_invalid_byte_sequence ||
1475 ret == econv_incomplete_input) {
1476 /* deal with invalid byte sequence */
1477 /* todo: add more alternative behaviors */
1478 switch (ec->flags & ECONV_INVALID_MASK) {
1479 case ECONV_INVALID_REPLACE:
1480 if (output_replacement_character(ec) == 0)
1481 goto resume;
1485 if (ret == econv_undefined_conversion) {
1486 /* valid character in source encoding
1487 * but no related character(s) in destination encoding */
1488 /* todo: add more alternative behaviors */
1489 switch (ec->flags & ECONV_UNDEF_MASK) {
1490 case ECONV_UNDEF_REPLACE:
1491 if (output_replacement_character(ec) == 0)
1492 goto resume;
1493 break;
1495 case ECONV_UNDEF_HEX_CHARREF:
1496 if (output_hex_charref(ec) == 0)
1497 goto resume;
1498 break;
1502 return ret;
1505 const char *
1506 rb_econv_encoding_to_insert_output(rb_econv_t *ec)
1508 rb_transcoding *tc = ec->last_tc;
1509 const rb_transcoder *tr;
1511 if (tc == NULL)
1512 return "";
1514 tr = tc->transcoder;
1516 if (tr->asciicompat_type == asciicompat_encoder)
1517 return tr->src_encoding;
1518 return tr->dst_encoding;
1521 static unsigned char *
1522 allocate_converted_string(const char *sname, const char *dname,
1523 const unsigned char *str, size_t len,
1524 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1525 size_t *dst_len_ptr)
1527 unsigned char *dst_str;
1528 size_t dst_len;
1529 size_t dst_bufsize;
1531 rb_econv_t *ec;
1532 rb_econv_result_t res;
1534 const unsigned char *sp;
1535 unsigned char *dp;
1537 if (caller_dst_buf)
1538 dst_bufsize = caller_dst_bufsize;
1539 else if (len == 0)
1540 dst_bufsize = 1;
1541 else
1542 dst_bufsize = len;
1544 ec = rb_econv_open(sname, dname, 0);
1545 if (ec == NULL)
1546 return NULL;
1547 if (caller_dst_buf)
1548 dst_str = caller_dst_buf;
1549 else
1550 dst_str = xmalloc(dst_bufsize);
1551 dst_len = 0;
1552 sp = str;
1553 dp = dst_str+dst_len;
1554 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1555 dst_len = dp - dst_str;
1556 while (res == econv_destination_buffer_full) {
1557 if (SIZE_MAX/2 < dst_bufsize) {
1558 goto fail;
1560 dst_bufsize *= 2;
1561 if (dst_str == caller_dst_buf) {
1562 unsigned char *tmp;
1563 tmp = xmalloc(dst_bufsize);
1564 memcpy(tmp, dst_str, dst_bufsize/2);
1565 dst_str = tmp;
1567 else {
1568 dst_str = xrealloc(dst_str, dst_bufsize);
1570 dp = dst_str+dst_len;
1571 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1572 dst_len = dp - dst_str;
1574 if (res != econv_finished) {
1575 goto fail;
1577 rb_econv_close(ec);
1578 *dst_len_ptr = dst_len;
1579 return dst_str;
1581 fail:
1582 if (dst_str != caller_dst_buf)
1583 xfree(dst_str);
1584 rb_econv_close(ec);
1585 return NULL;
1588 /* result: 0:success -1:failure */
1590 rb_econv_insert_output(rb_econv_t *ec,
1591 const unsigned char *str, size_t len, const char *str_encoding)
1593 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1594 unsigned char insert_buf[4096];
1595 const unsigned char *insert_str = NULL;
1596 size_t insert_len;
1598 int last_trans_index;
1599 rb_transcoding *tc;
1601 unsigned char **buf_start_p;
1602 unsigned char **data_start_p;
1603 unsigned char **data_end_p;
1604 unsigned char **buf_end_p;
1606 size_t need;
1608 ec->started = 1;
1610 if (len == 0)
1611 return 0;
1613 if (encoding_equal(insert_encoding, str_encoding)) {
1614 insert_str = str;
1615 insert_len = len;
1617 else {
1618 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1619 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1620 if (insert_str == NULL)
1621 return -1;
1624 need = insert_len;
1626 last_trans_index = ec->num_trans-1;
1627 if (ec->num_trans == 0) {
1628 tc = NULL;
1629 buf_start_p = &ec->in_buf_start;
1630 data_start_p = &ec->in_data_start;
1631 data_end_p = &ec->in_data_end;
1632 buf_end_p = &ec->in_buf_end;
1634 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1635 tc = ec->elems[last_trans_index].tc;
1636 need += tc->readagain_len;
1637 if (need < insert_len)
1638 goto fail;
1639 if (last_trans_index == 0) {
1640 buf_start_p = &ec->in_buf_start;
1641 data_start_p = &ec->in_data_start;
1642 data_end_p = &ec->in_data_end;
1643 buf_end_p = &ec->in_buf_end;
1645 else {
1646 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1647 buf_start_p = &ee->out_buf_start;
1648 data_start_p = &ee->out_data_start;
1649 data_end_p = &ee->out_data_end;
1650 buf_end_p = &ee->out_buf_end;
1653 else {
1654 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1655 buf_start_p = &ee->out_buf_start;
1656 data_start_p = &ee->out_data_start;
1657 data_end_p = &ee->out_data_end;
1658 buf_end_p = &ee->out_buf_end;
1659 tc = ec->elems[last_trans_index].tc;
1662 if (*buf_start_p == NULL) {
1663 unsigned char *buf = xmalloc(need);
1664 *buf_start_p = buf;
1665 *data_start_p = buf;
1666 *data_end_p = buf;
1667 *buf_end_p = buf+need;
1669 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1670 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1671 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1672 *data_start_p = *buf_start_p;
1673 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1674 unsigned char *buf;
1675 size_t s = (*data_end_p - *buf_start_p) + need;
1676 if (s < need)
1677 goto fail;
1678 buf = xrealloc(*buf_start_p, s);
1679 *data_start_p = buf;
1680 *data_end_p = buf + (*data_end_p - *buf_start_p);
1681 *buf_start_p = buf;
1682 *buf_end_p = buf + s;
1686 memcpy(*data_end_p, insert_str, insert_len);
1687 *data_end_p += insert_len;
1688 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1689 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1690 *data_end_p += tc->readagain_len;
1691 tc->readagain_len = 0;
1694 if (insert_str != str && insert_str != insert_buf)
1695 xfree((void*)insert_str);
1696 return 0;
1698 fail:
1699 if (insert_str != str && insert_str != insert_buf)
1700 xfree((void*)insert_str);
1701 return -1;
1704 void
1705 rb_econv_close(rb_econv_t *ec)
1707 int i;
1709 if (ec->replacement_allocated) {
1710 xfree((void *)ec->replacement_str);
1712 for (i = 0; i < ec->num_trans; i++) {
1713 rb_transcoding_close(ec->elems[i].tc);
1714 if (ec->elems[i].out_buf_start)
1715 xfree(ec->elems[i].out_buf_start);
1717 xfree(ec->in_buf_start);
1718 xfree(ec->elems);
1719 xfree(ec);
1722 size_t
1723 rb_econv_memsize(rb_econv_t *ec)
1725 size_t size = sizeof(rb_econv_t);
1726 int i;
1728 if (ec->replacement_allocated) {
1729 size += ec->replacement_len;
1731 for (i = 0; i < ec->num_trans; i++) {
1732 size += rb_transcoding_memsize(ec->elems[i].tc);
1734 if (ec->elems[i].out_buf_start) {
1735 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1738 size += ec->in_buf_end - ec->in_buf_start;
1739 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1741 return size;
1745 rb_econv_putbackable(rb_econv_t *ec)
1747 if (ec->num_trans == 0)
1748 return 0;
1749 #if SIZEOF_SIZE_T > SIZEOF_INT
1750 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1751 #endif
1752 return (int)ec->elems[0].tc->readagain_len;
1755 void
1756 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1758 rb_transcoding *tc;
1759 if (ec->num_trans == 0 || n == 0)
1760 return;
1761 tc = ec->elems[0].tc;
1762 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1763 tc->readagain_len -= n;
/* Argument block passed through st_foreach() to asciicompat_encoding_i(). */
struct asciicompat_encoding_t {
    /* out: destination encoding of the decoder that was found, if any */
    const char *ascii_compat_name;
    /* in: name of the encoding that was looked up */
    const char *ascii_incompat_name;
};
1771 static int
1772 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1774 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1775 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1776 const rb_transcoder *tr;
1778 if (DECORATOR_P(entry->sname, entry->dname))
1779 return ST_CONTINUE;
1780 tr = load_transcoder_entry(entry);
1781 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1782 data->ascii_compat_name = tr->dst_encoding;
1783 return ST_STOP;
1785 return ST_CONTINUE;
1788 const char *
1789 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1791 st_data_t v;
1792 st_table *table2;
1793 struct asciicompat_encoding_t data;
1795 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1796 return NULL;
1797 table2 = (st_table *)v;
1800 * Assumption:
1801 * There is at most one transcoder for
1802 * converting from ASCII incompatible encoding.
1804 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1806 if (table2->num_entries != 1)
1807 return NULL;
1809 data.ascii_incompat_name = ascii_incompat_name;
1810 data.ascii_compat_name = NULL;
1811 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1812 return data.ascii_compat_name;
1815 VALUE
1816 rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1818 unsigned const char *sp, *se;
1819 unsigned char *ds, *dp, *de;
1820 rb_econv_result_t res;
1821 int max_output;
1823 if (NIL_P(dst)) {
1824 dst = rb_str_buf_new(len);
1825 if (ec->destination_encoding)
1826 rb_enc_associate(dst, ec->destination_encoding);
1829 if (ec->last_tc)
1830 max_output = ec->last_tc->transcoder->max_output;
1831 else
1832 max_output = 1;
1834 do {
1835 long dlen = RSTRING_LEN(dst);
1836 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1837 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1838 if (LONG_MAX < new_capa)
1839 rb_raise(rb_eArgError, "too long string");
1840 rb_str_resize(dst, new_capa);
1841 rb_str_set_len(dst, dlen);
1843 sp = (const unsigned char *)ss;
1844 se = sp + len;
1845 ds = (unsigned char *)RSTRING_PTR(dst);
1846 de = ds + rb_str_capacity(dst);
1847 dp = ds += dlen;
1848 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1849 len -= (const char *)sp - ss;
1850 ss = (const char *)sp;
1851 rb_str_set_len(dst, dlen + (dp - ds));
1852 rb_econv_check_error(ec);
1853 } while (res == econv_destination_buffer_full);
1855 return dst;
1858 VALUE
1859 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1861 src = rb_str_new_frozen(src);
1862 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1863 RB_GC_GUARD(src);
1864 return dst;
1867 VALUE
1868 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1870 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1873 VALUE
1874 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1876 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1879 VALUE
1880 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
1882 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1885 static int
1886 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1888 transcoder_entry_t *entry;
1889 const rb_transcoder *tr;
1891 if (ec->started != 0)
1892 return -1;
1894 entry = get_transcoder_entry(sname, dname);
1895 if (!entry)
1896 return -1;
1898 tr = load_transcoder_entry(entry);
1899 if (!tr) return -1;
1901 return rb_econv_add_transcoder_at(ec, tr, n);
1904 static int
1905 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1907 return rb_econv_add_converter(ec, "", decorator_name, n);
1911 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1913 const rb_transcoder *tr;
1915 if (ec->num_trans == 0)
1916 return rb_econv_decorate_at(ec, decorator_name, 0);
1918 tr = ec->elems[0].tc->transcoder;
1920 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1921 tr->asciicompat_type == asciicompat_decoder)
1922 return rb_econv_decorate_at(ec, decorator_name, 1);
1924 return rb_econv_decorate_at(ec, decorator_name, 0);
1928 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1930 const rb_transcoder *tr;
1932 if (ec->num_trans == 0)
1933 return rb_econv_decorate_at(ec, decorator_name, 0);
1935 tr = ec->elems[ec->num_trans-1].tc->transcoder;
1937 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1938 tr->asciicompat_type == asciicompat_encoder)
1939 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1941 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1944 void
1945 rb_econv_binmode(rb_econv_t *ec)
1947 const char *dname = 0;
1949 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1950 case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
1951 dname = "universal_newline";
1952 break;
1953 case ECONV_CRLF_NEWLINE_DECORATOR:
1954 dname = "crlf_newline";
1955 break;
1956 case ECONV_CR_NEWLINE_DECORATOR:
1957 dname = "cr_newline";
1958 break;
1961 if (dname) {
1962 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
1963 int num_trans = ec->num_trans;
1964 int i, j = 0;
1966 for (i=0; i < num_trans; i++) {
1967 if (transcoder == ec->elems[i].tc->transcoder) {
1968 rb_transcoding_close(ec->elems[i].tc);
1969 xfree(ec->elems[i].out_buf_start);
1970 ec->num_trans--;
1972 else
1973 ec->elems[j++] = ec->elems[i];
1977 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
1980 static VALUE
1981 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
1983 int has_description = 0;
1985 if (NIL_P(mesg))
1986 mesg = rb_str_new(NULL, 0);
1988 if (*sname != '\0' || *dname != '\0') {
1989 if (*sname == '\0')
1990 rb_str_cat2(mesg, dname);
1991 else if (*dname == '\0')
1992 rb_str_cat2(mesg, sname);
1993 else
1994 rb_str_catf(mesg, "%s to %s", sname, dname);
1995 has_description = 1;
1998 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
1999 ECONV_XML_TEXT_DECORATOR|
2000 ECONV_XML_ATTR_CONTENT_DECORATOR|
2001 ECONV_XML_ATTR_QUOTE_DECORATOR)) {
2002 const char *pre = "";
2003 if (has_description)
2004 rb_str_cat2(mesg, " with ");
2005 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2006 rb_str_cat2(mesg, pre); pre = ",";
2007 rb_str_cat2(mesg, "universal_newline");
2009 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2010 rb_str_cat2(mesg, pre); pre = ",";
2011 rb_str_cat2(mesg, "crlf_newline");
2013 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2014 rb_str_cat2(mesg, pre); pre = ",";
2015 rb_str_cat2(mesg, "cr_newline");
2017 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2018 rb_str_cat2(mesg, pre); pre = ",";
2019 rb_str_cat2(mesg, "xml_text");
2021 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2022 rb_str_cat2(mesg, pre); pre = ",";
2023 rb_str_cat2(mesg, "xml_attr_content");
2025 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2026 rb_str_cat2(mesg, pre); pre = ",";
2027 rb_str_cat2(mesg, "xml_attr_quote");
2029 has_description = 1;
2031 if (!has_description) {
2032 rb_str_cat2(mesg, "no-conversion");
2035 return mesg;
2038 VALUE
2039 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2041 VALUE mesg, exc;
2042 mesg = rb_str_new_cstr("code converter not found (");
2043 econv_description(sname, dname, ecflags, mesg);
2044 rb_str_cat2(mesg, ")");
2045 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2046 return exc;
2049 static VALUE
2050 make_econv_exception(rb_econv_t *ec)
2052 VALUE mesg, exc;
2053 if (ec->last_error.result == econv_invalid_byte_sequence ||
2054 ec->last_error.result == econv_incomplete_input) {
2055 const char *err = (const char *)ec->last_error.error_bytes_start;
2056 size_t error_len = ec->last_error.error_bytes_len;
2057 VALUE bytes = rb_str_new(err, error_len);
2058 VALUE dumped = rb_str_dump(bytes);
2059 size_t readagain_len = ec->last_error.readagain_len;
2060 VALUE bytes2 = Qnil;
2061 VALUE dumped2;
2062 if (ec->last_error.result == econv_incomplete_input) {
2063 mesg = rb_sprintf("incomplete %s on %s",
2064 StringValueCStr(dumped),
2065 ec->last_error.source_encoding);
2067 else if (readagain_len) {
2068 bytes2 = rb_str_new(err+error_len, readagain_len);
2069 dumped2 = rb_str_dump(bytes2);
2070 mesg = rb_sprintf("%s followed by %s on %s",
2071 StringValueCStr(dumped),
2072 StringValueCStr(dumped2),
2073 ec->last_error.source_encoding);
2075 else {
2076 mesg = rb_sprintf("%s on %s",
2077 StringValueCStr(dumped),
2078 ec->last_error.source_encoding);
2081 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2082 rb_ivar_set(exc, id_error_bytes, bytes);
2083 rb_ivar_set(exc, id_readagain_bytes, bytes2);
2084 rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2085 goto set_encs;
2087 if (ec->last_error.result == econv_undefined_conversion) {
2088 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2089 ec->last_error.error_bytes_len);
2090 VALUE dumped = Qnil;
2091 int idx;
2092 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2093 rb_encoding *utf8 = rb_utf8_encoding();
2094 const char *start, *end;
2095 int n;
2096 start = (const char *)ec->last_error.error_bytes_start;
2097 end = start + ec->last_error.error_bytes_len;
2098 n = rb_enc_precise_mbclen(start, end, utf8);
2099 if (MBCLEN_CHARFOUND_P(n) &&
2100 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2101 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2102 dumped = rb_sprintf("U+%04X", cc);
2105 if (NIL_P(dumped))
2106 dumped = rb_str_dump(bytes);
2107 if (strcmp(ec->last_error.source_encoding,
2108 ec->source_encoding_name) == 0 &&
2109 strcmp(ec->last_error.destination_encoding,
2110 ec->destination_encoding_name) == 0) {
2111 mesg = rb_sprintf("%s from %s to %s",
2112 StringValueCStr(dumped),
2113 ec->last_error.source_encoding,
2114 ec->last_error.destination_encoding);
2116 else {
2117 int i;
2118 mesg = rb_sprintf("%s to %s in conversion from %s",
2119 StringValueCStr(dumped),
2120 ec->last_error.destination_encoding,
2121 ec->source_encoding_name);
2122 for (i = 0; i < ec->num_trans; i++) {
2123 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2124 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2125 rb_str_catf(mesg, " to %s",
2126 ec->elems[i].tc->transcoder->dst_encoding);
2129 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2130 idx = rb_enc_find_index(ec->last_error.source_encoding);
2131 if (0 <= idx)
2132 rb_enc_associate_index(bytes, idx);
2133 rb_ivar_set(exc, id_error_char, bytes);
2134 goto set_encs;
2136 return Qnil;
2138 set_encs:
2139 rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2140 rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2141 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2142 if (0 <= idx)
2143 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2144 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2145 if (0 <= idx)
2146 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2147 return exc;
2150 static void
2151 more_output_buffer(
2152 VALUE destination,
2153 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2154 int max_output,
2155 unsigned char **out_start_ptr,
2156 unsigned char **out_pos,
2157 unsigned char **out_stop_ptr)
2159 size_t len = (*out_pos - *out_start_ptr);
2160 size_t new_len = (len + max_output) * 2;
2161 *out_start_ptr = resize_destination(destination, len, new_len);
2162 *out_pos = *out_start_ptr + len;
2163 *out_stop_ptr = *out_start_ptr + new_len;
2166 static int
2167 make_replacement(rb_econv_t *ec)
2169 rb_transcoding *tc;
2170 const rb_transcoder *tr;
2171 const unsigned char *replacement;
2172 const char *repl_enc;
2173 const char *ins_enc;
2174 size_t len;
2176 if (ec->replacement_str)
2177 return 0;
2179 ins_enc = rb_econv_encoding_to_insert_output(ec);
2181 tc = ec->last_tc;
2182 if (*ins_enc) {
2183 tr = tc->transcoder;
2184 rb_enc_find(tr->dst_encoding);
2185 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2187 else {
2188 replacement = (unsigned char *)"?";
2189 len = 1;
2190 repl_enc = "";
2193 ec->replacement_str = replacement;
2194 ec->replacement_len = len;
2195 ec->replacement_enc = repl_enc;
2196 ec->replacement_allocated = 0;
2197 return 0;
2201 rb_econv_set_replacement(rb_econv_t *ec,
2202 const unsigned char *str, size_t len, const char *encname)
2204 unsigned char *str2;
2205 size_t len2;
2206 const char *encname2;
2208 encname2 = rb_econv_encoding_to_insert_output(ec);
2210 if (!*encname2 || encoding_equal(encname, encname2)) {
2211 str2 = xmalloc(len);
2212 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2213 len2 = len;
2214 encname2 = encname;
2216 else {
2217 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2218 if (!str2)
2219 return -1;
2222 if (ec->replacement_allocated) {
2223 xfree((void *)ec->replacement_str);
2225 ec->replacement_allocated = 1;
2226 ec->replacement_str = str2;
2227 ec->replacement_len = len2;
2228 ec->replacement_enc = encname2;
2229 return 0;
2232 static int
2233 output_replacement_character(rb_econv_t *ec)
2235 int ret;
2237 if (make_replacement(ec) == -1)
2238 return -1;
2240 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2241 if (ret == -1)
2242 return -1;
2244 return 0;
2247 #if 1
2248 #define hash_fallback rb_hash_aref
2250 static VALUE
2251 proc_fallback(VALUE fallback, VALUE c)
2253 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2256 static VALUE
2257 method_fallback(VALUE fallback, VALUE c)
2259 return rb_method_call(1, &c, fallback);
2262 static VALUE
2263 aref_fallback(VALUE fallback, VALUE c)
2265 return rb_funcallv_public(fallback, idAREF, 1, &c);
2268 static void
2269 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2270 const unsigned char *in_stop, unsigned char *out_stop,
2271 VALUE destination,
2272 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2273 const char *src_encoding,
2274 const char *dst_encoding,
2275 int ecflags,
2276 VALUE ecopts)
2278 rb_econv_t *ec;
2279 rb_transcoding *last_tc;
2280 rb_econv_result_t ret;
2281 unsigned char *out_start = *out_pos;
2282 int max_output;
2283 VALUE exc;
2284 VALUE fallback = Qnil;
2285 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2287 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2288 if (!ec)
2289 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2291 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2292 fallback = rb_hash_aref(ecopts, sym_fallback);
2293 if (RB_TYPE_P(fallback, T_HASH)) {
2294 fallback_func = hash_fallback;
2296 else if (rb_obj_is_proc(fallback)) {
2297 fallback_func = proc_fallback;
2299 else if (rb_obj_is_method(fallback)) {
2300 fallback_func = method_fallback;
2302 else {
2303 fallback_func = aref_fallback;
2306 last_tc = ec->last_tc;
2307 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2309 resume:
2310 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2312 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2313 VALUE rep = rb_enc_str_new(
2314 (const char *)ec->last_error.error_bytes_start,
2315 ec->last_error.error_bytes_len,
2316 rb_enc_find(ec->last_error.source_encoding));
2317 rep = (*fallback_func)(fallback, rep);
2318 if (rep != Qundef && !NIL_P(rep)) {
2319 StringValue(rep);
2320 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2321 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2322 if ((int)ret == -1) {
2323 rb_raise(rb_eArgError, "too big fallback string");
2325 goto resume;
2329 if (ret == econv_invalid_byte_sequence ||
2330 ret == econv_incomplete_input ||
2331 ret == econv_undefined_conversion) {
2332 exc = make_econv_exception(ec);
2333 rb_econv_close(ec);
2334 rb_exc_raise(exc);
2337 if (ret == econv_destination_buffer_full) {
2338 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2339 goto resume;
2342 rb_econv_close(ec);
2343 return;
2345 #else
2346 /* sample transcode_loop implementation in byte-by-byte stream style */
2347 static void
2348 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2349 const unsigned char *in_stop, unsigned char *out_stop,
2350 VALUE destination,
2351 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2352 const char *src_encoding,
2353 const char *dst_encoding,
2354 int ecflags,
2355 VALUE ecopts)
2357 rb_econv_t *ec;
2358 rb_transcoding *last_tc;
2359 rb_econv_result_t ret;
2360 unsigned char *out_start = *out_pos;
2361 const unsigned char *ptr;
2362 int max_output;
2363 VALUE exc;
2365 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2366 if (!ec)
2367 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2369 last_tc = ec->last_tc;
2370 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2372 ret = econv_source_buffer_empty;
2373 ptr = *in_pos;
2374 while (ret != econv_finished) {
2375 unsigned char input_byte;
2376 const unsigned char *p = &input_byte;
2378 if (ret == econv_source_buffer_empty) {
2379 if (ptr < in_stop) {
2380 input_byte = *ptr;
2381 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2383 else {
2384 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2387 else {
2388 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2390 if (&input_byte != p)
2391 ptr += p - &input_byte;
2392 switch (ret) {
2393 case econv_invalid_byte_sequence:
2394 case econv_incomplete_input:
2395 case econv_undefined_conversion:
2396 exc = make_econv_exception(ec);
2397 rb_econv_close(ec);
2398 rb_exc_raise(exc);
2399 break;
2401 case econv_destination_buffer_full:
2402 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2403 break;
2405 case econv_source_buffer_empty:
2406 break;
2408 case econv_finished:
2409 break;
2412 rb_econv_close(ec);
2413 *in_pos = in_stop;
2414 return;
2416 #endif
2420 * String-specific code
2423 static unsigned char *
2424 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2426 rb_str_resize(destination, new_len);
2427 return (unsigned char *)RSTRING_PTR(destination);
2430 static int
2431 econv_opts(VALUE opt, int ecflags)
2433 VALUE v;
2434 int newlineflag = 0;
2436 v = rb_hash_aref(opt, sym_invalid);
2437 if (NIL_P(v)) {
2439 else if (v==sym_replace) {
2440 ecflags |= ECONV_INVALID_REPLACE;
2442 else {
2443 rb_raise(rb_eArgError, "unknown value for invalid character option");
2446 v = rb_hash_aref(opt, sym_undef);
2447 if (NIL_P(v)) {
2449 else if (v==sym_replace) {
2450 ecflags |= ECONV_UNDEF_REPLACE;
2452 else {
2453 rb_raise(rb_eArgError, "unknown value for undefined character option");
2456 v = rb_hash_aref(opt, sym_replace);
2457 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2458 ecflags |= ECONV_UNDEF_REPLACE;
2461 v = rb_hash_aref(opt, sym_xml);
2462 if (!NIL_P(v)) {
2463 if (v==sym_text) {
2464 ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
2466 else if (v==sym_attr) {
2467 ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
2469 else if (SYMBOL_P(v)) {
2470 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2472 else {
2473 rb_raise(rb_eArgError, "unexpected value for xml option");
2477 #ifdef ENABLE_ECONV_NEWLINE_OPTION
2478 v = rb_hash_aref(opt, sym_newline);
2479 if (!NIL_P(v)) {
2480 newlineflag = 2;
2481 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2482 if (v == sym_universal) {
2483 ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
2485 else if (v == sym_crlf) {
2486 ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2488 else if (v == sym_cr) {
2489 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2491 else if (v == sym_lf) {
2492 /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2494 else if (SYMBOL_P(v)) {
2495 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2496 rb_sym2str(v));
2498 else {
2499 rb_raise(rb_eArgError, "unexpected value for newline option");
2502 #endif
2504 int setflags = 0;
2506 v = rb_hash_aref(opt, sym_universal_newline);
2507 if (RTEST(v))
2508 setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
2509 newlineflag |= !NIL_P(v);
2511 v = rb_hash_aref(opt, sym_crlf_newline);
2512 if (RTEST(v))
2513 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2514 newlineflag |= !NIL_P(v);
2516 v = rb_hash_aref(opt, sym_cr_newline);
2517 if (RTEST(v))
2518 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2519 newlineflag |= !NIL_P(v);
2521 switch (newlineflag) {
2522 case 1:
2523 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2524 ecflags |= setflags;
2525 break;
2527 case 3:
2528 rb_warning(":newline option precedes other newline options");
2529 break;
2533 return ecflags;
2537 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2539 VALUE newhash = Qnil;
2540 VALUE v;
2542 if (NIL_P(opthash)) {
2543 *opts = Qnil;
2544 return ecflags;
2546 ecflags = econv_opts(opthash, ecflags);
2548 v = rb_hash_aref(opthash, sym_replace);
2549 if (!NIL_P(v)) {
2550 StringValue(v);
2551 if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
2552 VALUE dumped = rb_str_dump(v);
2553 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2554 StringValueCStr(dumped),
2555 rb_enc_name(rb_enc_get(v)));
2557 v = rb_str_new_frozen(v);
2558 newhash = rb_hash_new();
2559 rb_hash_aset(newhash, sym_replace, v);
2562 v = rb_hash_aref(opthash, sym_fallback);
2563 if (!NIL_P(v)) {
2564 VALUE h = rb_check_hash_type(v);
2565 if (NIL_P(h)
2566 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2567 : (v = h, 1)) {
2568 if (NIL_P(newhash))
2569 newhash = rb_hash_new();
2570 rb_hash_aset(newhash, sym_fallback, v);
2574 if (!NIL_P(newhash))
2575 rb_hash_freeze(newhash);
2576 *opts = newhash;
2578 return ecflags;
2582 rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
2584 return rb_econv_prepare_options(opthash, opts, 0);
2587 rb_econv_t *
2588 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2590 rb_econv_t *ec;
2591 VALUE replacement;
2593 if (NIL_P(opthash)) {
2594 replacement = Qnil;
2596 else {
2597 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2598 rb_bug("rb_econv_open_opts called with invalid opthash");
2599 replacement = rb_hash_aref(opthash, sym_replace);
2602 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2603 if (!ec)
2604 return ec;
2606 if (!NIL_P(replacement)) {
2607 int ret;
2608 rb_encoding *enc = rb_enc_get(replacement);
2610 ret = rb_econv_set_replacement(ec,
2611 (const unsigned char *)RSTRING_PTR(replacement),
2612 RSTRING_LEN(replacement),
2613 rb_enc_name(enc));
2614 if (ret == -1) {
2615 rb_econv_close(ec);
2616 return NULL;
2619 return ec;
2622 static int
2623 enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2625 rb_encoding *enc;
2626 const char *n;
2627 int encidx;
2628 VALUE encval;
2630 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2631 !(enc = rb_enc_from_index(encidx))) {
2632 enc = NULL;
2633 encidx = 0;
2634 n = StringValueCStr(*arg);
2636 else {
2637 n = rb_enc_name(enc);
2640 *name_p = n;
2641 *enc_p = enc;
2643 return encidx;
2646 static int
2647 str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2648 const char **sname_p, rb_encoding **senc_p,
2649 const char **dname_p, rb_encoding **denc_p)
2651 rb_encoding *senc, *denc;
2652 const char *sname, *dname;
2653 int sencidx, dencidx;
2655 dencidx = enc_arg(arg1, &dname, &denc);
2657 if (NIL_P(*arg2)) {
2658 sencidx = rb_enc_get_index(str);
2659 senc = rb_enc_from_index(sencidx);
2660 sname = rb_enc_name(senc);
2662 else {
2663 sencidx = enc_arg(arg2, &sname, &senc);
2666 *sname_p = sname;
2667 *senc_p = senc;
2668 *dname_p = dname;
2669 *denc_p = denc;
2670 return dencidx;
2673 static int
2674 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2676 VALUE dest;
2677 VALUE str = *self;
2678 VALUE arg1, arg2;
2679 long blen, slen;
2680 unsigned char *buf, *bp, *sp;
2681 const unsigned char *fromp;
2682 rb_encoding *senc, *denc;
2683 const char *sname, *dname;
2684 int dencidx;
2685 int explicitly_invalid_replace = TRUE;
2687 rb_check_arity(argc, 0, 2);
2689 if (argc == 0) {
2690 arg1 = rb_enc_default_internal();
2691 if (NIL_P(arg1)) {
2692 if (!ecflags) return -1;
2693 arg1 = rb_obj_encoding(str);
2695 if (!(ecflags & ECONV_INVALID_MASK)) {
2696 explicitly_invalid_replace = FALSE;
2698 ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
2700 else {
2701 arg1 = argv[0];
2703 arg2 = argc<=1 ? Qnil : argv[1];
2704 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2706 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2707 ECONV_XML_TEXT_DECORATOR|
2708 ECONV_XML_ATTR_CONTENT_DECORATOR|
2709 ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
2710 if (senc && senc == denc) {
2711 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2712 VALUE rep = Qnil;
2713 if (!NIL_P(ecopts)) {
2714 rep = rb_hash_aref(ecopts, sym_replace);
2716 dest = rb_enc_str_scrub(senc, str, rep);
2717 if (NIL_P(dest)) dest = str;
2718 *self = dest;
2719 return dencidx;
2721 return NIL_P(arg2) ? -1 : dencidx;
2723 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2724 if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2725 return dencidx;
2728 if (encoding_equal(sname, dname)) {
2729 return NIL_P(arg2) ? -1 : dencidx;
2732 else {
2733 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2734 rb_encoding *utf8 = rb_utf8_encoding();
2735 str = rb_str_conv_enc(str, senc, utf8);
2736 senc = utf8;
2737 sname = "UTF-8";
2739 if (encoding_equal(sname, dname)) {
2740 sname = "";
2741 dname = "";
2745 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2746 slen = RSTRING_LEN(str);
2747 blen = slen + 30; /* len + margin */
2748 dest = rb_str_tmp_new(blen);
2749 bp = (unsigned char *)RSTRING_PTR(dest);
2751 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2752 if (fromp != sp+slen) {
2753 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2755 buf = (unsigned char *)RSTRING_PTR(dest);
2756 *bp = '\0';
2757 rb_str_set_len(dest, bp - buf);
2759 /* set encoding */
2760 if (!denc) {
2761 dencidx = rb_define_dummy_encoding(dname);
2762 RB_GC_GUARD(arg1);
2763 RB_GC_GUARD(arg2);
2765 *self = dest;
2767 return dencidx;
2770 static int
2771 str_transcode(int argc, VALUE *argv, VALUE *self)
2773 VALUE opt;
2774 int ecflags = 0;
2775 VALUE ecopts = Qnil;
2777 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2778 if (!NIL_P(opt)) {
2779 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2781 return str_transcode0(argc, argv, self, ecflags, ecopts);
2784 static inline VALUE
2785 str_encode_associate(VALUE str, int encidx)
2787 int cr = 0;
2789 rb_enc_associate_index(str, encidx);
2791 /* transcoded string never be broken. */
2792 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2793 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2795 else {
2796 cr = ENC_CODERANGE_VALID;
2798 ENC_CODERANGE_SET(str, cr);
2799 return str;
2803 * call-seq:
2804 * str.encode!(encoding, **options) -> str
2805 * str.encode!(dst_encoding, src_encoding, **options) -> str
2807 * The first form transcodes the contents of <i>str</i> from
2808 * str.encoding to +encoding+.
2809 * The second form transcodes the contents of <i>str</i> from
2810 * src_encoding to dst_encoding.
2811 * The +options+ keyword arguments give details for conversion. See String#encode
2812 * for details.
2813 * Returns the string even if no changes were made.
2816 static VALUE
2817 str_encode_bang(int argc, VALUE *argv, VALUE str)
2819 VALUE newstr;
2820 int encidx;
2822 rb_check_frozen(str);
2824 newstr = str;
2825 encidx = str_transcode(argc, argv, &newstr);
2827 if (encidx < 0) return str;
2828 if (newstr == str) {
2829 rb_enc_associate_index(str, encidx);
2830 return str;
2832 rb_str_shared_replace(str, newstr);
2833 return str_encode_associate(str, encidx);
2836 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2839 * call-seq:
2840 * str.encode(encoding, **options) -> str
2841 * str.encode(dst_encoding, src_encoding, **options) -> str
2842 * str.encode(**options) -> str
2844 * The first form returns a copy of +str+ transcoded
2845 * to encoding +encoding+.
2846 * The second form returns a copy of +str+ transcoded
2847 * from src_encoding to dst_encoding.
2848 * The last form returns a copy of +str+ transcoded to
2849 * <tt>Encoding.default_internal</tt>.
2851 * By default, the first and second form raise
2852 * Encoding::UndefinedConversionError for characters that are
2853 * undefined in the destination encoding, and
2854 * Encoding::InvalidByteSequenceError for invalid byte sequences
2855 * in the source encoding. The last form by default does not raise
2856 * exceptions but uses replacement strings.
2858 * The +options+ keyword arguments give details for conversion.
2859 * The arguments are:
2861 * :invalid ::
2862 * If the value is +:replace+, #encode replaces invalid byte sequences in
2863 * +str+ with the replacement character. The default is to raise the
2864 * Encoding::InvalidByteSequenceError exception
2865 * :undef ::
2866 * If the value is +:replace+, #encode replaces characters which are
2867 * undefined in the destination encoding with the replacement character.
2868 * The default is to raise the Encoding::UndefinedConversionError.
2869 * :replace ::
2870 * Sets the replacement string to the given value. The default replacement
2871 * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2872 * :fallback ::
2873 * Sets the replacement string by the given object for undefined
2874 * character. The object should be a Hash, a Proc, a Method, or an
2875 * object which has [] method.
 *    Its key is the undefined character, encoded in the source encoding
 *    of the current transcoder. Its value may be in any encoding as long as
 *    it can be converted into the destination encoding of the transcoder.
2879 * :xml ::
2880 * The value must be +:text+ or +:attr+.
2881 * If the value is +:text+ #encode replaces undefined characters with their
2882 * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2883 * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2884 * If the value is +:attr+, #encode also quotes the replacement result
2885 * (using '"'), and replaces '"' with "&quot;".
2886 * :cr_newline ::
2887 * Replaces LF ("\n") with CR ("\r") if value is true.
2888 * :crlf_newline ::
2889 * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2890 * :universal_newline ::
2891 * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2894 static VALUE
2895 str_encode(int argc, VALUE *argv, VALUE str)
2897 VALUE newstr = str;
2898 int encidx = str_transcode(argc, argv, &newstr);
2899 return encoded_dup(newstr, str, encidx);
2902 VALUE
2903 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2905 int argc = 1;
2906 VALUE *argv = &to;
2907 VALUE newstr = str;
2908 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2909 return encoded_dup(newstr, str, encidx);
2912 static VALUE
2913 encoded_dup(VALUE newstr, VALUE str, int encidx)
2915 if (encidx < 0) return rb_str_dup(str);
2916 if (newstr == str) {
2917 newstr = rb_str_dup(str);
2918 rb_enc_associate_index(newstr, encidx);
2919 return newstr;
2921 else {
2922 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2924 return str_encode_associate(newstr, encidx);
2928 * Document-class: Encoding::Converter
2930 * Encoding conversion class.
2932 static void
2933 econv_free(void *ptr)
2935 rb_econv_t *ec = ptr;
2936 rb_econv_close(ec);
2939 static size_t
2940 econv_memsize(const void *ptr)
2942 return sizeof(rb_econv_t);
2945 static const rb_data_type_t econv_data_type = {
2946 "econv",
2947 {0, econv_free, econv_memsize,},
2948 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
2951 static VALUE
2952 econv_s_allocate(VALUE klass)
2954 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2957 static rb_encoding *
2958 make_dummy_encoding(const char *name)
2960 rb_encoding *enc;
2961 int idx;
2962 idx = rb_define_dummy_encoding(name);
2963 enc = rb_enc_from_index(idx);
2964 return enc;
2967 static rb_encoding *
2968 make_encoding(const char *name)
2970 rb_encoding *enc;
2971 enc = rb_enc_find(name);
2972 if (!enc)
2973 enc = make_dummy_encoding(name);
2974 return enc;
2977 static VALUE
2978 make_encobj(const char *name)
2980 return rb_enc_from_encoding(make_encoding(name));
2984 * call-seq:
2985 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2986 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2988 * Returns the corresponding ASCII compatible encoding.
2990 * Returns nil if the argument is an ASCII compatible encoding.
 * The "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
 * can represent exactly the same characters as the given ASCII incompatible encoding.
 * So, no undefined conversion error occurs when converting between the two encodings.
2996 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2997 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2998 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3001 static VALUE
3002 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3004 const char *arg_name, *result_name;
3005 rb_encoding *arg_enc, *result_enc;
3007 enc_arg(&arg, &arg_name, &arg_enc);
3009 result_name = rb_econv_asciicompat_encoding(arg_name);
3011 if (result_name == NULL)
3012 return Qnil;
3014 result_enc = make_encoding(result_name);
3016 return rb_enc_from_encoding(result_enc);
3019 static void
3020 econv_args(int argc, VALUE *argv,
3021 VALUE *snamev_p, VALUE *dnamev_p,
3022 const char **sname_p, const char **dname_p,
3023 rb_encoding **senc_p, rb_encoding **denc_p,
3024 int *ecflags_p,
3025 VALUE *ecopts_p)
3027 VALUE opt, flags_v, ecopts;
3028 int sidx, didx;
3029 const char *sname, *dname;
3030 rb_encoding *senc, *denc;
3031 int ecflags;
3033 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3035 if (!NIL_P(flags_v)) {
3036 if (!NIL_P(opt)) {
3037 rb_error_arity(argc + 1, 2, 3);
3039 ecflags = NUM2INT(rb_to_int(flags_v));
3040 ecopts = Qnil;
3042 else if (!NIL_P(opt)) {
3043 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3045 else {
3046 ecflags = 0;
3047 ecopts = Qnil;
3050 senc = NULL;
3051 sidx = rb_to_encoding_index(*snamev_p);
3052 if (0 <= sidx) {
3053 senc = rb_enc_from_index(sidx);
3055 else {
3056 StringValue(*snamev_p);
3059 denc = NULL;
3060 didx = rb_to_encoding_index(*dnamev_p);
3061 if (0 <= didx) {
3062 denc = rb_enc_from_index(didx);
3064 else {
3065 StringValue(*dnamev_p);
3068 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3069 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3071 *sname_p = sname;
3072 *dname_p = dname;
3073 *senc_p = senc;
3074 *denc_p = denc;
3075 *ecflags_p = ecflags;
3076 *ecopts_p = ecopts;
3079 static int
3080 decorate_convpath(VALUE convpath, int ecflags)
3082 int num_decorators;
3083 const char *decorators[MAX_ECFLAGS_DECORATORS];
3084 int i;
3085 int n, len;
3087 num_decorators = decorator_names(ecflags, decorators);
3088 if (num_decorators == -1)
3089 return -1;
3091 len = n = RARRAY_LENINT(convpath);
3092 if (n != 0) {
3093 VALUE pair = RARRAY_AREF(convpath, n-1);
3094 if (RB_TYPE_P(pair, T_ARRAY)) {
3095 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3096 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3097 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3098 const rb_transcoder *tr = load_transcoder_entry(entry);
3099 if (!tr)
3100 return -1;
3101 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3102 tr->asciicompat_type == asciicompat_encoder) {
3103 n--;
3104 rb_ary_store(convpath, len + num_decorators - 1, pair);
3107 else {
3108 rb_ary_store(convpath, len + num_decorators - 1, pair);
3112 for (i = 0; i < num_decorators; i++)
3113 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3115 return 0;
3118 static void
3119 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3121 VALUE *ary_p = arg;
3122 VALUE v;
3124 if (NIL_P(*ary_p)) {
3125 *ary_p = rb_ary_new();
3128 if (DECORATOR_P(sname, dname)) {
3129 v = rb_str_new_cstr(dname);
3131 else {
3132 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3134 rb_ary_store(*ary_p, depth, v);
3138 * call-seq:
3139 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3140 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3142 * Returns a conversion path.
3144 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3145 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3146 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3148 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3149 * or
3150 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3151 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3152 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3153 * # "universal_newline"]
3155 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3156 * or
3157 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3158 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3159 * # "universal_newline",
3160 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3162 static VALUE
3163 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3165 VALUE snamev, dnamev;
3166 const char *sname, *dname;
3167 rb_encoding *senc, *denc;
3168 int ecflags;
3169 VALUE ecopts;
3170 VALUE convpath;
3172 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3174 convpath = Qnil;
3175 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3177 if (NIL_P(convpath)) {
3178 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3179 RB_GC_GUARD(snamev);
3180 RB_GC_GUARD(dnamev);
3181 rb_exc_raise(exc);
3184 if (decorate_convpath(convpath, ecflags) == -1) {
3185 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3186 RB_GC_GUARD(snamev);
3187 RB_GC_GUARD(dnamev);
3188 rb_exc_raise(exc);
3191 return convpath;
3195 * Check the existence of a conversion path.
3196 * Returns the number of converters in the conversion path.
3197 * result: >=0:success -1:failure
3200 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3202 VALUE convpath = Qnil;
3203 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3204 &convpath);
3205 return RTEST(convpath);
3208 struct rb_econv_init_by_convpath_t {
3209 rb_econv_t *ec;
3210 int index;
3211 int ret;
3214 static void
3215 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3217 struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
3218 int ret;
3220 if (a->ret == -1)
3221 return;
3223 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3225 a->ret = ret;
3226 return;
3229 static rb_econv_t *
3230 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3231 const char **sname_p, const char **dname_p,
3232 rb_encoding **senc_p, rb_encoding**denc_p)
3234 rb_econv_t *ec;
3235 long i;
3236 int ret, first=1;
3237 VALUE elt;
3238 rb_encoding *senc = 0, *denc = 0;
3239 const char *sname, *dname;
3241 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3242 DATA_PTR(self) = ec;
3244 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3245 VALUE snamev, dnamev;
3246 VALUE pair;
3247 elt = rb_ary_entry(convpath, i);
3248 if (!NIL_P(pair = rb_check_array_type(elt))) {
3249 if (RARRAY_LEN(pair) != 2)
3250 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3251 snamev = rb_ary_entry(pair, 0);
3252 enc_arg(&snamev, &sname, &senc);
3253 dnamev = rb_ary_entry(pair, 1);
3254 enc_arg(&dnamev, &dname, &denc);
3256 else {
3257 sname = "";
3258 dname = StringValueCStr(elt);
3260 if (DECORATOR_P(sname, dname)) {
3261 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3262 if (ret == -1) {
3263 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3264 RB_GC_GUARD(snamev);
3265 RB_GC_GUARD(dnamev);
3266 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3269 else {
3270 int j = ec->num_trans;
3271 struct rb_econv_init_by_convpath_t arg;
3272 arg.ec = ec;
3273 arg.index = ec->num_trans;
3274 arg.ret = 0;
3275 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3276 if (ret == -1 || arg.ret == -1) {
3277 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3278 RB_GC_GUARD(snamev);
3279 RB_GC_GUARD(dnamev);
3280 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3282 if (first) {
3283 first = 0;
3284 *senc_p = senc;
3285 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3287 *denc_p = denc;
3288 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3292 if (first) {
3293 *senc_p = NULL;
3294 *denc_p = NULL;
3295 *sname_p = "";
3296 *dname_p = "";
3299 ec->source_encoding_name = *sname_p;
3300 ec->destination_encoding_name = *dname_p;
3302 return ec;
3306 * call-seq:
3307 * Encoding::Converter.new(source_encoding, destination_encoding)
3308 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3309 * Encoding::Converter.new(convpath)
3311 * possible options elements:
3312 * hash form:
3313 * :invalid => nil # raise error on invalid byte sequence (default)
3314 * :invalid => :replace # replace invalid byte sequence
3315 * :undef => nil # raise error on undefined conversion (default)
3316 * :undef => :replace # replace undefined conversion
3317 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3318 * :newline => :universal # decorator for converting CRLF and CR to LF
3319 * :newline => :crlf # decorator for converting LF to CRLF
3320 * :newline => :cr # decorator for converting LF to CR
3321 * :universal_newline => true # decorator for converting CRLF and CR to LF
3322 * :crlf_newline => true # decorator for converting LF to CRLF
3323 * :cr_newline => true # decorator for converting LF to CR
3324 * :xml => :text # escape as XML CharData.
3325 * :xml => :attr # escape as XML AttValue
3326 * integer form:
3327 * Encoding::Converter::INVALID_REPLACE
3328 * Encoding::Converter::UNDEF_REPLACE
3329 * Encoding::Converter::UNDEF_HEX_CHARREF
3330 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3331 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3332 * Encoding::Converter::CR_NEWLINE_DECORATOR
3333 * Encoding::Converter::XML_TEXT_DECORATOR
3334 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3335 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3337 * Encoding::Converter.new creates an instance of Encoding::Converter.
3339 * Source_encoding and destination_encoding should be a string or
3340 * Encoding object.
3342 * opt should be nil, a hash or an integer.
3344 * convpath should be an array.
3345 * convpath may contain
3346 * - two-element arrays which contain encodings or encoding names, or
3347 * - strings representing decorator names.
3349 * Encoding::Converter.new optionally takes an option.
3350 * The option should be a hash or an integer.
3351 * The option hash can contain :invalid => nil, etc.
3352 * The option integer should be logical-or of constants such as
3353 * Encoding::Converter::INVALID_REPLACE, etc.
3355 * [:invalid => nil]
3356 * Raise error on invalid byte sequence. This is a default behavior.
3357 * [:invalid => :replace]
3358 * Replace invalid byte sequence by replacement string.
3359 * [:undef => nil]
3360 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3361 * This is a default behavior.
3362 * [:undef => :replace]
3363 * Replace undefined character in destination_encoding with replacement string.
3364 * [:replace => string]
3365 * Specify the replacement string.
3366 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3367 * [:universal_newline => true]
3368 * Convert CRLF and CR to LF.
3369 * [:crlf_newline => true]
3370 * Convert LF to CRLF.
3371 * [:cr_newline => true]
3372 * Convert LF to CR.
3373 * [:xml => :text]
3374 * Escape as XML CharData.
3375 * This form can be used as an HTML 4.0 #PCDATA.
3376 * - '&' -> '&amp;'
3377 * - '<' -> '&lt;'
3378 * - '>' -> '&gt;'
3379 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3380 * [:xml => :attr]
3381 * Escape as XML AttValue.
3382 * The converted result is quoted as "...".
3383 * This form can be used as an HTML 4.0 attribute value.
3384 * - '&' -> '&amp;'
3385 * - '<' -> '&lt;'
3386 * - '>' -> '&gt;'
3387 * - '"' -> '&quot;'
3388 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3390 * Examples:
3391 * # UTF-16BE to UTF-8
3392 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3394 * # Usually, decorators such as newline conversion are inserted last.
3395 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3396 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3397 * # "universal_newline"]
3399 * # But, if the last encoding is ASCII incompatible,
3400 * # decorators are inserted before the last conversion.
3401 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3402 * p ec.convpath #=> ["crlf_newline",
3403 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3405 * # Conversion path can be specified directly.
3406 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3407 * p ec.convpath #=> ["universal_newline",
3408 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3409 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3411 static VALUE
3412 econv_init(int argc, VALUE *argv, VALUE self)
3414 VALUE ecopts;
3415 VALUE snamev, dnamev;
3416 const char *sname, *dname;
3417 rb_encoding *senc, *denc;
3418 rb_econv_t *ec;
3419 int ecflags;
3420 VALUE convpath;
3422 if (rb_check_typeddata(self, &econv_data_type)) {
3423 rb_raise(rb_eTypeError, "already initialized");
3426 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3427 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3428 ecflags = 0;
3429 ecopts = Qnil;
3431 else {
3432 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3433 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3436 if (!ec) {
3437 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3438 RB_GC_GUARD(snamev);
3439 RB_GC_GUARD(dnamev);
3440 rb_exc_raise(exc);
3443 if (!DECORATOR_P(sname, dname)) {
3444 if (!senc)
3445 senc = make_dummy_encoding(sname);
3446 if (!denc)
3447 denc = make_dummy_encoding(dname);
3448 RB_GC_GUARD(snamev);
3449 RB_GC_GUARD(dnamev);
3452 ec->source_encoding = senc;
3453 ec->destination_encoding = denc;
3455 DATA_PTR(self) = ec;
3457 return self;
3461 * call-seq:
3462 * ec.inspect -> string
3464 * Returns a printable version of <i>ec</i>
3466 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3467 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3470 static VALUE
3471 econv_inspect(VALUE self)
3473 const char *cname = rb_obj_classname(self);
3474 rb_econv_t *ec;
3476 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3477 if (!ec)
3478 return rb_sprintf("#<%s: uninitialized>", cname);
3479 else {
3480 const char *sname = ec->source_encoding_name;
3481 const char *dname = ec->destination_encoding_name;
3482 VALUE str;
3483 str = rb_sprintf("#<%s: ", cname);
3484 econv_description(sname, dname, ec->flags, str);
3485 rb_str_cat2(str, ">");
3486 return str;
3490 static rb_econv_t *
3491 check_econv(VALUE self)
3493 rb_econv_t *ec;
3495 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3496 if (!ec) {
3497 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3499 return ec;
3503 * call-seq:
3504 * ec.source_encoding -> encoding
3506 * Returns the source encoding as an Encoding object.
3508 static VALUE
3509 econv_source_encoding(VALUE self)
3511 rb_econv_t *ec = check_econv(self);
3512 if (!ec->source_encoding)
3513 return Qnil;
3514 return rb_enc_from_encoding(ec->source_encoding);
3518 * call-seq:
3519 * ec.destination_encoding -> encoding
3521 * Returns the destination encoding as an Encoding object.
3523 static VALUE
3524 econv_destination_encoding(VALUE self)
3526 rb_econv_t *ec = check_econv(self);
3527 if (!ec->destination_encoding)
3528 return Qnil;
3529 return rb_enc_from_encoding(ec->destination_encoding);
3533 * call-seq:
3534 * ec.convpath -> ary
3536 * Returns the conversion path of ec.
3538 * The result is an array of conversions.
3540 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3541 * p ec.convpath
3542 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3543 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3544 * # "crlf_newline"]
3546 * Each element of the array is a pair of encodings or a string.
3547 * A pair means an encoding conversion.
3548 * A string means a decorator.
3550 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3551 * a converter from ISO-8859-1 to UTF-8.
3552 * "crlf_newline" means newline converter from LF to CRLF.
3554 static VALUE
3555 econv_convpath(VALUE self)
3557 rb_econv_t *ec = check_econv(self);
3558 VALUE result;
3559 int i;
3561 result = rb_ary_new();
3562 for (i = 0; i < ec->num_trans; i++) {
3563 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3564 VALUE v;
3565 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3566 v = rb_str_new_cstr(tr->dst_encoding);
3567 else
3568 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3569 rb_ary_push(result, v);
3571 return result;
3575 * call-seq:
3576 * ec == other -> true or false
3578 static VALUE
3579 econv_equal(VALUE self, VALUE other)
3581 rb_econv_t *ec1 = check_econv(self);
3582 rb_econv_t *ec2;
3583 int i;
3585 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3586 return Qnil;
3588 ec2 = DATA_PTR(other);
3589 if (!ec2) return Qfalse;
3590 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3591 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3592 return Qfalse;
3593 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3594 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3595 return Qfalse;
3596 if (ec1->flags != ec2->flags) return Qfalse;
3597 if (ec1->replacement_enc != ec2->replacement_enc &&
3598 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3599 return Qfalse;
3600 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3601 if (ec1->replacement_str != ec2->replacement_str &&
3602 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3603 return Qfalse;
3605 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3606 for (i = 0; i < ec1->num_trans; i++) {
3607 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3608 return Qfalse;
3610 return Qtrue;
3613 static VALUE
3614 econv_result_to_symbol(rb_econv_result_t res)
3616 switch (res) {
3617 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3618 case econv_incomplete_input: return sym_incomplete_input;
3619 case econv_undefined_conversion: return sym_undefined_conversion;
3620 case econv_destination_buffer_full: return sym_destination_buffer_full;
3621 case econv_source_buffer_empty: return sym_source_buffer_empty;
3622 case econv_finished: return sym_finished;
3623 case econv_after_output: return sym_after_output;
3624 default: return INT2NUM(res); /* should not be reached */
3629 * call-seq:
3630 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3631 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3632 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3633 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3635 * possible opt elements:
3636 * hash form:
3637 * :partial_input => true # source buffer may be part of larger source
3638 * :after_output => true # stop conversion after output before input
3639 * integer form:
3640 * Encoding::Converter::PARTIAL_INPUT
3641 * Encoding::Converter::AFTER_OUTPUT
3643 * possible results:
3644 * :invalid_byte_sequence
3645 * :incomplete_input
3646 * :undefined_conversion
3647 * :after_output
3648 * :destination_buffer_full
3649 * :source_buffer_empty
3650 * :finished
3652 * primitive_convert converts source_buffer into destination_buffer.
3654 * source_buffer should be a string or nil.
3655 * nil means an empty string.
3657 * destination_buffer should be a string.
3659 * destination_byteoffset should be an integer or nil.
3660 * nil means the end of destination_buffer.
3661 * If it is omitted, nil is assumed.
3663 * destination_bytesize should be an integer or nil.
3664 * nil means unlimited.
3665 * If it is omitted, nil is assumed.
3667 * opt should be nil, a hash or an integer.
3668 * nil means no flags.
3669 * If it is omitted, nil is assumed.
3671 * primitive_convert converts the content of source_buffer from beginning
3672 * and stores the result in destination_buffer.
3674 * destination_byteoffset and destination_bytesize specify the region which
3675 * the converted result is stored.
3676 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3677 * If destination_byteoffset is nil,
3678 * destination_buffer.bytesize is used for appending the result.
3679 * destination_bytesize specifies maximum number of bytes.
3680 * If destination_bytesize is nil,
3681 * destination size is unlimited.
3682 * After conversion, destination_buffer is resized to
3683 * destination_byteoffset + actually produced number of bytes.
3684 * Also destination_buffer's encoding is set to destination_encoding.
3686 * primitive_convert drops the converted part of source_buffer.
3687 * the dropped part is converted in destination_buffer or
3688 * buffered in Encoding::Converter object.
3690 * primitive_convert stops conversion when one of the following conditions is met.
3691 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3692 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3693 * - unexpected end of source buffer (:incomplete_input)
3694 * this occurs only when :partial_input is not specified.
3695 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3696 * - character not representable in output encoding (:undefined_conversion)
3697 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3698 * - after some output is generated, before input is done (:after_output)
3699 * this occurs only when :after_output is specified.
3700 * - destination buffer is full (:destination_buffer_full)
3701 * this occurs only when destination_bytesize is non-nil.
3702 * - source buffer is empty (:source_buffer_empty)
3703 * this occurs only when :partial_input is specified.
3704 * - conversion is finished (:finished)
3706 * example:
3707 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3708 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3709 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3711 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3712 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3713 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3714 * ret = ec.primitive_convert(src, dst="", nil, 1)
3715 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3716 * ret = ec.primitive_convert(src, dst="", nil, 1)
3717 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3718 * ret = ec.primitive_convert(src, dst="", nil, 1)
3719 * p [ret, src, dst] #=> [:finished, "", "i"]
3722 static VALUE
3723 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3725 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3726 rb_econv_t *ec = check_econv(self);
3727 rb_econv_result_t res;
3728 const unsigned char *ip, *is;
3729 unsigned char *op, *os;
3730 long output_byteoffset, output_bytesize;
3731 unsigned long output_byteend;
3732 int flags;
3734 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3736 if (NIL_P(output_byteoffset_v))
3737 output_byteoffset = 0; /* dummy */
3738 else
3739 output_byteoffset = NUM2LONG(output_byteoffset_v);
3741 if (NIL_P(output_bytesize_v))
3742 output_bytesize = 0; /* dummy */
3743 else
3744 output_bytesize = NUM2LONG(output_bytesize_v);
3746 if (!NIL_P(flags_v)) {
3747 if (!NIL_P(opt)) {
3748 rb_error_arity(argc + 1, 2, 5);
3750 flags = NUM2INT(rb_to_int(flags_v));
3752 else if (!NIL_P(opt)) {
3753 VALUE v;
3754 flags = 0;
3755 v = rb_hash_aref(opt, sym_partial_input);
3756 if (RTEST(v))
3757 flags |= ECONV_PARTIAL_INPUT;
3758 v = rb_hash_aref(opt, sym_after_output);
3759 if (RTEST(v))
3760 flags |= ECONV_AFTER_OUTPUT;
3762 else {
3763 flags = 0;
3766 StringValue(output);
3767 if (!NIL_P(input))
3768 StringValue(input);
3769 rb_str_modify(output);
3771 if (NIL_P(output_bytesize_v)) {
3772 #if USE_RVARGC
3773 output_bytesize = rb_str_capacity(output);
3774 #else
3775 output_bytesize = RSTRING_EMBED_LEN_MAX;
3776 #endif
3777 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3778 output_bytesize = RSTRING_LEN(input);
3781 retry:
3783 if (NIL_P(output_byteoffset_v))
3784 output_byteoffset = RSTRING_LEN(output);
3786 if (output_byteoffset < 0)
3787 rb_raise(rb_eArgError, "negative output_byteoffset");
3789 if (RSTRING_LEN(output) < output_byteoffset)
3790 rb_raise(rb_eArgError, "output_byteoffset too big");
3792 if (output_bytesize < 0)
3793 rb_raise(rb_eArgError, "negative output_bytesize");
3795 output_byteend = (unsigned long)output_byteoffset +
3796 (unsigned long)output_bytesize;
3798 if (output_byteend < (unsigned long)output_byteoffset ||
3799 LONG_MAX < output_byteend)
3800 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3802 if (rb_str_capacity(output) < output_byteend)
3803 rb_str_resize(output, output_byteend);
3805 if (NIL_P(input)) {
3806 ip = is = NULL;
3808 else {
3809 ip = (const unsigned char *)RSTRING_PTR(input);
3810 is = ip + RSTRING_LEN(input);
3813 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3814 os = op + output_bytesize;
3816 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3817 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3818 if (!NIL_P(input)) {
3819 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3822 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3823 if (LONG_MAX / 2 < output_bytesize)
3824 rb_raise(rb_eArgError, "too long conversion result");
3825 output_bytesize *= 2;
3826 output_byteoffset_v = Qnil;
3827 goto retry;
3830 if (ec->destination_encoding) {
3831 rb_enc_associate(output, ec->destination_encoding);
3834 return econv_result_to_symbol(res);
3838 * call-seq:
3839 * ec.convert(source_string) -> destination_string
3841 * Convert source_string and return destination_string.
3843 * source_string is assumed as a part of source.
3844 * i.e. :partial_input=>true is specified internally.
3845 * finish method should be used last.
3847 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3848 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3849 * puts ec.finish.dump #=> ""
3851 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3852 * puts ec.convert("\xA4").dump #=> ""
3853 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3854 * puts ec.finish.dump #=> ""
3856 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3857 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3858 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3859 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3860 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3862 * If a conversion error occurs,
3863 * Encoding::UndefinedConversionError or
3864 * Encoding::InvalidByteSequenceError is raised.
3865 * Encoding::Converter#convert doesn't supply methods to recover or restart
3866 * from these exceptions.
3867 * When you want to handle these conversion errors,
3868 * use Encoding::Converter#primitive_convert.
3871 static VALUE
3872 econv_convert(VALUE self, VALUE source_string)
3874 VALUE ret, dst;
3875 VALUE av[5];
3876 int ac;
3877 rb_econv_t *ec = check_econv(self);
3879 StringValue(source_string);
3881 dst = rb_str_new(NULL, 0);
3883 av[0] = rb_str_dup(source_string);
3884 av[1] = dst;
3885 av[2] = Qnil;
3886 av[3] = Qnil;
3887 av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
3888 ac = 5;
3890 ret = econv_primitive_convert(ac, av, self);
3892 if (ret == sym_invalid_byte_sequence ||
3893 ret == sym_undefined_conversion ||
3894 ret == sym_incomplete_input) {
3895 VALUE exc = make_econv_exception(ec);
3896 rb_exc_raise(exc);
3899 if (ret == sym_finished) {
3900 rb_raise(rb_eArgError, "converter already finished");
3903 if (ret != sym_source_buffer_empty) {
3904 rb_bug("unexpected result of econv_primitive_convert");
3907 return dst;
3911 * call-seq:
3912 * ec.finish -> string
3914 * Finishes the converter.
3915 * It returns the last part of the converted string.
3917 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3918 * p ec.convert("\u3042") #=> "\e$B$\""
3919 * p ec.finish #=> "\e(B"
3921 static VALUE
3922 econv_finish(VALUE self)
3924 VALUE ret, dst;
3925 VALUE av[5];
3926 int ac;
3927 rb_econv_t *ec = check_econv(self);
3929 dst = rb_str_new(NULL, 0);
3931 av[0] = Qnil;
3932 av[1] = dst;
3933 av[2] = Qnil;
3934 av[3] = Qnil;
3935 av[4] = INT2FIX(0);
3936 ac = 5;
3938 ret = econv_primitive_convert(ac, av, self);
3940 if (ret == sym_invalid_byte_sequence ||
3941 ret == sym_undefined_conversion ||
3942 ret == sym_incomplete_input) {
3943 VALUE exc = make_econv_exception(ec);
3944 rb_exc_raise(exc);
3947 if (ret != sym_finished) {
3948 rb_bug("unexpected result of econv_primitive_convert");
3951 return dst;
3955 * call-seq:
3956 * ec.primitive_errinfo -> array
3958 * primitive_errinfo returns important information regarding the last error
3959 * as a 5-element array:
3961 * [result, enc1, enc2, error_bytes, readagain_bytes]
3963 * result is the last result of primitive_convert.
3965 * Other elements are only meaningful when result is
3966 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3968 * enc1 and enc2 indicate a conversion step as a pair of strings.
3969 * For example, a converter from EUC-JP to ISO-8859-1 converts
3970 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3971 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3973 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3974 * error_bytes is discarded portion.
3975 * readagain_bytes is buffered portion which is read again on next conversion.
3977 * Example:
3979 * # \xff is invalid as EUC-JP.
3980 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3981 * ec.primitive_convert(src="\xff", dst="", nil, 10)
3982 * p ec.primitive_errinfo
3983 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
3985 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3986 * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
3987 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3988 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3989 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3990 * p ec.primitive_errinfo
3991 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3993 * # partial character is invalid
3994 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3995 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3996 * p ec.primitive_errinfo
3997 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3999 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4000 * # partial characters.
4001 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4002 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4003 * p ec.primitive_errinfo
4004 * #=> [:source_buffer_empty, nil, nil, nil, nil]
4006 * # \xd8\x00\x00@ is invalid as UTF-16BE because
4007 * # no low surrogate after high surrogate (\xd8\x00).
4008 * # It is detected by 3rd byte (\00) which is part of next character.
4009 * # So the high surrogate (\xd8\x00) is discarded and
4010 * # the 3rd byte is read again later.
4011 * # Since the byte is buffered in ec, it is dropped from src.
4012 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4013 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4014 * p ec.primitive_errinfo
4015 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4016 * p src
4017 * #=> "@"
4019 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4020 * # The problem is detected by 4th byte.
4021 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4022 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4023 * p ec.primitive_errinfo
4024 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4025 * p src
4026 * #=> ""
4029 static VALUE
4030 econv_primitive_errinfo(VALUE self)
4032 rb_econv_t *ec = check_econv(self);
4034 VALUE ary;
4036 ary = rb_ary_new2(5);
4038 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4039 rb_ary_store(ary, 4, Qnil);
4041 if (ec->last_error.source_encoding)
4042 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4044 if (ec->last_error.destination_encoding)
4045 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4047 if (ec->last_error.error_bytes_start) {
4048 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4049 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4052 return ary;
4056 * call-seq:
4057 * ec.insert_output(string) -> nil
4059 * Inserts string into the encoding converter.
4060 * The string will be converted to the destination encoding and
4061 * output on later conversions.
4063 * If the destination encoding is stateful,
4064 * string is converted according to the state and the state is updated.
4066 * This method should be used only when a conversion error occurs.
4068 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4069 * src = "HIRAGANA LETTER A is \u{3042}."
4070 * dst = ""
4071 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4072 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4073 * ec.insert_output("<err>")
4074 * p ec.primitive_convert(src, dst) #=> :finished
4075 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4077 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4078 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4079 * dst = ""
4080 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4081 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4082 * ec.insert_output "?" # state change required to output "?".
4083 * p ec.primitive_convert(src, dst) #=> :finished
4084 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4087 static VALUE
4088 econv_insert_output(VALUE self, VALUE string)
4090 const char *insert_enc;
4092 int ret;
4094 rb_econv_t *ec = check_econv(self);
4096 StringValue(string);
4097 insert_enc = rb_econv_encoding_to_insert_output(ec);
4098 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4100 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4101 if (ret == -1) {
4102 rb_raise(rb_eArgError, "too big string");
4105 return Qnil;
4109 * call-seq:
4110 * ec.putback -> string
4111 * ec.putback(max_numbytes) -> string
4113 * Put back the bytes which will be converted.
4115 * The bytes are caused by invalid_byte_sequence error.
4116 * When invalid_byte_sequence error, some bytes are discarded and
4117 * some bytes are buffered to be converted later.
4118 * The latter bytes can be put back.
4119 * It can be observed by
4120 * Encoding::InvalidByteSequenceError#readagain_bytes and
4121 * Encoding::Converter#primitive_errinfo.
4123 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4124 * src = "\x00\xd8\x61\x00"
4125 * dst = ""
4126 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4127 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4128 * p ec.putback #=> "a\x00"
4129 * p ec.putback #=> "" # no more bytes to put back
4132 static VALUE
4133 econv_putback(int argc, VALUE *argv, VALUE self)
4135 rb_econv_t *ec = check_econv(self);
4136 int n;
4137 int putbackable;
4138 VALUE str, max;
4140 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4141 n = rb_econv_putbackable(ec);
4143 else {
4144 n = NUM2INT(max);
4145 putbackable = rb_econv_putbackable(ec);
4146 if (putbackable < n)
4147 n = putbackable;
4150 str = rb_str_new(NULL, n);
4151 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4153 if (ec->source_encoding) {
4154 rb_enc_associate(str, ec->source_encoding);
4157 return str;
4161 * call-seq:
4162 * ec.last_error -> exception or nil
4164 * Returns an exception object for the last conversion.
4165 * Returns nil if the last conversion did not produce an error.
4167 * "error" means that
4168 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4169 * Encoding::Converter#convert and
4170 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4171 * Encoding::Converter#primitive_convert.
4173 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4174 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4175 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4176 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4177 * p ec.last_error #=> nil
4180 static VALUE
4181 econv_last_error(VALUE self)
4183 rb_econv_t *ec = check_econv(self);
4184 VALUE exc;
4186 exc = make_econv_exception(ec);
4187 if (NIL_P(exc))
4188 return Qnil;
4189 return exc;
4193 * call-seq:
4194 * ec.replacement -> string
4196 * Returns the replacement string.
4198 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4199 * p ec.replacement #=> "?"
4201 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4202 * p ec.replacement #=> "\uFFFD"
4204 static VALUE
4205 econv_get_replacement(VALUE self)
4207 rb_econv_t *ec = check_econv(self);
4208 int ret;
4209 rb_encoding *enc;
4211 ret = make_replacement(ec);
4212 if (ret == -1) {
4213 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4216 enc = rb_enc_find(ec->replacement_enc);
4217 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4221 * call-seq:
4222 * ec.replacement = string
4224 * Sets the replacement string.
4226 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4227 * ec.replacement = "<undef>"
4228 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4230 static VALUE
4231 econv_set_replacement(VALUE self, VALUE arg)
4233 rb_econv_t *ec = check_econv(self);
4234 VALUE string = arg;
4235 int ret;
4236 rb_encoding *enc;
4238 StringValue(string);
4239 enc = rb_enc_get(string);
4241 ret = rb_econv_set_replacement(ec,
4242 (const unsigned char *)RSTRING_PTR(string),
4243 RSTRING_LEN(string),
4244 rb_enc_name(enc));
4246 if (ret == -1) {
4247 /* xxx: rb_eInvalidByteSequenceError? */
4248 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4251 return arg;
4254 VALUE
4255 rb_econv_make_exception(rb_econv_t *ec)
4257 return make_econv_exception(ec);
4260 void
4261 rb_econv_check_error(rb_econv_t *ec)
4263 VALUE exc;
4265 exc = make_econv_exception(ec);
4266 if (NIL_P(exc))
4267 return;
4268 rb_exc_raise(exc);
4272 * call-seq:
4273 * ecerr.source_encoding_name -> string
4275 * Returns the source encoding name as a string.
4277 static VALUE
4278 ecerr_source_encoding_name(VALUE self)
4280 return rb_attr_get(self, id_source_encoding_name);
4284 * call-seq:
4285 * ecerr.source_encoding -> encoding
4287 * Returns the source encoding as an encoding object.
4289 * Note that the result may not be equal to the source encoding of
4290 * the encoding converter if the conversion has multiple steps.
4292 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4293 * begin
4294 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4295 * rescue Encoding::UndefinedConversionError
4296 * p $!.source_encoding #=> #<Encoding:UTF-8>
4297 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4298 * p $!.source_encoding_name #=> "UTF-8"
4299 * p $!.destination_encoding_name #=> "EUC-JP"
4300 * end
4303 static VALUE
4304 ecerr_source_encoding(VALUE self)
4306 return rb_attr_get(self, id_source_encoding);
4310 * call-seq:
4311 * ecerr.destination_encoding_name -> string
4313 * Returns the destination encoding name as a string.
4315 static VALUE
4316 ecerr_destination_encoding_name(VALUE self)
4318 return rb_attr_get(self, id_destination_encoding_name);
4322 * call-seq:
4323 * ecerr.destination_encoding -> encoding
4325 * Returns the destination encoding as an encoding object.
4327 static VALUE
4328 ecerr_destination_encoding(VALUE self)
4330 return rb_attr_get(self, id_destination_encoding);
4334 * call-seq:
4335 * ecerr.error_char -> string
4337 * Returns the one-character string which caused Encoding::UndefinedConversionError.
4339 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4340 * begin
4341 * ec.convert("\xa0")
4342 * rescue Encoding::UndefinedConversionError
4343 * puts $!.error_char.dump #=> "\xC2\xA0"
4344 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4345 * end
4348 static VALUE
4349 ecerr_error_char(VALUE self)
4351 return rb_attr_get(self, id_error_char);
4355 * call-seq:
4356 * ecerr.error_bytes -> string
4358 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4360 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4361 * begin
4362 * ec.convert("abc\xA1\xFFdef")
4363 * rescue Encoding::InvalidByteSequenceError
4364 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4365 * puts $!.error_bytes.dump #=> "\xA1"
4366 * puts $!.readagain_bytes.dump #=> "\xFF"
4367 * end
4369 static VALUE
4370 ecerr_error_bytes(VALUE self)
4372 return rb_attr_get(self, id_error_bytes);
4376 * call-seq:
4377 * ecerr.readagain_bytes -> string
4379 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4381 static VALUE
4382 ecerr_readagain_bytes(VALUE self)
4384 return rb_attr_get(self, id_readagain_bytes);
4388 * call-seq:
4389 * ecerr.incomplete_input? -> true or false
4391 * Returns true if the invalid byte sequence error is caused by
4392 * premature end of string.
4394 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4396 * begin
4397 * ec.convert("abc\xA1z")
4398 * rescue Encoding::InvalidByteSequenceError
4399 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4400 * p $!.incomplete_input? #=> false
4401 * end
4403 * begin
4404 * ec.convert("abc\xA1")
4405 * ec.finish
4406 * rescue Encoding::InvalidByteSequenceError
4407 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4408 * p $!.incomplete_input? #=> true
4409 * end
4411 static VALUE
4412 ecerr_incomplete_input(VALUE self)
4414 return rb_attr_get(self, id_incomplete_input);
4418 * Document-class: Encoding::UndefinedConversionError
4420 * Raised by Encoding and String methods when a transcoding operation
4421 * fails.
4425 * Document-class: Encoding::InvalidByteSequenceError
4427 * Raised by Encoding and String methods when the string being
4428 * transcoded contains a byte invalid for either the source or
4429 * target encoding.
4433 * Document-class: Encoding::ConverterNotFoundError
4435 * Raised by transcoding methods when a named encoding does not
4436 * correspond with a known converter.
4439 void
4440 Init_transcode(void)
4442 transcoder_table = st_init_strcasetable();
4444 id_destination_encoding = rb_intern_const("destination_encoding");
4445 id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4446 id_error_bytes = rb_intern_const("error_bytes");
4447 id_error_char = rb_intern_const("error_char");
4448 id_incomplete_input = rb_intern_const("incomplete_input");
4449 id_readagain_bytes = rb_intern_const("readagain_bytes");
4450 id_source_encoding = rb_intern_const("source_encoding");
4451 id_source_encoding_name = rb_intern_const("source_encoding_name");
4453 sym_invalid = ID2SYM(rb_intern_const("invalid"));
4454 sym_undef = ID2SYM(rb_intern_const("undef"));
4455 sym_replace = ID2SYM(rb_intern_const("replace"));
4456 sym_fallback = ID2SYM(rb_intern_const("fallback"));
4457 sym_xml = ID2SYM(rb_intern_const("xml"));
4458 sym_text = ID2SYM(rb_intern_const("text"));
4459 sym_attr = ID2SYM(rb_intern_const("attr"));
4461 sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4462 sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4463 sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4464 sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4465 sym_finished = ID2SYM(rb_intern_const("finished"));
4466 sym_after_output = ID2SYM(rb_intern_const("after_output"));
4467 sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4468 sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4469 sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4470 sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4471 sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4473 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4474 sym_newline = ID2SYM(rb_intern_const("newline"));
4475 sym_universal = ID2SYM(rb_intern_const("universal"));
4476 sym_crlf = ID2SYM(rb_intern_const("crlf"));
4477 sym_cr = ID2SYM(rb_intern_const("cr"));
4478 sym_lf = ID2SYM(rb_intern_const("lf"));
4479 #endif
4481 InitVM(transcode);
4484 void
4485 InitVM_transcode(void)
4487 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4488 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4489 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4491 rb_define_method(rb_cString, "encode", str_encode, -1);
4492 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4494 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4495 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4496 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4497 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4498 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4499 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4500 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4501 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4502 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4503 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4504 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4505 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4506 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4507 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4508 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4509 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4510 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4511 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4512 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4514 /* Document-const: INVALID_MASK
4516 * Mask for invalid byte sequences
4518 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4520 /* Document-const: INVALID_REPLACE
4522 * Replace invalid byte sequences
4524 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4526 /* Document-const: UNDEF_MASK
4528 * Mask for a valid character in the source encoding but no related
4529 * character(s) in destination encoding.
4531 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4533 /* Document-const: UNDEF_REPLACE
4535 * Replace byte sequences that are undefined in the destination encoding.
4537 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4539 /* Document-const: UNDEF_HEX_CHARREF
4541 * Replace byte sequences that are undefined in the destination encoding
4542 * with an XML hexadecimal character reference. This is valid for XML
4543 * conversion.
4545 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4547 /* Document-const: PARTIAL_INPUT
4549 * Indicates the source may be part of a larger string. See
4550 * primitive_convert for an example.
4552 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4554 /* Document-const: AFTER_OUTPUT
4556 * Stop converting after some output is complete but before all of the
4557 * input was consumed. See primitive_convert for an example.
4559 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4561 /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4563 * Decorator for converting CRLF and CR to LF
4565 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4567 /* Document-const: CRLF_NEWLINE_DECORATOR
4569 * Decorator for converting LF to CRLF
4571 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4573 /* Document-const: CR_NEWLINE_DECORATOR
4575 * Decorator for converting LF to CR
4577 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4579 /* Document-const: XML_TEXT_DECORATOR
4581 * Escape as XML CharData
4583 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4585 /* Document-const: XML_ATTR_CONTENT_DECORATOR
4587 * Escape as XML AttValue
4589 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4591 /* Document-const: XML_ATTR_QUOTE_DECORATOR
4593 * Escape as XML AttValue
4595 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4597 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4598 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4599 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4600 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4601 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4603 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4604 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4605 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4606 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4607 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4608 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4609 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4611 Init_newline();